diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md index 1a401997c64..2f3df7cda9c 100644 --- a/ISSUE_TEMPLATE.md +++ b/ISSUE_TEMPLATE.md @@ -4,7 +4,7 @@ https://stackoverflow.com/questions/tagged/tensorflow If you open a GitHub issue, here is our policy: -1. It must be a bug or a feature request. +1. It must be a bug, a feature request, or a significant problem with documentation (for small docs fixes please send a PR instead). 2. The form below must be filled out. 3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues). diff --git a/README.md b/README.md index 0c93813e584..c754c3f0db0 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** | |-----------------|---------------------|------------------|-------------------|---------------| -| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) | +| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) | **TensorFlow** is an open source software library for numerical computation using data flow graphs. The graph nodes represent mathematical operations, while diff --git a/RELEASE.md b/RELEASE.md index fdf10407fda..af6440acef5 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,18 +1,39 @@ # Release 1.5.0 ## Breaking Changes -* Prebuilt binaries are now built against CUDA 9 and cuDNN 7. +* Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7. * Our Linux binaries are built using ubuntu 16 containers, potentially introducing glibc incompatibility issues with ubuntu 14. * Starting from 1.6 release, our prebuilt binaries will use AVX instructions. This may break TF on older CPUs. +## Known Bugs +* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or + `CUDA_ILLEGAL_ADDRESS` failures. 
+ + Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9 + and CUDA 9.1 sometimes does not properly compute the carry bit when + decomposing 64-bit address calculations with large offsets (e.g. `load [x + + large_constant]`) into 32-bit arithmetic in SASS. + + As a result, these versions of `ptxas` miscompile most XLA programs which use + more than 4GB of temp memory. This results in garbage results and/or + `CUDA_ERROR_ILLEGAL_ADDRESS` failures. + + A fix in CUDA 9.1.121 is expected in late February 2018. We do not expect a + fix for CUDA 9.0.x. Until the fix is available, the only workaround is to + [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x + or disable XLA:GPU. + + TensorFlow will print a warning if you use XLA:GPU with a known-bad version of + CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122. + ## Major Features And Improvements * [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager) preview version is now available. * [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite) dev preview is now available. -* CUDA 9 and cuDNN 7 support. +* CUDA 9.0 and cuDNN 7 support. * Accelerated Linear Algebra (XLA): * Add `complex64` support to XLA compiler. * `bfloat` support is now added to XLA infrastructure. diff --git a/configure.py b/configure.py index 94827891c36..318e2c7a575 100644 --- a/configure.py +++ b/configure.py @@ -298,7 +298,7 @@ def get_var(environ_cp, System". enabled_by_default: boolean for default behavior. question: optional string for how to ask for user input. - yes_reply: optionanl string for reply when feature is enabled. + yes_reply: optional string for reply when feature is enabled. no_reply: optional string for reply when feature is disabled. Returns: @@ -411,7 +411,7 @@ def set_action_env_var(environ_cp, System". enabled_by_default: boolean for default behavior. question: optional string for how to ask for user input. - yes_reply: optionanl string for reply when feature is enabled. + yes_reply: optional string for reply when feature is enabled. no_reply: optional string for reply when feature is disabled. """ var = int( @@ -1354,6 +1354,7 @@ def main(): environ_cp['TF_NEED_GCP'] = '0' environ_cp['TF_NEED_HDFS'] = '0' environ_cp['TF_NEED_JEMALLOC'] = '0' + environ_cp['TF_NEED_KAFKA'] = '0' environ_cp['TF_NEED_OPENCL_SYCL'] = '0' environ_cp['TF_NEED_COMPUTECPP'] = '0' environ_cp['TF_NEED_OPENCL'] = '0' @@ -1372,6 +1373,8 @@ def main(): 'with_hdfs_support', True, 'hdfs') set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System', 'with_s3_support', True, 's3') + set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform', + 'with_kafka_support', False, 'kafka') set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support', False, 'xla') set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support', diff --git a/tensorflow/BUILD b/tensorflow/BUILD index b26c5255255..c225cc1a74c 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -211,6 +211,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "with_kafka_support", + define_values = {"with_kafka_support": "true"}, + visibility = ["//visibility:public"], +) + # Crosses between platforms and file system libraries not supported on those # platforms due to limitations in nested select() statements. 
config_setting( @@ -544,8 +550,10 @@ filegroup( "//tensorflow/contrib/predictor:all_files", "//tensorflow/contrib/py2tf:all_files", "//tensorflow/contrib/py2tf/converters:all_files", + "//tensorflow/contrib/py2tf/impl:all_files", "//tensorflow/contrib/py2tf/pyct:all_files", "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files", + "//tensorflow/contrib/py2tf/utils:all_files", "//tensorflow/contrib/quantize:all_files", "//tensorflow/contrib/receptive_field:all_files", "//tensorflow/contrib/reduce_slice_ops:all_files", diff --git a/tensorflow/SECURITY.md b/tensorflow/SECURITY.md new file mode 100644 index 00000000000..074eed29515 --- /dev/null +++ b/tensorflow/SECURITY.md @@ -0,0 +1,239 @@ +# Using TensorFlow Securely + +This document discusses how to safely deal with untrusted programs (models or +model parameters), and input data. Below, we also provide guidelines on how to +report vulnerabilities in TensorFlow. + +## TensorFlow models are programs + +TensorFlow's runtime system interprets and executes programs. What machine +learning practitioners term +[**models**](https://developers.google.com/machine-learning/glossary/#model) are +expressed as programs that TensorFlow executes. TensorFlow programs are encoded +as computation +[**graphs**](https://developers.google.com/machine-learning/glossary/#graph). +The model's parameters are often stored separately in **checkpoints**. + +At runtime, TensorFlow executes the computation graph using the parameters +provided. Note that the behavior of the computation graph may change +depending on the parameters provided. TensorFlow itself is not a sandbox. When +executing the computation graph, TensorFlow may read and write files, send and +receive data over the network, and even spawn additional processes. All these +tasks are performed with the permissions of the TensorFlow process. Allowing +for this flexibility makes for a powerful machine learning platform, +but it has implications for security. + +The computation graph may also accept **inputs**. Those inputs are the +data you supply to TensorFlow to train a model, or to use a model to run +inference on the data. + +**TensorFlow models are programs, and need to be treated as such from a security +perspective.** + +## Running untrusted models + +As a general rule: **Always** execute untrusted models inside a sandbox (e.g., +[nsjail](https://github.com/google/nsjail)). + +There are several ways in which a model could become untrusted. Obviously, if an +untrusted party supplies TensorFlow kernels, arbitrary code may be executed. +The same is true if the untrusted party provides Python code, such as the +Python code that generates TensorFlow graphs. + +Even if the untrusted party only supplies the serialized computation +graph (in form of a `GraphDef`, `SavedModel`, or equivalent on-disk format), the +set of computation primitives available to TensorFlow is powerful enough that +you should assume that the TensorFlow process effectively executes arbitrary +code. One common solution is to whitelist only a few safe Ops. While this is +possible in theory, we still recommend you sandbox the execution. + +It depends on the computation graph whether a user provided checkpoint is safe. +It is easily possible to create computation graphs in which malicious +checkpoints can trigger unsafe behavior. For example, consider a graph that +contains a `tf.cond` depending on the value of a `tf.Variable`. One branch of +the `tf.cond` is harmless, but the other is unsafe. 
Since the `tf.Variable` is
+stored in the checkpoint, whoever provides the checkpoint now has the ability to
+trigger unsafe behavior, even though the graph is not under their control.
+
+In other words, graphs can contain vulnerabilities of their own. To allow users
+to provide checkpoints to a model you run on their behalf (e.g., in order to
+compare model quality for a fixed model architecture), you must carefully audit
+your model, and we recommend you run the TensorFlow process in a sandbox.
+
+## Accepting untrusted inputs
+
+It is possible to write models that are secure in the sense that they can safely
+process untrusted inputs, assuming there are no bugs. There are two main reasons
+not to rely on this: first, it is easy to write models which must not be exposed
+to untrusted inputs, and second, there are bugs in any software system of
+sufficient complexity. Letting users control inputs could allow them to trigger
+bugs either in TensorFlow or in dependent libraries.
+
+In general, it is good practice to isolate the parts of any system that are
+exposed to untrusted (e.g., user-provided) inputs in a sandbox.
+
+A useful analogy for how a TensorFlow graph is executed is an interpreted
+programming language such as Python. While it is possible to write secure
+Python code which can be exposed to user-supplied inputs (by, e.g., carefully
+quoting and sanitizing input strings, size-checking input blobs, etc.), it is
+very easy to write Python programs which are insecure. Even secure Python code
+could be rendered insecure by a bug in the Python interpreter, or by a bug in a
+Python library it uses (e.g.,
+[this one](https://www.cvedetails.com/cve/CVE-2017-12852/)).
+
+## Running a TensorFlow server
+
+TensorFlow is a platform for distributed computing, and as such there is a
+TensorFlow server (`tf.train.Server`). **The TensorFlow server is meant for
+internal communication only. It is not built for use in an untrusted network.**
+
+For performance reasons, the default TensorFlow server does not include any
+authorization protocol and sends messages unencrypted. It accepts connections
+from anywhere, and executes the graphs it is sent without performing any checks.
+Therefore, if you run a `tf.train.Server` in your network, anybody with
+access to the network can execute what you should consider arbitrary code with
+the privileges of the process running the `tf.train.Server`.
+
+When running distributed TensorFlow, you must isolate the network in which the
+cluster lives. Cloud providers publish instructions for setting up isolated
+networks, which are sometimes branded as "virtual private cloud." Refer to the
+instructions for
+[GCP](https://cloud.google.com/compute/docs/networks-and-firewalls) and
+[AWS](https://aws.amazon.com/vpc/) for details.
+
+Note that `tf.train.Server` is different from the server created by
+`tensorflow/serving` (the default binary for which is called `ModelServer`).
+By default, `ModelServer` also has no built-in mechanism for authentication.
+Connecting it to an untrusted network allows anyone on this network to run the
+graphs known to the `ModelServer`. This means that an attacker may run
+graphs using untrusted inputs as described above, but they would not be able to
+execute arbitrary graphs. It is possible to safely expose a `ModelServer`
+directly to an untrusted network, **but only if the graphs it is configured to
+use have been carefully audited to be safe**.
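To make the isolation advice concrete, here is a minimal C++ sketch of bringing up a single-task in-process server with `NewServer` from `tensorflow/core/distributed_runtime/server_lib.h`. It is illustrative only and not part of this patch; the job/task layout is hypothetical, and real isolation must still come from the surrounding network (firewall or VPC), since the server itself performs no authentication or encryption.

```c++
#include <memory>

#include "tensorflow/core/distributed_runtime/server_lib.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/protobuf/cluster.pb.h"
#include "tensorflow/core/protobuf/tensorflow_server.pb.h"

// Illustrative sketch: start a single-process gRPC server for a one-task
// cluster. The address below is only the task's advertised endpoint; the
// server still accepts unauthenticated connections, so it must only be
// reachable from a trusted, isolated network.
tensorflow::Status StartSingleTaskServer(
    std::unique_ptr<tensorflow::ServerInterface>* server) {
  tensorflow::ServerDef server_def;
  server_def.set_job_name("local");
  server_def.set_task_index(0);
  server_def.set_protocol("grpc");
  tensorflow::JobDef* job = server_def.mutable_cluster()->add_job();
  job->set_name("local");
  (*job->mutable_tasks())[0] = "localhost:2222";
  TF_RETURN_IF_ERROR(tensorflow::NewServer(server_def, server));
  return (*server)->Start();
}
```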
+
+Similar to best practices for other servers, we recommend running any
+`ModelServer` with appropriate privileges (i.e., using a separate user with
+reduced permissions). In the spirit of defense in depth, we recommend
+authenticating requests to any TensorFlow server connected to an untrusted
+network, as well as sandboxing the server to minimize the adverse effects of
+any breach.
+
+## Vulnerabilities in TensorFlow
+
+TensorFlow is a large and complex system. It also depends on a large set of
+third-party libraries (e.g., `numpy`, `libjpeg-turbo`, PNG parsers, `protobuf`).
+It is possible that TensorFlow or its dependent libraries contain
+vulnerabilities that would allow triggering unexpected or dangerous behavior
+with specially crafted inputs.
+
+### What is a vulnerability?
+
+Given TensorFlow's flexibility, it is possible to specify computation graphs
+which exhibit unexpected or unwanted behaviors. The fact that TensorFlow models
+can perform arbitrary computations means that they may read and write files,
+communicate via the network, produce deadlocks and infinite loops, or run out
+of memory. Such behavior is a vulnerability only when it falls outside the
+specifications of the operations involved.
+
+A `FileWriter` writing a file is not unexpected behavior and therefore is not a
+vulnerability in TensorFlow. A `MatMul` allowing arbitrary binary code execution
+**is** a vulnerability.
+
+This is more subtle from a system perspective. For example, it is easy to cause
+a TensorFlow process to try to allocate more memory than is available by
+specifying a computation graph containing an ill-considered `tf.tile` operation.
+TensorFlow should exit cleanly in this case (it would raise an exception in
+Python, or return an error `Status` in C++). However, if the surrounding system
+is not expecting the possibility, such behavior could be used in a
+denial-of-service attack (or worse). Because TensorFlow behaves correctly, this
+is not a vulnerability in TensorFlow (although it would be a vulnerability of
+this hypothetical system).
+
+As a general rule, it is incorrect behavior for TensorFlow to access memory it
+does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to
+such behaviors constitute a vulnerability.
+
+One of the most critical parts of any system is input handling. If malicious
+input can trigger side effects or incorrect behavior, this is a bug, and likely
+a vulnerability.
+
+### Reporting vulnerabilities
+
+Please email reports about any security-related issues you find to
+`security@tensorflow.org`. This mail is delivered to a small security team. Your
+email will be acknowledged within one business day, and you'll receive a more
+detailed response to your email within 7 days indicating the next steps in
+handling your report. For critical problems, you may encrypt your report (see
+below).
+
+Please use a descriptive subject line for your report email. After the initial
+reply to your report, the security team will endeavor to keep you informed of
+the progress being made towards a fix and announcement.
+
+If you believe that an existing (public) issue is security-related, please send
+an email to `security@tensorflow.org`. The email should include the issue ID and
+a short description of why it should be handled according to this security
+policy.
+
+Once an issue is reported, TensorFlow uses the following disclosure process:
+
+* When a report is received, we confirm the issue and determine its severity.
+* If we know of specific third-party services or software based on TensorFlow + that require mitigation before publication, those projects will be notified. +* An advisory is prepared (but not published) which details the problem and + steps for mitigation. +* Wherever possible, fixes are prepared for the last minor release of the two + latest major releases, as well as the master branch. We will attempt to + commit these fixes as soon as possible, and as close together as + possible. +* Patch releases are published for all fixed released versions, a + notification is sent to discuss@tensorflow.org, and the advisory is published. + +Past security advisories are listed below. We credit reporters for identifying +security issues, although we keep your name confidential if you request it. + +#### Encryption key for `security@tensorflow.org` + +If your disclosure is extremely sensitive, you may choose to encrypt your +report using the key below. Please only use this for critical security +reports. + +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQENBFpqdzwBCADTeAHLNEe9Vm77AxhmGP+CdjlY84O6DouOCDSq00zFYdIU/7aI +LjYwhEmDEvLnRCYeFGdIHVtW9YrVktqYE9HXVQC7nULU6U6cvkQbwHCdrjaDaylP +aJUXkNrrxibhx9YYdy465CfusAaZ0aM+T9DpcZg98SmsSml/HAiiY4mbg/yNVdPs +SEp/Ui4zdIBNNs6at2gGZrd4qWhdM0MqGJlehqdeUKRICE/mdedXwsWLM8AfEA0e +OeTVhZ+EtYCypiF4fVl/NsqJ/zhBJpCx/1FBI1Uf/lu2TE4eOS1FgmIqb2j4T+jY +e+4C8kGB405PAC0n50YpOrOs6k7fiQDjYmbNABEBAAG0LVRlbnNvckZsb3cgU2Vj +dXJpdHkgPHNlY3VyaXR5QHRlbnNvcmZsb3cub3JnPokBTgQTAQgAOBYhBEkvXzHm +gOJBnwP4Wxnef3wVoM2yBQJaanc8AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheA +AAoJEBnef3wVoM2yNlkIAICqetv33MD9W6mPAXH3eon+KJoeHQHYOuwWfYkUF6CC +o+X2dlPqBSqMG3bFuTrrcwjr9w1V8HkNuzzOJvCm1CJVKaxMzPuXhBq5+DeT67+a +T/wK1L2R1bF0gs7Pp40W3np8iAFEh8sgqtxXvLGJLGDZ1Lnfdprg3HciqaVAiTum +HBFwszszZZ1wAnKJs5KVteFN7GSSng3qBcj0E0ql2nPGEqCVh+6RG/TU5C8gEsEf +3DX768M4okmFDKTzLNBm+l08kkBFt+P43rNK8dyC4PXk7yJa93SmS/dlK6DZ16Yw +2FS1StiZSVqygTW59rM5XNwdhKVXy2mf/RtNSr84gSi5AQ0EWmp3PAEIALInfBLR +N6fAUGPFj+K3za3PeD0fWDijlC9f4Ety/icwWPkOBdYVBn0atzI21thPRbfuUxfe +zr76xNNrtRRlbDSAChA1J5T86EflowcQor8dNC6fS+oHFCGeUjfEAm16P6mGTo0p +osdG2XnnTHOOEFbEUeWOwR/zT0QRaGGknoy2pc4doWcJptqJIdTl1K8xyBieik/b +nSoClqQdZJa4XA3H9G+F4NmoZGEguC5GGb2P9NHYAJ3MLHBHywZip8g9oojIwda+ +OCLL4UPEZ89cl0EyhXM0nIAmGn3Chdjfu3ebF0SeuToGN8E1goUs3qSE77ZdzIsR +BzZSDFrgmZH+uP0AEQEAAYkBNgQYAQgAIBYhBEkvXzHmgOJBnwP4Wxnef3wVoM2y +BQJaanc8AhsMAAoJEBnef3wVoM2yX4wIALcYZbQhSEzCsTl56UHofze6C3QuFQIH +J4MIKrkTfwiHlCujv7GASGU2Vtis5YEyOoMidUVLlwnebE388MmaJYRm0fhYq6lP +A3vnOCcczy1tbo846bRdv012zdUA+wY+mOITdOoUjAhYulUR0kiA2UdLSfYzbWwy +7Obq96Jb/cPRxk8jKUu2rqC/KDrkFDtAtjdIHh6nbbQhFuaRuWntISZgpIJxd8Bt +Gwi0imUVd9m9wZGuTbDGi6YTNk0GPpX5OMF5hjtM/objzTihSw9UN+65Y/oSQM81 +v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc= +=CDME +-----END PGP PUBLIC KEY BLOCK----- +``` + +### Known vulnerabilities + +| Type | Versions affected | Reported by | Additional Information | +|------|:-----------------:|---------------------------------------| +| out of bounds read| <=1.4 | @zhangbo5891001 | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) | + diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 3c7f041b39f..e7fb1dec53b 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -195,10 +195,10 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, reinterpret_cast(data) % EIGEN_MAX_ALIGN_BYTES != 0) { // TF_STRING and TF_RESOURCE tensors have a different representation in // TF_Tensor than they do in tensorflow::Tensor. 
So a copy here is a waste - // (any alignement requirements will be taken care of by TF_TensorToTensor + // (any alignment requirements will be taken care of by TF_TensorToTensor // and TF_TensorFromTensor). // - // Other types have the same represntation, so copy only if it is safe to do + // Other types have the same representation, so copy only if it is safe to do // so. buf->data_ = allocate_tensor("TF_NewTensor", len); std::memcpy(buf->data_, data, len); @@ -2144,7 +2144,7 @@ Status CopyGraph(Graph* src_graph, Graph* dst_graph, opts.return_tensors.push_back(ToTensorId(nodes_to_return[i])); } - // TOOD(skyewm): change to OutputTensor + // TODO(skyewm): change to OutputTensor tensorflow::ImportGraphDefResults results; TF_RETURN_IF_ERROR( ImportGraphDef(opts, gdef, dst_graph, dst_refiner, &results)); diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 74190cb135a..e62310d8114 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -46,6 +46,7 @@ tf_cuda_library( "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:framework_lite", "//tensorflow/core:lib_internal", diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index a76c8f5ec05..d65b5928959 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -85,15 +85,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { return nullptr; } - TFE_Context* ret = new TFE_Context(session); - ret->policy = opts->policy; - ret->pflr.reset(new tensorflow::ProcessFunctionLibraryRuntime( - ret->session->device_mgr, opts->session_options.options.env, - TF_GRAPH_DEF_VERSION, &ret->func_lib_def, {})); - ret->rendezvous = - new tensorflow::IntraProcessRendezvous(ret->session->device_mgr); - - return ret; + return new TFE_Context(*opts, session); } void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { @@ -261,15 +253,6 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, void TFE_DeleteOp(TFE_Op* op) { delete op; } -static void TFE_OpSetDeviceHelper(TFE_Op* op, tensorflow::Device* device, - TF_Status* status) { - // Questionable heuristic: Place the op on the same device as the first input - // placed outside of host memory? - if (IsCPU(op->device) && !IsCPU(device)) { - op->device = device; - } -} - void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { tensorflow::Device* d = nullptr; if (device_name != nullptr && strlen(device_name) > 0) { @@ -277,11 +260,24 @@ void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { op->ctx->session->device_mgr->LookupDevice(device_name, &d); if (!status->status.ok()) return; } - TFE_OpSetDeviceHelper(op, d, status); + op->device = d; +} + +const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { + tensorflow::Device* device = + (op->device == nullptr) ? op->ctx->devices()[0] : op->device; + return device->name().c_str(); } void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { - TFE_OpSetDeviceHelper(op, h->d, status); + // Questionable heuristic ... + // + // Motivation: After an 'op' is placed on GPU because some of its earlier + // inputs are on GPU, we want to keep the 'op' there, even if some later + // inputs of it are not on GPU. 
+ if (IsCPU(op->device) && !IsCPU(h->d)) { + op->device = h->d; + } if (!status->status.ok()) return; op->inputs.push_back(h->t); op->input_devices.push_back(h->d); @@ -298,7 +294,7 @@ TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, return TF_ATTR_INT; // The compiler requires that we return something. } status->status = - tensorflow::AttrTypeByName(op->attr_types, attr_name, &ret, is_list); + tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list); return ret; } diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 387de078948..6a2aff1591d 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -154,6 +154,9 @@ TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op); TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status); +// The returned string remains valid throughout the lifetime of 'op'. +TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op, + TF_Status* status); TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status); diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index a6f76c732f2..f2abffb7bc0 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/version.h" struct TFE_ContextOptions { TF_SessionOptions session_options; @@ -43,9 +44,15 @@ struct TFE_ContextOptions { }; struct TFE_Context { - explicit TFE_Context(TF_Session* s) : session(s) {} + explicit TFE_Context(const TFE_ContextOptions& opts, TF_Session* s) + : policy(opts.policy), + session(s), + rendezvous(new tensorflow::IntraProcessRendezvous(s->device_mgr)), + pflr(new tensorflow::ProcessFunctionLibraryRuntime( + session->device_mgr, opts.session_options.options.env, + TF_GRAPH_DEF_VERSION, &func_lib_def, {})) {} - TFE_ContextDevicePlacementPolicy policy; + const TFE_ContextDevicePlacementPolicy policy; // Note: we cannot use C++11 thread_local here as there is no concept of a // thread-local-object-local variable in C++11. @@ -54,8 +61,8 @@ struct TFE_Context { thread_local_policies GUARDED_BY(policy_map_mu); // TFE_Context is an extension of TF_Session. And TF_Session needs a TF_Graph. - TF_Session* session; - tensorflow::Rendezvous* rendezvous; + TF_Session* const session; + tensorflow::Rendezvous* const rendezvous; tensorflow::mutex functions_mu; tensorflow::FunctionLibraryDefinition func_lib_def GUARDED_BY(functions_mu){ @@ -64,14 +71,14 @@ struct TFE_Context { // One FunctionLibraryRuntime per device. // func_libs[i] is the FunctionLibraryRuntime corresponding to // session->devices[i]. - std::unique_ptr pflr; + const std::unique_ptr pflr; tensorflow::mutex cache_mu; std::unordered_map kernel_cache GUARDED_BY(cache_mu); - tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) { + tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) const { return pflr->GetFLR(d->name()); } @@ -100,6 +107,8 @@ struct TFE_TensorHandle { }; struct TFE_Op { + // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a + // primitive operation. 
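As a usage note for the new `TFE_OpGetDevice` accessor introduced in this hunk: the sketch below is illustrative only and not part of this patch. It uses the same short device-name form as the new `SetAndGetOpDevices` test and relies on the documented guarantee that the returned string is owned by the op and stays valid for the op's lifetime.

```c++
#include <cstdio>

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/eager/c_api.h"

// Illustrative sketch: place an op explicitly, then read the placement back.
// The short "CPU:0" form mirrors what the new test passes to TFE_OpSetDevice.
void InspectOpDevice(TFE_Op* op) {
  TF_Status* status = TF_NewStatus();
  TFE_OpSetDevice(op, "CPU:0", status);
  if (TF_GetCode(status) == TF_OK) {
    // The returned pointer is owned by 'op'; the caller must not free it.
    const char* device_name = TFE_OpGetDevice(op, status);
    if (TF_GetCode(status) == TF_OK && device_name != nullptr) {
      std::printf("op placed on %s\n", device_name);
    }
  }
  TF_DeleteStatus(status);
}
```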
TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t) : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {} diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 18e7a64435e..b0409af87c2 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -60,6 +60,31 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { return op; } +// If there is a GPU device, returns true and sets 'gpu_device_name' +// accordingly. +bool GetGPUDeviceName(TFE_Context* ctx, string* gpu_device_name) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); + CHECK_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + const int num_devices = TF_DeviceListCount(devices); + for (int i = 0; i < num_devices; ++i) { + const string device_type(TF_DeviceListType(devices, i, status.get())); + CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + const string device_name(TF_DeviceListName(devices, i, status.get())); + CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + if (device_type == "GPU") { + *gpu_device_name = device_name; + LOG(INFO) << "Found GPU device " << device_name; + TF_DeleteDeviceList(devices); + return true; + } + } + TF_DeleteDeviceList(devices); + return false; +} + void BM_InitOp(int iters) { tensorflow::testing::StopTiming(); TF_Status* status = TF_NewStatus(); @@ -288,22 +313,15 @@ TEST(CAPI, TensorHandleSilentCopy) { TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - const int num_devices = TF_DeviceListCount(devices); - // Disable the test if no GPU is present. - if (num_devices > 1) { - const int device_to_use = 1; - const string name(TF_DeviceListName(devices, device_to_use, status.get())); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - TFE_TensorHandle* hgpu = - TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get()); + string gpu_device_name; + if (GetGPUDeviceName(ctx, &gpu_device_name)) { + TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( + hcpu, ctx, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu); - TFE_OpSetDevice(matmul, name.c_str(), status.get()); + TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_TensorHandle* retvals[1]; int num_retvals = 1; @@ -314,7 +332,6 @@ TEST(CAPI, TensorHandleSilentCopy) { TFE_DeleteTensorHandle(hgpu); } - TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); TFE_DeleteContext(ctx, status.get()); @@ -337,22 +354,15 @@ TEST(CAPI, TensorHandleSilentCopyLocal) { TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - const int num_devices = TF_DeviceListCount(devices); - // Disable the test if no GPU is present. 
- if (num_devices > 1) { - const int device_to_use = 1; - const string name(TF_DeviceListName(devices, device_to_use, status.get())); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - TFE_TensorHandle* hgpu = - TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get()); + string gpu_device_name; + if (GetGPUDeviceName(ctx, &gpu_device_name)) { + TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( + hcpu, ctx, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu); - TFE_OpSetDevice(matmul, name.c_str(), status.get()); + TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_TensorHandle* retvals[1]; int num_retvals = 1; @@ -363,13 +373,43 @@ TEST(CAPI, TensorHandleSilentCopyLocal) { TFE_DeleteTensorHandle(hgpu); } - TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); TFE_DeleteContext(ctx, status.get()); EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); } +TEST(CAPI, SetAndGetOpDevices) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_TensorHandle* m = TestMatrixTensorHandle(); + TFE_Op* matmul = MatMulOp(ctx, m, m); + + // Disable the test if no GPU is present. + string gpu_device_name; + if (GetGPUDeviceName(ctx, &gpu_device_name)) { + TFE_OpSetDevice(matmul, "GPU:0", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + const char* device_name = TFE_OpGetDevice(matmul, status); + ASSERT_TRUE(strstr(device_name, "GPU:0") != nullptr); + + TFE_OpSetDevice(matmul, "CPU:0", status); + ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); + device_name = TFE_OpGetDevice(matmul, status); + ASSERT_TRUE(strstr(device_name, "CPU:0") != nullptr); + } + + TFE_DeleteOp(matmul); + TFE_DeleteTensorHandle(m); + TFE_DeleteContext(ctx, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); +} + TEST(CAPI, Execute) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc index 3a9951e14de..12abfcba2f0 100644 --- a/tensorflow/c/eager/runtime.cc +++ b/tensorflow/c/eager/runtime.cc @@ -86,10 +86,9 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) { return Status::OK(); } -Status AttrTypeByName(const AttrTypeMap* m, const string& attr_name, +Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name, TF_AttrType* out, unsigned char* is_list) { - CHECK(m); - auto* t = gtl::FindOrNull(*m, attr_name); + auto* t = gtl::FindOrNull(m, attr_name); if (t == nullptr) { return errors::InvalidArgument("Attribute '", attr_name, "' does not exist for this operation"); @@ -173,14 +172,14 @@ void CombineUnordered(const tensorflow::Fprint128& a, b->high64 += a.high64; } -inline tensorflow::Fprint128 CacheKeyHelper(const StringPiece& s, +inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, const tensorflow::Fprint128& b) { // TODO(agarwal): avoid ToString(). 
tensorflow::Fprint128 a = tensorflow::Fingerprint128(s.ToString()); return FingerprintCat128(a, b); } -inline tensorflow::Fprint128 CacheKeyHelper(const StringPiece& s, uint64 b) { +inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, uint64 b) { return CacheKeyHelper(s, {b, b}); } diff --git a/tensorflow/c/eager/runtime.h b/tensorflow/c/eager/runtime.h index e28a416e67f..4d20b5244a4 100644 --- a/tensorflow/c/eager/runtime.h +++ b/tensorflow/c/eager/runtime.h @@ -43,7 +43,7 @@ typedef std::unordered_map AttrTypeMap; Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out); // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'. -Status AttrTypeByName(const AttrTypeMap* m, const string& attr_name, +Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name, TF_AttrType* out, unsigned char* is_list); // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through. diff --git a/tensorflow/c/eager/runtime_test.cc b/tensorflow/c/eager/runtime_test.cc index 2ccca66f672..643153058ce 100644 --- a/tensorflow/c/eager/runtime_test.cc +++ b/tensorflow/c/eager/runtime_test.cc @@ -63,17 +63,17 @@ TEST(AttrTypeMap, Lookup) { TF_AttrType t; unsigned char is_list = 1; - s = AttrTypeByName(m, "ThisAttribyteCannotPossiblyExist", &t, &is_list); + s = AttrTypeByName(*m, "ThisAttribyteCannotPossiblyExist", &t, &is_list); EXPECT_FALSE(s.ok()); EXPECT_NE(is_list, 0); - s = AttrTypeByName(m, "transpose_a", &t, &is_list); + s = AttrTypeByName(*m, "transpose_a", &t, &is_list); ASSERT_TRUE(s.ok()) << s; EXPECT_EQ(TF_ATTR_BOOL, t); EXPECT_EQ(is_list, 0); s = AttrTypeMapForOp("Squeeze", &m); ASSERT_TRUE(s.ok()) << s; - s = AttrTypeByName(m, "squeeze_dims", &t, &is_list); + s = AttrTypeByName(*m, "squeeze_dims", &t, &is_list); ASSERT_TRUE(s.ok()) << s; EXPECT_EQ(TF_ATTR_INT, t); EXPECT_NE(is_list, 0); diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 2b65e38f540..bdb0815d6b6 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -18,12 +18,12 @@ limitations under the License. // Language-agnostic gradient tape. Does not perform backpropagation, just // maintains the data structures required to do so. -#include -#include #include #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -54,11 +54,11 @@ struct OpTapeEntry { // Map from tensor_id to internally-defined operation-id of the operation which // produced this tensor. A value of -1 means that the tensor was directly // watched and not the result of any operation in the tape. -using TensorTape = std::unordered_map; +using TensorTape = gtl::FlatMap; // Map from operation-id to tape entry. template -using OpTape = std::unordered_map>; +using OpTape = gtl::FlatMap>; // Operations the tape needs to perform on tensors to do backpropagation. Named // "vspace" because a subset of these are related to a vector space, such as @@ -159,7 +159,7 @@ class GradientTape { // Map from tensor id to number of remaining usages (i.e. how many entries in // the tape refer to it); to aid in tape garbage collection. - std::unordered_map tensor_usage_; + gtl::FlatMap tensor_usage_; // If false, all activations are deleted in the first call to ComputeGradient. // Else, only when this is destructed. 
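The tape changes in this hunk replace `std::unordered_map`/`std::unordered_set` with `tensorflow::gtl::FlatMap`/`FlatSet`. The sketch below is illustrative only (not part of this patch) and shows why the swap is a drop-in change: the flat containers expose the same `operator[]`/`find`/`insert`/`erase`/`count` surface while using an open-addressing layout that tends to be more cache-friendly for small keys such as tensor ids.

```c++
#include "tensorflow/core/lib/gtl/flatmap.h"
#include "tensorflow/core/lib/gtl/flatset.h"
#include "tensorflow/core/platform/types.h"

// Illustrative sketch mirroring the tape's tensor-usage bookkeeping.
void FlatContainerSketch() {
  tensorflow::gtl::FlatMap<tensorflow::int64, tensorflow::int64> usage_counts;
  usage_counts[7] = 2;             // operator[] inserts, like unordered_map.
  auto it = usage_counts.find(7);  // find()/end() behave the same way.
  if (it != usage_counts.end() && --it->second == 0) {
    usage_counts.erase(7);         // erase-by-key, as in unordered_map.
  }

  tensorflow::gtl::FlatSet<tensorflow::int64> sources;
  sources.insert(7);
  const bool is_source = sources.count(7) > 0;
  (void)is_source;
}
```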
@@ -286,11 +286,11 @@ struct BackpropInitialState { // Map from tensor ID to how many references still exist for this tensor in // the tape. - std::unordered_map tensor_usage_counts; + gtl::FlatMap tensor_usage_counts; // Maps from op ID to how many output tensors of this op still need to have // their gradients computed. - std::unordered_map op_missing_tensor; + gtl::FlatMap op_missing_tensor; }; // If `persistent_tape` is true, op_tape is not changed and none of the @@ -301,8 +301,8 @@ struct BackpropInitialState { template BackpropInitialState PrepareBackprop( gtl::ArraySlice target, const TensorTape& tensor_tape, - OpTape* op_tape, - const std::unordered_set& sources_set, bool persistent_tape) { + OpTape* op_tape, const gtl::FlatSet& sources_set, + bool persistent_tape) { std::vector tensor_stack; tensor_stack.reserve(target.size()); for (auto t : target) { @@ -362,7 +362,7 @@ BackpropInitialState PrepareBackprop( template std::vector InitialStack( const OpTape& op_tape, - const std::unordered_map& op_missing_tensor) { + const gtl::FlatMap& op_missing_tensor) { std::vector result; for (auto& op_entry : op_tape) { if (op_missing_tensor.find(op_entry.first) == op_missing_tensor.end()) { @@ -373,13 +373,13 @@ std::vector InitialStack( } template -Status InitialGradients( - const VSpace& vspace, - gtl::ArraySlice target_tensor_ids, - gtl::ArraySlice output_gradients, const TensorTape& tensor_tape, - const OpTape& op_tape, - const std::unordered_map& tensor_usage_counts, - std::unordered_map>* result) { +Status InitialGradients(const VSpace& vspace, + gtl::ArraySlice target_tensor_ids, + gtl::ArraySlice output_gradients, + const TensorTape& tensor_tape, + const OpTape& op_tape, + const gtl::FlatMap& tensor_usage_counts, + gtl::FlatMap>* result) { for (int i = 0; i < target_tensor_ids.size(); ++i) { const int64 id = target_tensor_ids[i]; if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) { @@ -441,13 +441,13 @@ Status GradientTape::ComputeGradient( gtl::ArraySlice source_tensor_ids, gtl::ArraySlice output_gradients, std::vector* result) { - std::unordered_set sources_set(source_tensor_ids.begin(), - source_tensor_ids.end()); + gtl::FlatSet sources_set(source_tensor_ids.begin(), + source_tensor_ids.end()); BackpropInitialState state = PrepareBackprop( target_tensor_ids, tensor_tape_, &op_tape_, sources_set, persistent_); std::vector op_stack = InitialStack(state.op_tape, state.op_missing_tensor); - std::unordered_map> gradients; + gtl::FlatMap> gradients; Status s = InitialGradients(vspace, target_tensor_ids, output_gradients, tensor_tape_, state.op_tape, state.tensor_usage_counts, &gradients); @@ -463,7 +463,7 @@ Status GradientTape::ComputeGradient( cleanup(); return s; } - std::unordered_map gradients_size; + gtl::FlatMap gradients_size; // TODO(apassos) multiple threads could be dequeuing from op_stack at the same // time, for better CPU backprop performance. 
VLOG(1) << "Initial stack:"; @@ -472,11 +472,10 @@ Status GradientTape::ComputeGradient( VLOG(1) << " " << t; } } - std::unordered_map> - functions_accept_none_for_indices({ - {"SoftmaxCrossEntropyWithLogits", {1}}, - {"FusedBatchNorm", {1, 2, 3, 4}}, - }); + gtl::FlatMap> functions_accept_none_for_indices({ + {"SoftmaxCrossEntropyWithLogits", {1}}, + {"FusedBatchNorm", {1, 2, 3, 4}}, + }); while (!op_stack.empty()) { const int64 op = op_stack.back(); VLOG(1) << "Popped " << op; diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index c9ade5fb83f..9060c19e9d2 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -433,6 +433,7 @@ tf_gen_op_wrappers_cc( "linalg_ops", "logging_ops", "lookup_ops", + "manip_ops", "math_ops", "nn_ops", "no_op", diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index acef098c7d0..faa1e378d07 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -96,7 +96,9 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto, Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def, const SessionOptions& session_options, std::unique_ptr* session) { - session->reset(NewSession(session_options)); + Session* session_p = nullptr; + TF_RETURN_IF_ERROR(NewSession(session_options, &session_p)); + session->reset(session_p); return (*session)->Create(meta_graph_def.graph_def()); } diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc index 0ad6b33bba5..4c64d2cfe3c 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/loader_test.cc @@ -155,6 +155,24 @@ TEST_F(LoaderTest, NoTagMatchMultiple) { << st.error_message(); } +TEST_F(LoaderTest, SessionCreationFailure) { + SavedModelBundle bundle; + // Use invalid SessionOptions to cause session creation to fail. Default + // options work, so provide an invalid value for the target field. 
+ SessionOptions session_options; + constexpr char kInvalidTarget[] = "invalid target"; + session_options.target = kInvalidTarget; + RunOptions run_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle); + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget)) + << st.error_message(); +} + TEST_F(LoaderTest, PbtxtFormat) { SavedModelBundle bundle; SessionOptions session_options; diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD index 0a7c37383f9..97f66e79b8a 100644 --- a/tensorflow/cc/tools/BUILD +++ b/tensorflow/cc/tools/BUILD @@ -23,7 +23,6 @@ cc_library( "//tensorflow/core:core_cpu", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensorflow", ], ) diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 2b9c83ba149..58572fea3db 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -4,7 +4,7 @@ To use from your BUILD file, add the following line to load the macro: -load("@org_tensorflow//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") +load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") Then call the macro like this: @@ -16,14 +16,15 @@ tf_library( ) """ -load("@org_tensorflow//tensorflow:tensorflow.bzl", "if_android", "tf_copts") +load("//tensorflow:tensorflow.bzl", + "if_android", "tf_cc_test", "tf_copts") def tf_library(name, graph, config, freeze_checkpoint=None, freeze_saver=None, cpp_class=None, gen_test=True, gen_benchmark=True, visibility=None, testonly=None, tfcompile_flags=None, - tfcompile_tool="@org_tensorflow//tensorflow/compiler/aot:tfcompile", + tfcompile_tool="//tensorflow/compiler/aot:tfcompile", include_standard_runtime_deps=True, deps=None, tags=None): """Runs tfcompile to compile a TensorFlow graph into executable code. @@ -119,9 +120,9 @@ def tf_library(name, graph, config, out_nodes_file, ] + freeze_saver_srcs, outs=[freeze_file], - cmd=("$(location @org_tensorflow//tensorflow/python/tools:freeze_graph)" + + cmd=("$(location //tensorflow/python/tools:freeze_graph)" + freeze_args), - tools=["@org_tensorflow//tensorflow/python/tools:freeze_graph"], + tools=["//tensorflow/python/tools:freeze_graph"], tags=tags, ) tfcompile_graph = freeze_file @@ -213,22 +214,22 @@ def tf_library(name, graph, config, # These deps are required by all tf_library targets even if # include_standard_runtime_deps is False. Without them, the # generated code will fail to compile. - "@org_tensorflow//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", - "@org_tensorflow//tensorflow/core:framework_lite", + "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", + "//tensorflow/core:framework_lite", ] + (need_xla_data_proto and [ # If we're generating the program shape, we must depend on the proto. - "@org_tensorflow//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla:xla_data_proto", ] or []) + (include_standard_runtime_deps and [ # TODO(cwhipkey): only depend on kernel code that the model actually needed. 
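Returning to the SavedModel loader change earlier in this diff (`loader.cc`): the single-argument `NewSession(options)` overload returns a null pointer on failure without reporting why, which is what the new `SessionCreationFailure` test guards against. A standalone sketch of the checked pattern, illustrative only and not part of this patch:

```c++
#include <memory>

#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/public/session_options.h"

// Illustrative sketch: create a session via the Status-returning overload so
// that a bad SessionOptions (e.g., an unparsable target) is reported as an
// error instead of silently yielding a null session.
tensorflow::Status CreateSessionChecked(
    const tensorflow::SessionOptions& options,
    std::unique_ptr<tensorflow::Session>* session) {
  tensorflow::Session* raw_session = nullptr;
  TF_RETURN_IF_ERROR(tensorflow::NewSession(options, &raw_session));
  session->reset(raw_session);
  return tensorflow::Status::OK();
}
```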
- "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", - "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_conv2d", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_matmul", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d", - "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", + "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", + "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d", + "//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx", + "//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon", + "//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1", + "//tensorflow/compiler/xla/service/cpu:runtime_conv2d", + "//tensorflow/compiler/xla/service/cpu:runtime_matmul", + "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d", + "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//third_party/eigen3", ] or []) + (deps or []), tags=tags, @@ -254,28 +255,32 @@ def tf_library(name, graph, config, name=("gen_" + test_name), testonly=1, srcs=[ - "@org_tensorflow//tensorflow/compiler/aot:test.cc", + "//tensorflow/compiler/aot:test.cc", header_file, ], outs=[test_file], cmd=("sed " + sed_replace + - " $(location @org_tensorflow//tensorflow/compiler/aot:test.cc) " + + " $(location //tensorflow/compiler/aot:test.cc) " + "> $(OUTS)"), tags=tags, ) - # The cc_test rule for the generated code. - native.cc_test( + # The cc_test rule for the generated code. To ensure that this works + # reliably across build configurations, we must use tf_cc_test instead of + # native.cc_test. This is related to how we build + # //tensorflow/core:lib -- see the note in tensorflow/core/BUILD + # for more details. + tf_cc_test( name=test_name, srcs=[test_file], deps=[ ":" + name, - "@org_tensorflow//tensorflow/compiler/aot:runtime", - "@org_tensorflow//tensorflow/compiler/aot:tf_library_test_main", - "@org_tensorflow//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/aot:runtime", + "//tensorflow/compiler/aot:tf_library_test_main", + "//tensorflow/compiler/xla:executable_run_options", "//third_party/eigen3", - "@org_tensorflow//tensorflow/core:lib", - "@org_tensorflow//tensorflow/core:test", + "//tensorflow/core:lib", + "//tensorflow/core:test", ], tags=tags, ) @@ -283,7 +288,7 @@ def tf_library(name, graph, config, if gen_benchmark: benchmark_name = name + "_benchmark" benchmark_file = benchmark_name + ".cc" - benchmark_main = ("@org_tensorflow//tensorflow/compiler/aot:" + + benchmark_main = ("//tensorflow/compiler/aot:" + "benchmark_main.template") # Rule to rewrite benchmark.cc to produce the benchmark_file. @@ -301,7 +306,9 @@ def tf_library(name, graph, config, tags=tags, ) - # The cc_benchmark rule for the generated code. + # The cc_benchmark rule for the generated code. This does not need the + # tf_cc_binary since we (by deliberate design) do not depend on + # //tensorflow/core:lib. 
# # Note: to get smaller size on android for comparison, compile with: # --copt=-fvisibility=hidden @@ -315,12 +322,12 @@ def tf_library(name, graph, config, linkopts = if_android(["-pie", "-s"]), deps=[ ":" + name, - "@org_tensorflow//tensorflow/compiler/aot:benchmark", - "@org_tensorflow//tensorflow/compiler/aot:runtime", - "@org_tensorflow//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/aot:benchmark", + "//tensorflow/compiler/aot:runtime", + "//tensorflow/compiler/xla:executable_run_options", "//third_party/eigen3", ] + if_android([ - "@org_tensorflow//tensorflow/compiler/aot:benchmark_extra_android", + "//tensorflow/compiler/aot:benchmark_extra_android", ]), tags=tags, ) @@ -330,11 +337,11 @@ def target_llvm_triple(): # TODO(toddw): Add target_triple for other targets. For details see: # http://llvm.org/docs/doxygen/html/Triple_8h_source.html return select({ - "@org_tensorflow//tensorflow:android_armeabi": "armv5-none-android", - "@org_tensorflow//tensorflow:android_arm": "armv7-none-android", - "@org_tensorflow//tensorflow:android_arm64": "aarch64-none-android", - "@org_tensorflow//tensorflow:android_x86": "i686-none-android", - "@org_tensorflow//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", - "@org_tensorflow//tensorflow:darwin": "x86_64-none-darwin", + "//tensorflow:android_armeabi": "armv5-none-android", + "//tensorflow:android_arm": "armv7-none-android", + "//tensorflow:android_arm64": "aarch64-none-android", + "//tensorflow:android_x86": "i686-none-android", + "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", + "//tensorflow:darwin": "x86_64-none-darwin", "//conditions:default": "x86_64-pc-linux", }) diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 0de163d3a8f..9c372a01278 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -30,12 +30,14 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/optimization_registry.h" +#include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/gtl/map_util.h" @@ -141,8 +143,7 @@ struct NodeSlot { // everything to use it. static const char* const kArgOp = "_Arg"; static const char* const kRetValOp = "_Retval"; -static const char* const kSendToHostOp = "_XlaSendToHost"; -static const char* const kRecvFromHostOp = "_XlaRecvFromHost"; +static const char* const kHostComputeOp = "_XlaHostCompute"; static const char* const kSendFromHostOp = "_XlaSendFromHost"; static const char* const kRecvAtHostOp = "_XlaRecvAtHost"; @@ -171,7 +172,8 @@ class Encapsulator { // Write a copy of the input graph to 'graph_out', where the subgraphs are // replaced with calls to the new functions. 
- Status BuildOutputGraph(bool parallel_checking, Graph* graph_out); + Status BuildOutputGraph(bool parallel_checking, Graph* graph_out, + FunctionLibraryDefinition* library); private: // A subgraph of the input, all marked with a common 'group_attribute' @@ -201,21 +203,29 @@ class Encapsulator { // .. . // RAH --> C --> SFH // - // The compiled cluster is as follows. STH is a SendToHost node which is the - // source of a channel to the RAH node above. RFH is a RecvFromHost node which - // is the destination of a channel from the SFH node above. There is a control - // edge that ensures RFH follows STH, which is used in shape inference to - // ensure that the shapes on the STH host channel are known before the RFH - // channel is compiled. + // The compiled cluster is as follows. HC is a HostCompute node which is the + // source of a channel to the RAH node above and the destination of a channel + // from the SFH node above. // - // Arg --> B --> STH ..> RFH --> D --> Retval + // Arg --> B --> HC --> D --> Retval // - // The channels STH/RAH and SFH/RFH each transmit a tuple, so there is at most - // one RAH and SFH in each compiled cluster. This design is preferred over - // adding separate Arg/Retval nodes for each transmitted value because it - // simplifies the host code that would like to limit communication between - // host and device and, e.g., raise only one interrupt per channel rather than - // one per transmitted value. + // The channels HC/RAH and SFH/HC each transmit multiple tensors, so there is + // at most one RAH and SFH in each outside_compilation cluster. This design is + // preferred over adding separate Arg/Retval nodes for each transmitted value + // because it allows optimizations to the host code that would like to limit + // communication between host and device and, e.g., raise only one interrupt + // per channel rather than one per transmitted value. + // + // The shapes of the outputs from the HC node in general cannot be determined + // until the shapes of its inputs are known at compile time, since e.g., + // above, the shape of C's outputs aren't known until the shape of its inputs + // are known. If the shapes of the HC's outputs can be determined during the + // rewrite, they are stored in the node's 'shapes' attr. Otherwise a minimal + // graph is stored in the shape_inference_graph attr. This graph can be used + // when compiling the HC Op to determined the shape of the SFH inputs given + // the shapes of any ancestor RAH outputs. If it can be determined that the + // shape of the SFH inputs will not be inferrable even once the shapes of the + // RAH outputs are known, an error is returned by the rewriter. class Subgraph { public: // Creates a graph to build the subgraph in, if it doesn't already exist, @@ -246,6 +256,10 @@ class Encapsulator { const std::unordered_map& node_images, Graph* graph_out); + // Returns the names of all the outside_compilation subgraphs in this + // Subgraph. + void GetOutsideCompilationSubgraphNames(std::vector* names) const; + // Returns the Node that inputs to the function should be wired up to. Node* GetCallNodeForInputs() const; @@ -305,15 +319,9 @@ class Encapsulator { void RecordOutsideCompilationOutputOrControl( const string& outside_compilation_id, const Edge* edge); - // Adds the SendToHost nodes for each outside_compilation subgraph once the - // edges have all been recorded via RecordOutsideCompilationInputOrControl. 
- Status AddSendsToOutsideCompilation( - const std::unordered_map& node_images); - - // Adds the RecvFromHost nodes for each outside_compilation subgraph once - // the edges have all been recorded via - // RecordOutsideCompilationOutputOrControl. - Status AddRecvsFromOutsideCompilation( + // Adds the HostCompute nodes for each outside_compilation subgraph. + Status AddHostComputes( + const string& subgraph_name, const std::unordered_map& node_images); // Creates the sequencer node if it doesn't exist, adding it to graph_out. @@ -323,10 +331,16 @@ class Encapsulator { // all the downstream nodes of call_node_outputs. void ConnectSequencerToOutputs(Graph* graph_out); + Status AddShapeInferenceInfo( + const string& outside_compilation_subgraph_name, + const std::vector& shapes, GraphDef* inference_graph); + + Status ReplaceFunctionDef(FunctionLibraryDefinition* library); + private: struct OutsideCompilationSubgraph { // Map from source (producer node/slot) tensors in the original graph to - // input index (slot number in the SendToHost/RecvAtHost nodes that will + // input index (slot number in the HostCompute/RecvAtHost nodes that will // be created) for the outside_compilation subgraph. std::unordered_map inputs; @@ -335,14 +349,14 @@ class Encapsulator { // outside_compilation subgraph. These are recorded by // RecordOutsideCompilationInputOrControl while walking all the subgraph // edges, and lifted control edges within the subgraph are added by - // AddSendsToOutsideCompilation once the _SendToHost node has been + // AddSendsToOutsideCompilation once the _HostCompute node has been // created. The matching control edge from _RecvAtHost to the // destination is added by CopyEdgeToOutputGraph. std::unordered_set control_inputs; // Maps from source (producer node/slot) and destination (consumer // node/slot) tensors in the original graph to output index (slot number - // in the SendFromHost/RecvFromHost nodes that will be created) for the + // in the SendFromHost/HostCompute nodes that will be created) for the // outside_compilation subgraph. std::unordered_map outputs_by_src; std::unordered_map outputs_by_dst; @@ -352,13 +366,13 @@ class Encapsulator { // containing compiled subgraph. These are recorded by // RecordOutsideCompilationOutputOrControl while walking all the subgraph // edges, and lifted control edges within the subgraph are added by - // AddRecvsFromToOutsideCompilation once the _RecvFromHost node has been + // AddRecvsFromToOutsideCompilation once the _HostCompute node has been // created. The matching control edge from the source to _SendFromHost to // the destination is added by CopyEdgeToOutputGraph. std::unordered_set control_outputs; - // _SendToHost node in the subgraph. Not owned. - Node* send_to_host = nullptr; + // Name of the _HostCompute node in the subgraph. + string host_compute_name; // _RecvAtHost node in the output graph. Not owned. Node* recv_at_host = nullptr; @@ -516,6 +530,59 @@ class Encapsulator { const std::unordered_map& node_images, bool parallel_checking, Graph* graph_out); + // Constructs a minimal shape inference graph that can be used to determine + // the shape of send_node at the time that the subgraph is compiled. + // recv_at_host_nodes contains the names of all the recv_at_host nodes that + // send_node might depend on. These recv_at_host nodes have shapes that are + // not known during the rewrite pass, but will be known at compile time. 
+ // + // If the shapes of all the inputs to send_node can be determined during the + // rewrite pass, on exit graphdef_out is empty and the shapes are returned in + // static_shape_out. Otherwise graphdef_out contains a graph that can be used + // for shape inference at compile time, where all the source nodes of the + // graph are either constants with known shapes, or nodes named in + // recv_at_host_nodes. + // + // A non-OK status is returned if neither of the above conditions can be + // satisfied, e.g., because send_node depends on a node that doesn't have a + // registered shape inference function. + Status DoStaticShapeInferenceForOutsideCompilationSend( + const Graph& graph_in, const ShapeRefiner& shape_refiner, + const std::unordered_set& recv_at_host_nodes, Node* send_node, + FunctionLibraryDefinition* library, + std::vector* static_shape_out, + std::unique_ptr* graphdef_out); + + // Makes a copy of graph containing only nodes that are ancestors of at least + // one node in send_from_host_nodes and store it in pruned_graph. On exit + // nodes_images contains a mapping from nodes in graph to nodes in + // pruned_graph. All functions in the copied graph are inlined. + Status MakePrunedGraphCopyAndInline( + const Graph& graph, const std::vector& sink_nodes, + std::unique_ptr* pruned_graph, + std::unordered_map* node_images, + FunctionLibraryDefinition* library); + + // Makes a copy of graph containing only nodes that are ancestors of a + // send_from_host node in an outside_compilation subgraph, and store it in + // pruned_graph. Also perform shape inference on the pruned graph, using + // shape_refiner. On exit node_images contains a mapping from nodes in graph + // to nodes in pruned_graph. + Status MakeGraphForOutsideCompilationSends( + const Graph& graph, std::unique_ptr* pruned_graph, + ShapeRefiner* shape_refiner, + std::unordered_map* node_images, + FunctionLibraryDefinition* library); + + // Performs static shape inference, as far as possible, for the send_from_host + // nodes in each outside_compilation subgraph. Where it is not possible to + // determine the shape statically, stores a serialized GraphDef in the + // HostCompute 'shape_inference_graph' attr, to be used at compile time for + // final inference. If the shapes are known statically they are stored in the + // HostCompute 'shapes' attr. + Status GetShapeInfoForOutsideCompilationSends( + Graph* graph_out, FunctionLibraryDefinition* library); + const string group_attribute_; const string outside_compilation_attribute_; const Graph* graph_in_; @@ -682,16 +749,20 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl( } } -Status Encapsulator::Subgraph::AddSendsToOutsideCompilation( +Status Encapsulator::Subgraph::AddHostComputes( + const string& subgraph_name, const std::unordered_map& node_images) { for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) { const string& oc_subgraph_name = oc_subgraph_iter.first; OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second; - if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) { - // Build a _SendToHost node sending all the args of the appropriate - // types. - std::vector dtypes(oc_subgraph.inputs.size(), DT_INVALID); + if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() || + !oc_subgraph.outputs_by_src.empty() || + !oc_subgraph.control_outputs.empty()) { + // Build a _HostCompute node. 
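As a recap of the DoStaticShapeInferenceForOutsideCompilationSend contract documented above, a sketch of the caller side; it assumes pruned_graph, shape_refiner, recv_at_host_names, send_node and library are in scope inside a Status-returning function, as they are in GetShapeInfoForOutsideCompilationSends further down.

    std::vector<TensorShapeProto> static_shape;
    std::unique_ptr<GraphDef> graphdef;
    TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
        *pruned_graph, shape_refiner, recv_at_host_names, send_node, library,
        &static_shape, &graphdef));
    if (graphdef == nullptr) {
      // Every input shape of the SendFromHost node was fully defined during the
      // rewrite; static_shape is written into the HostCompute 'shapes' attr.
    } else {
      // At least one shape depends on a RecvAtHost output; the minimal graph in
      // graphdef is serialized into the 'shape_inference_graph' attr and run at
      // compile time once those shapes are known.
    }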
std::vector inputs(oc_subgraph.inputs.size()); + std::vector input_dtypes(oc_subgraph.inputs.size(), DT_INVALID); + std::vector output_dtypes(oc_subgraph.outputs_by_src.size(), + DT_INVALID); for (const auto& input_src : oc_subgraph.inputs) { const Node* src_node = input_src.first.node; @@ -700,94 +771,64 @@ Status Encapsulator::Subgraph::AddSendsToOutsideCompilation( int input_index = input_src.second; DataType dtype = src_node->output_type(src_slot); - dtypes[input_index] = dtype; inputs[input_index].Reset(src_image->name(), src_slot, dtype); + input_dtypes[input_index] = dtype; } - NodeDef send_def; - NodeDefBuilder builder( - strings::StrCat("outside_compilation_", oc_subgraph_name, "_send"), - kSendToHostOp); - builder.Attr("dtypes", dtypes); + for (const auto& output : oc_subgraph.outputs_by_src) { + DataType dtype = output.first.dtype; + int output_index = output.second; + output_dtypes[output_index] = dtype; + } + + NodeDef host_compute_def; + NodeDefBuilder builder(strings::StrCat("outside_compilation_", + oc_subgraph_name, "_host_compute"), + kHostComputeOp); builder.Input(inputs); - Status s = builder.Finalize(&send_def); + builder.Attr("Tinputs", input_dtypes); + builder.Attr("Toutputs", output_dtypes); + builder.Attr("key", + strings::StrCat("host_compute_channel_", subgraph_name, "_", + oc_subgraph_name)); + Status s = builder.Finalize(&host_compute_def); if (!s.ok()) return s; - oc_subgraph.send_to_host = graph_->AddNode(send_def, &s); + Node* host_compute = graph_->AddNode(host_compute_def, &s); if (!s.ok()) return s; + oc_subgraph.host_compute_name = host_compute->name(); - // Connect the _SendToHost node to its producers in the subgraph. + // Connect the _HostCompute node to its producers in the subgraph. for (auto& input_src : oc_subgraph.inputs) { const Node* src_node = input_src.first.node; Node* src_image = node_images.at(src_node); int src_slot = input_src.first.slot; int input_index = input_src.second; - graph_->AddEdge(src_image, src_slot, oc_subgraph.send_to_host, - input_index); + graph_->AddEdge(src_image, src_slot, host_compute, input_index); } - // Connect the _SendToHost node to its control edge producers in the + // Connect the _HostCompute node to its control edge producers in the // subgraph. for (const auto& src_node : oc_subgraph.control_inputs) { Node* src_image = node_images.at(src_node); - graph_->AddControlEdge(src_image, oc_subgraph.send_to_host); - } - } - } - - return Status::OK(); -} - -Status Encapsulator::Subgraph::AddRecvsFromOutsideCompilation( - const std::unordered_map& node_images) { - for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) { - const string& oc_subgraph_name = oc_subgraph_iter.first; - OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second; - if (!oc_subgraph.outputs_by_src.empty() || - !oc_subgraph.control_outputs.empty()) { - // Build a _RecvFromHost node producing all the outputs of the appropriate - // types. 
- std::vector dtypes(oc_subgraph.outputs_by_src.size(), - DT_INVALID); - - for (const auto& output : oc_subgraph.outputs_by_src) { - DataType dtype = output.first.dtype; - int output_index = output.second; - dtypes[output_index] = dtype; + graph_->AddControlEdge(src_image, host_compute); } - NodeDef recv_def; - NodeDefBuilder builder( - strings::StrCat("outside_compilation_", oc_subgraph_name, "_recv"), - kRecvFromHostOp); - builder.Attr("dtypes", dtypes); - Status s = builder.Finalize(&recv_def); - if (!s.ok()) return s; - - Node* recv = graph_->AddNode(recv_def, &s); - if (!s.ok()) return s; - - // Connect the consumers in the subgraph to the _RecvFromHost node. + // Connect the consumers in the subgraph to the _HostCompute node. for (const auto& output : oc_subgraph.outputs_by_dst) { const Node* dst_node = output.first.node; Node* dst_image = node_images.at(dst_node); int dst_slot = output.first.slot; int output_index = output.second; - graph_->AddEdge(recv, output_index, dst_image, dst_slot); + graph_->AddEdge(host_compute, output_index, dst_image, dst_slot); } - // Connect the control edge consumers in the subgraph to the _RecvFromHost + // Connect the control edge consumers in the subgraph to the _HostCompute // node. for (const auto& dst_node : oc_subgraph.control_outputs) { Node* dst_image = node_images.at(dst_node); - graph_->AddControlEdge(recv, dst_image); - } - - // Add a control edge in the subgraph so that the _SendToHost node, if - // any, is compiled before the _RecvFromHost node. - if (oc_subgraph.send_to_host != nullptr) { - graph_->AddControlEdge(oc_subgraph.send_to_host, recv); + graph_->AddControlEdge(host_compute, dst_image); } } } @@ -882,6 +923,63 @@ Status Encapsulator::Subgraph::BuildFunctionDef( return Status::OK(); } +Status Encapsulator::Subgraph::AddShapeInferenceInfo( + const string& outside_compilation_subgraph_name, + const std::vector& shapes, GraphDef* inference_graph) { + OutsideCompilationSubgraph& oc_subgraph = + outside_compilation_subgraphs_.at(outside_compilation_subgraph_name); + + Node* host_compute = nullptr; + for (Node* n : graph_->nodes()) { + if (n->name() == oc_subgraph.host_compute_name) { + host_compute = n; + break; + } + } + if (host_compute == nullptr) { + return errors::InvalidArgument( + "After rewriting subgraph ", outside_compilation_subgraph_name, + " there is no HostCompute Op for outside compilation subgraph ", + oc_subgraph.host_compute_name); + } + + if (inference_graph == nullptr) { + host_compute->AddAttr("shape_inference_graph", ""); + host_compute->AddAttr("shapes", shapes); + } else { + string serialized_graph; + if (!inference_graph->SerializeToString(&serialized_graph)) { + return errors::Internal( + "Failed to serialize graph for outside compilation subgraph ", + oc_subgraph.host_compute_name); + } + host_compute->AddAttr("shape_inference_graph", serialized_graph); + host_compute->AddAttr("shapes", std::vector()); + } + return Status::OK(); +} + +Status Encapsulator::Subgraph::ReplaceFunctionDef( + FunctionLibraryDefinition* library) { + const string& name = call_node_def_.name(); + + FunctionDef fdef; + TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef)); + + if (VLOG_IS_ON(1)) { + VLOG(2) << "Replace function def " << name; + dump_graph::DumpGraphToFile( + strings::StrCat("replace_encapsulate_fdef_graph_", name), *graph_, + library); + dump_graph::DumpFunctionDefToFile( + strings::StrCat("replace_encapsulate_fdef_", name), fdef); + } + + TF_RETURN_IF_ERROR(library->RemoveFunction(name)); + 
TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); + return Status::OK(); +} + Status Encapsulator::Subgraph::BuildParallelCheckOp( const std::unordered_map& node_images, Graph* graph_out) { @@ -980,7 +1078,9 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode( NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name, "_", oc_subgraph_name, "_recv"), kRecvAtHostOp); - builder.Attr("dtypes", dtypes); + builder.Attr("Toutputs", dtypes); + builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name, + "_", oc_subgraph_name)); Status s = builder.Finalize(&recv_def); if (!s.ok()) return s; @@ -1020,7 +1120,9 @@ Status Encapsulator::Subgraph::AddSendFromHostNode( NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name, "_", oc_subgraph_name, "_send"), kSendFromHostOp); - builder.Attr("dtypes", dtypes); + builder.Attr("Tinputs", dtypes); + builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name, + "_", oc_subgraph_name)); builder.Input(inputs); Status s = builder.Finalize(&send_def); if (!s.ok()) return s; @@ -1062,6 +1164,13 @@ Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes( return Status::OK(); } +void Encapsulator::Subgraph::GetOutsideCompilationSubgraphNames( + std::vector* names) const { + for (auto& entry : outside_compilation_subgraphs_) { + names->push_back(entry.first); + } +} + Status Encapsulator::GetFunctionNameAttr( Node const* node, string* attr, string* outside_compilation_attr) const { Status s = GetNodeAttr(node->attrs(), group_attribute_, attr); @@ -1220,8 +1329,7 @@ Status Encapsulator::SplitIntoSubgraphs() { // single input and output node for it. for (auto& entry : subgraphs_) { Subgraph& subgraph = entry.second; - TF_RETURN_IF_ERROR(subgraph.AddSendsToOutsideCompilation(node_images)); - TF_RETURN_IF_ERROR(subgraph.AddRecvsFromOutsideCompilation(node_images)); + TF_RETURN_IF_ERROR(subgraph.AddHostComputes(entry.first, node_images)); } MarkGuaranteedConstants(*graph_in_, src_arg_pairs); @@ -1509,8 +1617,346 @@ Status Encapsulator::AddEdgesToOutputGraph( return Status::OK(); } -Status Encapsulator::BuildOutputGraph(bool parallel_checking, - Graph* graph_out) { +namespace { + +// Adds a dummy Const node to graph_out. The "constant" has the type of +// data_type and the shape indicated in 'shape'. The dummy node is not a valid +// Const node because it does not have any value defined, but this doesn't +// matter because it will only be used subsequently for shape inference. (It +// would be possible to add a switch statement over data_type to create a value +// for the constant, but that would entail maintaining the logic as new types +// are added, and is not necessary.) +Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape, + Graph* graph_out) { + TensorProto dummy_proto; + dummy_proto.set_dtype(data_type); + *dummy_proto.mutable_tensor_shape() = shape; + // Don't set any value field in the proto, since it is only going to be used + // for shape inference. + + GraphDefBuilder::Options options(graph_out, /*status=*/nullptr); + NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const", + options.op_registry()); + node_builder.Attr("dtype", data_type).Attr("value", dummy_proto); + return options.FinalizeBuilder(&node_builder); +} + +// Adds a copy of node_in to graph_out and adds the mapping to +// copied_node_images. 
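A usage sketch for the helper above; the DT_FLOAT dtype and the [2] shape are illustrative, and graph_out is the shape-inference graph being assembled, as in the traversal below. Shape inference for Const reads the output shape from the 'value' attr's tensor_shape, so a dtype plus shape in the proto is all that is needed even though the node could never be executed.

    TensorShapeProto proto;
    proto.add_dim()->set_size(2);  // a known [2] vector
    Node* dummy = AddDummyShapedNode(DT_FLOAT, proto, graph_out.get());
    // Edges that read the original producer's output are redirected to read
    // dummy:0 instead, so downstream shape inference sees a fully defined shape.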
+Status CopyShapeInferenceNodeToGraph( + Node* node_in, const Node* send_node, + const std::unordered_map& dummy_node_images, + FunctionLibraryDefinition* library, + std::unordered_map* copied_node_images, Graph* graph_out) { + // Once all the ancestor nodes have been added to graph_out, add this node + // and connect it to its ancestors. + Node* node_out = graph_out->CopyNode(node_in); + (*copied_node_images)[node_in] = node_out; + // Don't bother to build the shape inference graph if there's a node with no + // shape inference function, since it would just result in an error later at + // compile time. + const OpRegistrationData* op_reg_data; + TF_RETURN_IF_ERROR(library->LookUp(node_in->type_string(), &op_reg_data)); + if (op_reg_data->shape_inference_fn == nullptr) { + return errors::InvalidArgument( + "Shape inference is not possible for outside_compilation " + "SendFromHost node ", + send_node->name(), " because it depends on node ", node_in->name(), + " which does not have a shape inference function registered."); + } + // Add all the edges to the newly copied node. + for (const Edge* in_edge : node_in->in_edges()) { + if (!in_edge->IsControlEdge()) { + Node* src = in_edge->src(); + const auto iter = dummy_node_images.find(src); + if (iter == dummy_node_images.end()) { + // The src is a copied node so use the original output port. + graph_out->AddEdge((*copied_node_images)[in_edge->src()], + in_edge->src_output(), node_out, + in_edge->dst_input()); + } else { + // The src is a dummy node so use output port 0. + graph_out->AddEdge(iter->second, 0, node_out, in_edge->dst_input()); + } + } + } + return Status::OK(); +} + +} // namespace + +Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend( + const Graph& graph_in, const ShapeRefiner& shape_refiner, + const std::unordered_set& recv_at_host_nodes, Node* send_node, + FunctionLibraryDefinition* library, + std::vector* static_shape_out, + std::unique_ptr* graphdef_out) { + // Maps from nodes in graph_in to nodes in graph_out. + // + // When an edge has fully defined shape the source node in graph_in is + // replaced in graph_out by a dummy constant node. The mapping from nodes + // in graph_in to dummy nodes is stored in dummy_node_images. + // + // When a node in graph_in has at least one ancestor that doesn't have fully + // defined shape, it is copied into graph_out. The mapping from nodes in + // graph_in to copied nodes is stored in copied_node_images. + // + // The two types of node are treated differently because, when adding edges to + // graph_out, an output from a dummy node always uses port 0, whereas an + // output from a copied node uses the same port that was used in graph_in. + std::unordered_map dummy_node_images; + std::unordered_map copied_node_images; + + std::unique_ptr graph_out(new Graph(graph_in.op_registry())); + graph_out->set_versions(graph_in.versions()); + static_shape_out->resize(send_node->num_inputs()); + + // We don't use the standard ReverseDFS because we want to cut off traversal + // whenever we find an output with fully defined shape. + // TODO(misard) make this work properly in the presence of control flow. + struct Work { + Node* node; + bool leave; // Are we entering or leaving node? 
+ }; + std::vector stack({{send_node, false}}); + std::vector visited(graph_in.num_node_ids(), false); + while (!stack.empty()) { + Work w = stack.back(); + stack.pop_back(); + Node* n = w.node; + + if (w.leave) { + TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph( + n, send_node, dummy_node_images, library, &copied_node_images, + graph_out.get())); + } else { + if (visited[n->id()]) continue; + visited[n->id()] = true; + + // Arrange to revisit when all done with all inputs. + stack.push_back(Work{n, true}); + + bool has_parent_with_unknown_shape = false; + for (const Edge* in_edge : n->in_edges()) { + if (!in_edge->IsControlEdge()) { + Node* src_node = in_edge->src(); + int src_port = in_edge->src_output(); + shape_inference::InferenceContext* context = + shape_refiner.GetContext(src_node); + shape_inference::ShapeHandle shape = context->output(src_port); + if (context->FullyDefined(shape)) { + // This ancestor has known shape, so instead of adding it to the + // stack, add a dummy node with that shape to graph_out and + // continue. + TensorShapeProto proto; + context->ShapeHandleToProto(shape, &proto); + dummy_node_images[src_node] = AddDummyShapedNode( + src_node->output_type(src_port), proto, graph_out.get()); + if (n == send_node) { + (*static_shape_out)[in_edge->dst_input()] = proto; + } + } else { + if (!visited[src_node->id()]) { + has_parent_with_unknown_shape = true; + stack.push_back({src_node, false}); + } + } + } + } + if (!has_parent_with_unknown_shape) { + if (n == send_node) { + // The shapes of all the inputs to send_node are statically known. We + // won't have to do any inference at compile time so return now: the + // shapes were stored in static_shape_out above. + graphdef_out->reset(); + return Status::OK(); + } else { + // Any shape that is being processed is either the original send node + // or has at least one output with statically-unknown shape. If the + // latter and it doesn't have any inputs with statically-unknown + // shape, then check that it is of the recv nodes that we can fill in + // the shape of at run-time later. If it isn't one of those, then we + // won't have any additional knowledge at compile time, so we already + // know we won't be able to do shape inference and we can return an + // error now. + if (recv_at_host_nodes.find(n->name()) == recv_at_host_nodes.end()) { + return errors::InvalidArgument( + "Shape inference is not possible for outside_compilation " + "SendFromHost node ", + send_node->name(), " because shape of node ", n->name(), + " will not be known at compilation time."); + } + } + } + } + } + + graphdef_out->reset(new GraphDef()); + graph_out->ToGraphDef(graphdef_out->get()); + + return Status::OK(); +} + +Status Encapsulator::MakePrunedGraphCopyAndInline( + const Graph& graph, const std::vector& sink_nodes, + std::unique_ptr* pruned_graph, + std::unordered_map* node_images, + FunctionLibraryDefinition* library) { + // First copy all ancestor nodes of sink_nodes into a new graph. + pruned_graph->reset(new Graph(library)); + (*pruned_graph)->set_versions(graph.versions()); + ReverseDFSFrom(graph, sink_nodes, + /*enter=*/nullptr, + /*leave=*/[&](Node* n) { + if (!n->IsSource()) { + Node* copied = (*pruned_graph)->CopyNode(n); + node_images->emplace(n, copied); + } + }); + + // Add all the edges between copied nodes. 
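The traversal above swaps the library ReverseDFS for a two-phase work stack so it can stop expanding whenever an input already has a fully defined shape. A generic sketch of that enter/leave pattern follows; Process and start are placeholders, not names from the pass.

    struct Work { Node* node; bool leave; };
    std::vector<Work> stack({{start, /*leave=*/false}});
    std::vector<bool> visited(graph.num_node_ids(), false);
    while (!stack.empty()) {
      Work w = stack.back();
      stack.pop_back();
      if (w.leave) {
        Process(w.node);                 // all of its inputs are already done
        continue;
      }
      if (visited[w.node->id()]) continue;
      visited[w.node->id()] = true;
      stack.push_back({w.node, true});   // revisit once the inputs are handled
      for (const Edge* e : w.node->in_edges()) {
        if (!e->IsControlEdge() && !visited[e->src()->id()]) {
          // The pass cuts the recursion here when e->src() already has a fully
          // defined output shape, substituting a dummy constant instead.
          stack.push_back({e->src(), false});
        }
      }
    }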
+ for (auto entry : *node_images) { + const Node* orig = entry.first; + Node* image = entry.second; + for (const Edge* out_edge : orig->out_edges()) { + auto iter = node_images->find(out_edge->dst()); + if (iter != node_images->end()) { + // The source and destination are both in the copied graph. + (*pruned_graph) + ->AddEdge(image, out_edge->src_output(), iter->second, + out_edge->dst_input()); + } + } + } + + // Find all the function call nodes, and inline them. + std::vector function_nodes; + for (auto node : (*pruned_graph)->nodes()) { + const OpRegistrationData* op_reg_data; + TF_RETURN_IF_ERROR(library->LookUp(node->type_string(), &op_reg_data)); + if (op_reg_data->is_function_op) { + function_nodes.push_back(node); + } + } + for (auto node : function_nodes) { + VLOG(2) << "Inlining function " << node->name(); + const FunctionDef* fdef = library->Find(node->type_string()); + if (fdef == nullptr) { + return errors::Internal("Failed to find function ", node->type_string(), + " in function library."); + } + FunctionBody* fbody = nullptr; + TF_RETURN_IF_ERROR( + FunctionDefToBodyHelper(*fdef, node->attrs(), library, + [library](const string& op, const OpDef** sig) { + return library->LookUpOpDef(op, sig); + }, + &fbody)); + InlineFunctionBody(*library, pruned_graph->get(), node, fbody); + delete fbody; + } + + return Status::OK(); +} + +Status Encapsulator::MakeGraphForOutsideCompilationSends( + const Graph& graph, std::unique_ptr* pruned_graph, + ShapeRefiner* shape_refiner, + std::unordered_map* node_images, + FunctionLibraryDefinition* library) { + // Find all the send_from_host nodes in all subgraphs, to use as roots for the + // pruning. + std::vector send_from_host_nodes; + for (auto& subgraph_entry : subgraphs_) { + Subgraph& subgraph = subgraph_entry.second; + std::vector outside_compilation_names; + subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names); + for (const auto& name : outside_compilation_names) { + Node* send_node = subgraph.GetSendFromHostNode(name); + if (send_node != nullptr) { + send_from_host_nodes.push_back(send_node); + } + } + } + + // Make a copy of all the graph nodes needed to evaluate the send_from_host + // nodes, inlining any functions as needed. + TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline( + graph, send_from_host_nodes, pruned_graph, node_images, library)); + + // Perform shape inference on the pruned graph. + shape_refiner->set_require_shape_inference_fns(false); + FixupSourceAndSinkEdges(pruned_graph->get()); + std::vector post_order; + GetReversePostOrder(*(*pruned_graph), &post_order); + for (auto node : post_order) { + // Ignore the status returned by the shape_refiner. At this point we want + // the best effort shapes, even if no shape function is registered for a + // node. + Status status = shape_refiner->AddNode(node); + if (!status.ok()) { + VLOG(1) << "Shape inference failed for node: " << status; + } + } + + return Status::OK(); +} + +Status Encapsulator::GetShapeInfoForOutsideCompilationSends( + Graph* graph_out, FunctionLibraryDefinition* library) { + std::unique_ptr pruned_graph; + ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry()); + std::unordered_map node_images; + TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends( + *graph_out, &pruned_graph, &shape_refiner, &node_images, library)); + + for (auto& subgraph_entry : subgraphs_) { + Subgraph& subgraph = subgraph_entry.second; + // Find all the recv_at_host nodes in this subgraph. 
+ std::vector outside_compilation_names; + subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names); + std::unordered_set recv_at_host_names; + for (const auto& name : outside_compilation_names) { + Node* recv_node = subgraph.GetRecvAtHostNode(name); + if (recv_node != nullptr) { + recv_at_host_names.insert(recv_node->name()); + } + } + // For each send_from_host node, do as much shape inference as possible + // without knowing the shape of the recv_at_host nodes, and store the + // result, along with enough information to complete the job at compile time + // once the recv_at_host shapes are known. + for (const auto& name : outside_compilation_names) { + Node* send_node = subgraph.GetSendFromHostNode(name); + std::vector static_shape; + std::unique_ptr graphdef; + if (send_node != nullptr) { + TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend( + *pruned_graph, shape_refiner, recv_at_host_names, + node_images[send_node], library, &static_shape, &graphdef)); + if (graphdef == nullptr) { + VLOG(2) << "Send node " << send_node->name() << " shapes"; + for (int i = 0; i < static_shape.size(); ++i) { + VLOG(2) << static_shape[i].DebugString(); + } + } else { + VLOG(2) << "Send node " << send_node->name() << " graph\n" + << graphdef->DebugString(); + } + } + TF_RETURN_IF_ERROR( + subgraph.AddShapeInferenceInfo(name, static_shape, graphdef.get())); + } + if (!outside_compilation_names.empty()) { + TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library)); + } + } + + return Status::OK(); +} + +Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out, + FunctionLibraryDefinition* library) { // Map from nodes in the input graph to nodes in the output graph. std::unordered_map node_images; @@ -1522,6 +1968,9 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking, TF_RETURN_IF_ERROR( AddEdgesToOutputGraph(node_images, parallel_checking, graph_out)); + TF_RETURN_IF_ERROR( + GetShapeInfoForOutsideCompilationSends(graph_out, library)); + return Status::OK(); } @@ -1545,7 +1994,7 @@ Status EncapsulateSubgraphsInFunctions( std::unique_ptr out(new Graph(library)); out->set_versions(graph_in.versions()); TF_RETURN_IF_ERROR( - encapsulator.BuildOutputGraph(parallel_checking, out.get())); + encapsulator.BuildOutputGraph(parallel_checking, out.get(), library)); *graph_out = std::move(out); return Status::OK(); diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index b100861d5e9..aed9cae0f17 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -29,17 +29,181 @@ limitations under the License. namespace tensorflow { namespace { -bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, - string* diff) { - // TODO(phawkins) use a more sophisticated equality test. 
- if (a.DebugString() != b.DebugString()) { +template +bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, + const ::tensorflow::protobuf::Map& b, + const std::function& key_to_string, + const std::function& value_to_string, + const std::function& compare, + const string& map_name, string* diff) { + for (const auto& elt_a : a) { + const auto iter = b.find(elt_a.first); + if (iter == b.end()) { + if (diff) { + *diff = strings::StrCat( + map_name, " expected: contains element with key '", + key_to_string(elt_a.first), "' got: map has no such element"); + } + return false; + } + if (!compare(elt_a.first, elt_a.second, iter->second)) { + if (diff) { + *diff = strings::StrCat(map_name, " expected: element with key '", + key_to_string(elt_a.first), " has value '", + value_to_string(elt_a.second), "' got: '", + value_to_string(iter->second), "'"); + } + return false; + } + } + for (const auto& elt_b : b) { + const auto iter = a.find(elt_b.first); + if (iter == a.end()) { + if (diff) { + *diff = strings::StrCat(map_name, " got: contains element with key '", + key_to_string(elt_b.first), + "' expected: map has no such element"); + } + return false; + } + } + return true; +} + +bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, + const string& diff_preamble, string* diff) { + if (a.op() != b.op()) { if (diff) { - *diff = strings::StrCat("Definition mismatch for function ", - a.signature().name(), ", expected:\n", - a.DebugString(), "\ngot:\n", b.DebugString()); + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + ", expected op '", a.op(), "' got '", b.op()); } return false; } + if (a.device() != b.device()) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + ", expected device '", a.device(), "' got '", + b.device()); + } + return false; + } + if (a.input_size() != b.input_size()) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + ", expected ", a.input_size(), " inputs got ", + b.input_size(), " expected:\n", a.DebugString(), + "\ngot:\n", b.DebugString()); + } + return false; + } + for (int i = 0; i < a.input_size(); ++i) { + if (a.input(i) != b.input(i)) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + " input ", i, ", expected ", a.input(i), + " got ", b.input(i), " expected:\n", + a.DebugString(), "\ngot:\n", b.DebugString()); + } + return false; + } + } + return EqualProtoMap( + a.attr(), b.attr(), [](const string& s) { return s; }, + [](const AttrValue& v) { return v.DebugString(); }, + [](const string& key, const AttrValue& av, const AttrValue& bv) { + if (key == "shape_inference_graph") { + // Default serialization of GraphDef is unstable because maps don't + // serialize deterministically. Rather than go through the hoops to + // turn on deterministic serialization of this attr just for this + // test, add logic here to compare determinstically. 
+ GraphDef ga; + if (!ga.ParseFromString(av.s())) { + return false; + } + GraphDef gb; + if (!gb.ParseFromString(bv.s())) { + return false; + } + return EqualGraphDef(ga, gb, nullptr); + } else { + return av.DebugString() == bv.DebugString(); + } + }, + strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()), + diff); +} + +bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b, + string* diff) { + if (a.signature().DebugString() != b.signature().DebugString()) { + if (diff) { + *diff = strings::StrCat("Signature mismatch for function ", + a.signature().name(), ", expected:\n", + a.signature().DebugString(), "\ngot:\n", + b.signature().DebugString()); + } + return false; + } + if (!EqualProtoMap( + a.attr(), b.attr(), [](const string& s) { return s; }, + [](const AttrValue& v) { return v.DebugString(); }, + [](const string& key, const AttrValue& av, const AttrValue& bv) { + return av.DebugString() == bv.DebugString(); + }, + strings::StrCat("attr mismatch for function ", a.signature().name()), + diff)) { + return false; + } + if (!EqualProtoMap( + a.ret(), b.ret(), [](const string& s) { return s; }, + [](const string& s) { return s; }, + [](const string& key, const string& av, const string& bv) { + return av == bv; + }, + strings::StrCat("ret mismatch for function ", a.signature().name()), + diff)) { + return false; + } + for (int i = 0; i < a.node_def_size(); ++i) { + bool found = false; + for (int j = 0; j < b.node_def_size(); ++j) { + if (a.node_def(i).name() == b.node_def(j).name()) { + if (!EqualFunctionNodeDef( + a.node_def(i), b.node_def(j), + strings::StrCat("Function ", a.signature().name()), diff)) { + return false; + } + found = true; + break; + } + } + if (!found) { + if (diff) { + *diff = strings::StrCat("Function ", a.signature().name(), + ", expected: has node '", a.node_def(i).name(), + "' got: no node of that name"); + } + return false; + } + } + for (int i = 0; i < b.node_def_size(); ++i) { + bool found = false; + for (int j = 0; j < a.node_def_size(); ++j) { + if (b.node_def(i).name() == a.node_def(j).name()) { + found = true; + break; + } + } + if (!found) { + if (diff) { + *diff = strings::StrCat("Function ", a.signature().name(), + ", got: has node '", b.node_def(i).name(), + "' expected: no node of that name"); + } + return false; + } + } return true; } @@ -84,29 +248,64 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected, // TODO(misard): remove these fake registrations once there are real Ops to be // compiled. 
-REGISTER_OP("_XlaSendToHost") - .Input("input: dtypes") - .Attr("dtypes: list(type) >= 0"); - -REGISTER_OP("_XlaRecvFromHost") - .Output("output: dtypes") - .Attr("dtypes: list(type) >= 0"); +REGISTER_OP("_XlaHostCompute") + .Input("inputs: Tinputs") + .Output("outputs: Toutputs") + .Attr("Tinputs: list(type) >= 0") + .Attr("Toutputs: list(type) >= 0") + .Attr("key: string") + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); REGISTER_OP("_XlaSendFromHost") - .Input("input: dtypes") - .Attr("dtypes: list(type) >= 0"); + .Input("input: Tinputs") + .Attr("Tinputs: list(type) >= 0") + .Attr("key: string") + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); REGISTER_OP("_XlaRecvAtHost") - .Output("output: dtypes") - .Attr("dtypes: list(type) >= 0"); + .Output("output: Toutputs") + .Attr("Toutputs: list(type) >= 0") + .Attr("key: string") + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); -REGISTER_OP("InputTest").Output("o: float"); +REGISTER_OP("InputTest") + .Output("o: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->UnknownShape()); + return Status::OK(); + }); -REGISTER_OP("UnaryTest").Input("a: float").Output("o: float"); +REGISTER_OP("InputTestShaped") + .Output("o: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->Vector(2)); + return Status::OK(); + }); + +REGISTER_OP("UnaryTest") + .Input("a: float") + .Output("o: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + ::tensorflow::shape_inference::ShapeHandle o; + TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o)); + c->set_output(0, o); + return Status::OK(); + }); REGISTER_OP("BinaryTest") .Input("a: float") .Input("b: float") - .Output("o: float"); + .Output("o: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + ::tensorflow::shape_inference::ShapeHandle o; + TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o)); + c->set_output(0, o); + return Status::OK(); + }); +REGISTER_OP("BinaryTest2") + .Input("a: float") + .Input("b: float") + .Output("o: float") + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); REGISTER_OP("AddNLikeTest") .Input("inputs: N * T") @@ -124,22 +323,48 @@ Node* Input(const GraphDefBuilder::Options& opts) { return ops::SourceOp("InputTest", opts); } -Node* RecvAtHost(const gtl::ArraySlice& dtypes, +Node* InputShaped(const GraphDefBuilder::Options& opts) { + return ops::SourceOp("InputTestShaped", opts); +} + +Node* KnownShape(const gtl::ArraySlice& shape, + const GraphDefBuilder::Options& opts) { + if (opts.HaveError()) return nullptr; + NodeBuilder node_builder(opts.GetNameForOp("Const"), "Const", + opts.op_registry()); + TensorProto value; + value.set_dtype(DT_FLOAT); + for (int dim : shape) { + value.mutable_tensor_shape()->add_dim()->set_size(dim); + } + return opts.WithAttr("value", value) + .WithAttr("dtype", DT_FLOAT) + .FinalizeBuilder(&node_builder); +} + +Node* RecvAtHost(const string& key, const gtl::ArraySlice& dtypes, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"), "_XlaRecvAtHost", opts.op_registry()); - return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder); + return opts.WithAttr("Toutputs", dtypes) + .WithAttr("key", key) + .FinalizeBuilder(&node_builder); } -Node* SendFromHost(const std::vector& inputs, - const gtl::ArraySlice& dtypes, +Node* SendFromHost(const string& 
key, const std::vector& inputs, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"), "_XlaSendFromHost", opts.op_registry()); node_builder.Input(inputs); - return opts.WithAttr("dtypes", dtypes).FinalizeBuilder(&node_builder); + std::vector dtypes; + for (const auto& node : inputs) { + dtypes.push_back(node.dt); + } + return opts.WithAttr("key", key) + .WithAttr("Tinputs", dtypes) + .FinalizeBuilder(&node_builder); } Node* Unary(ops::NodeOut a, const GraphDefBuilder::Options& opts) { @@ -151,6 +376,11 @@ Node* Binary(ops::NodeOut a, ops::NodeOut b, return ops::BinaryOp("BinaryTest", std::move(a), std::move(b), opts); } +Node* BinaryUnknownShape(ops::NodeOut a, ops::NodeOut b, + const GraphDefBuilder::Options& opts) { + return ops::BinaryOp("BinaryTest2", std::move(a), std::move(b), opts); +} + Node* AddNLike(const std::vector& inputs, const GraphDefBuilder::Options& opts) { if (opts.HaveError()) return nullptr; @@ -576,6 +806,21 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + string shape_string_expected; + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, + shape.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape.opts().WithName("E")); + SendFromHost("host_compute_channel_F1_O1", {e}, + shape.opts().WithName("outside_compilation_F1_O1_send")); + GraphDef shape_graph; + TF_EXPECT_OK(shape.ToGraphDef(&shape_graph)); + EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected)); + } + *library_expected.add_function() = test::function::XTimesTwo(); *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, @@ -584,19 +829,18 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}}, {{"F"}, "BinaryTest", - {"C:o:0", "outside_compilation_O1_recv:output:0"}, + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, {}, - {"outside_compilation_O1_recv"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"C:o:0", "c:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", shape_string_expected}, + {"shapes", gtl::ArraySlice({})}}, {"c"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, }, {{"f_0_retval", "F:o:0"}}); @@ -612,11 +856,11 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { Node* call = b2.opts().FinalizeBuilder(&node_builder); Node* recv = - RecvAtHost({DT_FLOAT, DT_FLOAT}, + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), b2.opts().WithName("E").WithControlInputs({recv, b})); - Node* send = SendFromHost({e}, {DT_FLOAT}, + Node* send = SendFromHost("host_compute_channel_F1_O1", {e}, b2.opts() .WithName("outside_compilation_F1_O1_send") .WithControlInput(e)); @@ -674,37 +918,71 @@ 
TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + string shape_string_expected_1; + { + GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately); + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, + shape1.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape1.opts().WithName("E")); + SendFromHost("host_compute_channel_F1_O1", {e}, + shape1.opts().WithName("outside_compilation_F1_O1_send")); + GraphDef shape1_graph; + TF_EXPECT_OK(shape1.ToGraphDef(&shape1_graph)); + EXPECT_TRUE(shape1_graph.SerializeToString(&shape_string_expected_1)); + } + + string shape_string_expected_2; + { + GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately); + Node* recv1 = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, + shape2.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), + shape2.opts().WithName("E")); + Node* recv2 = + RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT}, + shape2.opts().WithName("outside_compilation_F1_O2_recv")); + Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H")); + SendFromHost("host_compute_channel_F1_O2", {h}, + shape2.opts().WithName("outside_compilation_F1_O2_send")); + GraphDef shape2_graph; + TF_EXPECT_OK(shape2.ToGraphDef(&shape2_graph)); + EXPECT_TRUE(shape2_graph.SerializeToString(&shape_string_expected_2)); + } + *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {}, { {{"C"}, "UnaryTest", {"a_0_arg"}}, {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}}, - {{"I"}, "UnaryTest", {"outside_compilation_O2_recv:output:0"}}, + {{"I"}, + "UnaryTest", + {"outside_compilation_O2_host_compute:outputs:0"}}, {{"F"}, "BinaryTest", - {"C:o:0", "outside_compilation_O1_recv:output:0"}, + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, {}, - {"outside_compilation_O1_recv"}}, - {{"outside_compilation_O2_send"}, - "_XlaSendToHost", + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O2_host_compute"}, + "_XlaHostCompute", {"D:o:0", "F:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O2"}, + {"shape_inference_graph", shape_string_expected_2}, + {"shapes", gtl::ArraySlice({})}}, {"F"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"C:o:0", "D:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", shape_string_expected_1}, + {"shapes", gtl::ArraySlice({})}}, {"D"}}, - {{"outside_compilation_O2_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O2_send"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, }, {{"i_0_retval", "I:o:0"}}); @@ -720,23 +998,24 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { Node* call = b2.opts().FinalizeBuilder(&node_builder); Node* recv1 = - RecvAtHost({DT_FLOAT, DT_FLOAT}, + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, 
DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), b2.opts().WithName("E").WithControlInputs({recv1, b})); - Node* send1 = SendFromHost({e}, {DT_FLOAT}, + Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e}, b2.opts() .WithName("outside_compilation_F1_O1_send") .WithControlInput(e)); Node* recv2 = - RecvAtHost({DT_FLOAT, DT_FLOAT}, + RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O2_recv")); Node* g = Binary(e, ops::NodeOut(recv2, 1), b2.opts().WithName("G").WithControlInputs({recv2, e})); Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H")); - Node* send2 = SendFromHost( - {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O2_send")); + Node* send2 = + SendFromHost("host_compute_channel_F1_O2", {h}, + b2.opts().WithName("outside_compilation_F1_O2_send")); Node* s = NoOp(b2.opts() .WithName("F1_sequencer") @@ -758,8 +1037,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { { GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); - Node* a = Input(b1.opts().WithName("A")); - Node* b = Input(b1.opts().WithName("B")); + Node* a = InputShaped(b1.opts().WithName("A")); + Node* b = InputShaped(b1.opts().WithName("B")); Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); Node* d = Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); @@ -791,6 +1070,24 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + string shape_string_expected; + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, + shape.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape.opts().WithName("E")); + SendFromHost("host_compute_channel_F1_O1", {e}, + shape.opts().WithName("outside_compilation_F1_O1_send")); + GraphDef shape_graph; + TF_EXPECT_OK(shape.ToGraphDef(&shape_graph)); + EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected)); + } + + TensorShapeProto shape_proto_expected; + shape_proto_expected.add_dim()->set_size(2); + *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float", "d_0_retval:float"}, {}, @@ -799,19 +1096,18 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, {{"F"}, "BinaryTest", - {"C:o:0", "outside_compilation_O1_recv:output:0"}, + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, {}, - {"outside_compilation_O1_recv"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"C:o:0", "D:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", shape_string_expected}, + {"shapes", gtl::ArraySlice({})}}, {"D"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, }, {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}}); @@ -822,16 +1118,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}}, 
{{"I"}, "BinaryTest", - {"f_0_arg", "outside_compilation_O1_recv:output:0"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"G:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F2_O1"}, + {"shape_inference_graph", ""}, + {"shapes", + gtl::ArraySlice({shape_proto_expected})}}}, }, {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}}); @@ -839,15 +1135,15 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { std::unique_ptr lib_def( new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); - Node* a = Input(b2.opts().WithName("A")); - Node* b = Input(b2.opts().WithName("B")); + Node* a = InputShaped(b2.opts().WithName("A")); + Node* b = InputShaped(b2.opts().WithName("B")); Node* recv1 = - RecvAtHost({DT_FLOAT, DT_FLOAT}, + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), b2.opts().WithName("E").WithControlInputs({recv1, b})); - Node* send1 = SendFromHost({e}, {DT_FLOAT}, + Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e}, b2.opts() .WithName("outside_compilation_F1_O1_send") .WithControlInput(e)); @@ -857,12 +1153,14 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { Node* s1 = NoOp( b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1})); - Node* recv2 = RecvAtHost( - {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_recv")); + Node* recv2 = + RecvAtHost("host_compute_channel_F2_O1", {DT_FLOAT}, + b2.opts().WithName("outside_compilation_F2_O1_recv")); Node* h = Binary(ops::NodeOut(call1, 1), recv2, b2.opts().WithName("H").WithControlInput(s1)); - Node* send2 = SendFromHost( - {h}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_send")); + Node* send2 = + SendFromHost("host_compute_channel_F2_O1", {h}, + b2.opts().WithName("outside_compilation_F2_O1_send")); NodeBuilder node_builder2("F2", "F2", lib_def.get()); node_builder2.Input(e).Input(call1); @@ -888,7 +1186,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { { GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); - Node* a = Input(b1.opts().WithName("A")); + Node* a = InputShaped(b1.opts().WithName("A")); Node* b = Input(b1.opts().WithName("B")); Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); Node* d = @@ -908,6 +1206,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + TensorShapeProto shape_proto_expected; + shape_proto_expected.add_dim()->set_size(2); + *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, { @@ -915,11 +1216,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, {{"F"}, "BinaryTest", - {"D:o:0", "outside_compilation_O1_recv:output:0"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", + {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}}, + 
{{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}}, + {{"Tinputs", gtl::ArraySlice({})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", ""}, + {"shapes", + gtl::ArraySlice({shape_proto_expected})}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -927,12 +1233,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { std::unique_ptr lib_def( new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); - Node* a = Input(b2.opts().WithName("A")); + Node* a = InputShaped(b2.opts().WithName("A")); Node* b = Input(b2.opts().WithName("B")); Node* e = Unary(a, b2.opts().WithName("E")); - Node* send1 = SendFromHost( - {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send")); + Node* send1 = + SendFromHost("host_compute_channel_F1_O1", {e}, + b2.opts().WithName("outside_compilation_F1_O1_send")); NodeBuilder node_builder1("F1", "F1", lib_def.get()); node_builder1.Input(a).Input(b); Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); @@ -954,7 +1261,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { { GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); - Node* a = Input(b1.opts().WithName("A")); + Node* a = InputShaped(b1.opts().WithName("A")); Node* b = Input(b1.opts().WithName("B")); Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); Node* d = @@ -975,6 +1282,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { FunctionDefLibrary library_expected; GraphDef graphdef_expected; + TensorShapeProto shape_proto_expected; + shape_proto_expected.add_dim()->set_size(2); + *library_expected.add_function() = FunctionDefHelper::Create( "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, { @@ -982,17 +1292,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, {{"F"}, "BinaryTest", - {"D:o:0", "outside_compilation_O1_recv:output:0"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {}, - {{"dtypes", gtl::ArraySlice({})}}, + {{"Tinputs", gtl::ArraySlice({})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", ""}, + {"shapes", + gtl::ArraySlice({shape_proto_expected})}}, {"D"}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", - {}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}, - {"outside_compilation_O1_send"}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1000,14 +1310,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { std::unique_ptr lib_def( new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); - Node* a = Input(b2.opts().WithName("A")); + Node* a = InputShaped(b2.opts().WithName("A")); Node* b = Input(b2.opts().WithName("B")); Node* recv1 = - RecvAtHost({}, b2.opts().WithName("outside_compilation_F1_O1_recv")); + RecvAtHost("host_compute_channel_F1_O1", {}, + b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1)); - Node* send1 = SendFromHost( - {e}, {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_send")); + Node* send1 = + SendFromHost("host_compute_channel_F1_O1", {e}, + 
b2.opts().WithName("outside_compilation_F1_O1_send")); NodeBuilder node_builder1("F1", "F1", lib_def.get()); node_builder1.Input(a).Input(b); Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); @@ -1055,10 +1367,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {{"C"}, "UnaryTest", {"a_0_arg"}}, {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, {{"F"}, "UnaryTest", {"D:o:0"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", {"D:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1069,8 +1385,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { Node* a = Input(b2.opts().WithName("A")); Node* b = Input(b2.opts().WithName("B")); - Node* recv1 = RecvAtHost( - {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); + Node* recv1 = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT}, + b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Unary(recv1, b2.opts().WithName("E")); NodeBuilder node_builder1("F1", "F1", lib_def.get()); node_builder1.Input(a).Input(b); @@ -1118,16 +1435,19 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { { {{"C"}, "UnaryTest", {"a_0_arg"}}, {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, - {{"F"}, "UnaryTest", {"D:o:0"}, {}, {"outside_compilation_O1_recv"}}, - {{"outside_compilation_O1_send"}, - "_XlaSendToHost", + {{"F"}, + "UnaryTest", {"D:o:0"}, - {{"dtypes", gtl::ArraySlice({DT_FLOAT})}}}, - {{"outside_compilation_O1_recv"}, - "_XlaRecvFromHost", {}, - {{"dtypes", gtl::ArraySlice({})}}, - {"outside_compilation_O1_send"}}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}}}, }, {{"f_0_retval", "F:o:0"}}); @@ -1138,10 +1458,11 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { Node* a = Input(b2.opts().WithName("A")); Node* b = Input(b2.opts().WithName("B")); - Node* recv1 = RecvAtHost( - {DT_FLOAT}, b2.opts().WithName("outside_compilation_F1_O1_recv")); + Node* recv1 = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT}, + b2.opts().WithName("outside_compilation_F1_O1_recv")); Node* e = Unary(recv1, b2.opts().WithName("E")); - Node* send1 = SendFromHost({}, {}, + Node* send1 = SendFromHost("host_compute_channel_F1_O1", {}, b2.opts() .WithName("outside_compilation_F1_O1_send") .WithControlInput(e)); @@ -1215,5 +1536,110 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); } +// Test for shape inference of outside compilation. +TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + *library.add_function() = test::function::XTimesTwo(); + + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = InputShaped(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + // Give nodes 'c' and 'd' names that collide after lowercasing. 
+ Node* c = Unary(a, b1.opts().WithName("C")); + Node* d = Unary(b, b1.opts().WithName("c").WithControlInput(c).WithAttr( + "_encapsulate", "F1")); + Node* e = BinaryUnknownShape(c, d, + b1.opts() + .WithName("E") + .WithControlInputs({b, d}) + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Binary(c, e, + b1.opts().WithName("F").WithControlInput(e).WithAttr( + "_encapsulate", "F1")); + Binary(a, f, b1.opts().WithName("G").WithControlInput(e)); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + string shape_string_expected; + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_0")); + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT}, + shape.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E")); + SendFromHost("host_compute_channel_F1_O1", {e}, + shape.opts().WithName("outside_compilation_F1_O1_send")); + GraphDef shape_graph; + TF_EXPECT_OK(shape.ToGraphDef(&shape_graph)); + EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected)); + } + + *library_expected.add_function() = test::function::XTimesTwo(); + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval:float"}, {}, + { + {{"c"}, "UnaryTest", {"b_0_arg"}, {}, {}}, + {{"F"}, + "BinaryTest", + {"c_0_arg", "outside_compilation_O1_host_compute:outputs:0"}, + {}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "_XlaHostCompute", + {"c:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", shape_string_expected}, + {"shapes", gtl::ArraySlice({})}}, + {"c"}}, + }, + {{"f_0_retval", "F:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = InputShaped(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + Node* c = Unary(a, b2.opts().WithName("C")); + + NodeBuilder node_builder("F1", "F1", lib_def.get()); + node_builder.Input(b).Input(c); + Node* call = + b2.opts().WithControlInputs({c}).FinalizeBuilder(&node_builder); + + Node* recv = + RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT}, + b2.opts().WithName("outside_compilation_F1_O1_recv")); + Node* e = BinaryUnknownShape( + c, ops::NodeOut(recv, 0), + b2.opts().WithName("E").WithControlInputs({recv, b})); + Node* send = SendFromHost("host_compute_channel_F1_O1", {e}, + b2.opts() + .WithName("outside_compilation_F1_O1_send") + .WithControlInput(e)); + + Node* s = NoOp( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send})); + + Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e})); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index 4842877d9af..6353149e4af 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -45,7 +45,7 @@ 
namespace tensorflow { // see comment on `AllowsAsynchronousDeallocation()`. class XlaAllocator : public xla::DeviceMemoryAllocator { public: - XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context); + XlaAllocator(const gpu::Platform* platform, OpKernelContext* op_context); ~XlaAllocator() override; xla::StatusOr Allocate(int device_ordinal, uint64 size, bool retry_on_failure) override; @@ -79,7 +79,8 @@ class XlaAllocator : public xla::DeviceMemoryAllocator { std::unordered_map tensors_; }; -XlaAllocator::XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context) +XlaAllocator::XlaAllocator(const gpu::Platform* platform, + OpKernelContext* op_context) : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {} XlaAllocator::~XlaAllocator() = default; @@ -248,12 +249,16 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { xla::LocalClient* client = static_cast(cache->client()); + // Builds an XLA allocator for the device. + XlaAllocator xla_allocator(client->platform(), ctx); + XlaCompiler::Options options; options.client = client; options.device_type = &cache->device_type(); options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); options.graph_def_version = ctx->function_library()->graph_def_version(); options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId); + options.device_allocator = &xla_allocator; const XlaCompiler::CompilationResult* kernel; xla::LocalExecutable* executable; @@ -264,9 +269,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { VLOG(1) << "Executing XLA Computation..."; - // Builds an XLA allocator for the device. - XlaAllocator xla_allocator(client->platform(), ctx); - std::unique_ptr output; // Build xla::ShapedBuffers that point directly to the Tensor buffers. std::vector> arg_buffers; @@ -374,8 +376,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { OP_REQUIRES(ctx, write.input_index >= 0 && write.input_index < ctx->num_inputs(), errors::Internal("Invalid input index for variable write.")); - TensorShape write_shape; - OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(write.shape, &write_shape)); gpu::DeviceMemoryBase buffer = output->buffer({output_num}); @@ -397,7 +397,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { // Looks up the owning Tensor by buffer address. 
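    // (write.shape is consumed directly here because compiler arguments and
    // resource updates now carry a TensorShape rather than an xla::Shape, so
    // the XLAShapeToTensorShape conversion removed above is no longer needed.)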
OP_REQUIRES_OK( - ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write_shape, + ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write.shape, variable->tensor())); ++output_num; } diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index bfff52c55a7..6d854a920eb 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -148,8 +148,7 @@ Status BuildArguments(int num_constant_args, XlaCompiler::Argument& arg = (*args)[input_num]; arg.kind = XlaCompiler::Argument::kConstant; arg.type = input.dtype(); - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape)); + arg.shape = input.shape(); arg.constant_value = input; ++input_num; } @@ -170,8 +169,7 @@ Status BuildArguments(int num_constant_args, arg.constant_value = input; } arg.type = input.dtype(); - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape)); + arg.shape = input.shape(); ++input_num; } @@ -189,8 +187,7 @@ Status BuildArguments(int num_constant_args, if (variable_args[variable_id].present) { const Tensor& value = variable_args[variable_id].value; arg.type = value.dtype(); - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(value.dtype(), value.shape(), &arg.shape)); + arg.shape = value.shape(); arg.initialized = true; } else { // The values of uninitialized variables are not passed as inputs, since @@ -199,7 +196,7 @@ Status BuildArguments(int num_constant_args, // uninitialized variables. arg.initialized = false; arg.type = DT_INVALID; - arg.shape = xla::Shape(); + arg.shape = TensorShape(); } ++input_num; } @@ -223,6 +220,7 @@ Status XlaCompilationCache::BuildExecutable( xla::ExecutableBuildOptions build_options; build_options.set_device_ordinal(client_->default_device_ordinal()); build_options.set_result_layout(result.xla_output_shape); + build_options.set_device_allocator(options.device_allocator); auto compile_result = client_->Compile(*result.computation, argument_layouts, build_options); diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 314f5506b16..b0b038775f7 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -144,6 +144,21 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "matrix_triangular_solve_op_test", + size = "small", + srcs = ["matrix_triangular_solve_op_test.py"], + tags = ["optonly"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "clustering_test", size = "small", @@ -240,6 +255,18 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "extract_image_patches_op_test", + size = "small", + srcs = ["extract_image_patches_op_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "fft_test", size = "medium", @@ -326,6 +353,19 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "matrix_band_part_test", + size = "medium", + srcs = ["matrix_band_part_test.py"], + tags = ["optonly"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "momentum_test", size = "small", @@ 
-437,6 +477,18 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "reverse_sequence_op_test", + size = "small", + srcs = ["reverse_sequence_op_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "rmsprop_test", size = "small", diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index c95fb1c5152..30a6d3a74d6 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -1181,6 +1181,50 @@ class BinaryOpsTest(XLATestCase): np.array([4, 5, 6], dtype=np.int32), expected=None) + def testMatrixSetDiag(self): + for dtype in self.numeric_types: + # Square + self._testBinary( + array_ops.matrix_set_diag, + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]], + dtype=dtype), + np.array([1.0, 2.0, 3.0], dtype=dtype), + expected=np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0], [1.0, 1.0, 3.0]], + dtype=dtype)) + + self._testBinary( + array_ops.matrix_set_diag, + np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]], + [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], [2.0, 0.0, 6.0]]], + dtype=dtype), + np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]], dtype=dtype), + expected=np.array( + [[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0], [1.0, 0.0, -3.0]], + [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], [2.0, 0.0, -6.0]]], + dtype=dtype)) + + # Rectangular + self._testBinary( + array_ops.matrix_set_diag, + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=dtype), + np.array([3.0, 4.0], dtype=dtype), + expected=np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]], dtype=dtype)) + + self._testBinary( + array_ops.matrix_set_diag, + np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]], dtype=dtype), + np.array([3.0, 4.0], dtype=dtype), + expected=np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]], dtype=dtype)) + + self._testBinary( + array_ops.matrix_set_diag, + np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]], + [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]], dtype=dtype), + np.array([[-1.0, -2.0], [-4.0, -5.0]], + dtype=dtype), + expected=np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]], + [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]], + dtype=dtype)) if __name__ == "__main__": googletest.main() diff --git a/tensorflow/compiler/tests/extract_image_patches_op_test.py b/tensorflow/compiler/tests/extract_image_patches_op_test.py new file mode 100644 index 00000000000..0361702e7af --- /dev/null +++ b/tensorflow/compiler/tests/extract_image_patches_op_test.py @@ -0,0 +1,134 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Functional tests for ExtractImagePatches op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class ExtractImagePatches(XLATestCase): + """Functional tests for ExtractImagePatches op.""" + + def _VerifyValues(self, image, ksizes, strides, rates, padding, patches): + """Tests input-output pairs for the ExtractImagePatches op. + + Args: + image: Input tensor with shape: [batch, in_rows, in_cols, depth]. + ksizes: Patch size specified as: [ksize_rows, ksize_cols]. + strides: Output strides, specified as [stride_rows, stride_cols]. + rates: Atrous rates, specified as [rate_rows, rate_cols]. + padding: Padding type. + patches: Expected output. + """ + ksizes = [1] + ksizes + [1] + strides = [1] + strides + [1] + rates = [1] + rates + [1] + + with self.test_session(): + image_placeholder = array_ops.placeholder(dtypes.float32) + with self.test_scope(): + out_tensor = array_ops.extract_image_patches( + image_placeholder, + ksizes=ksizes, + strides=strides, + rates=rates, + padding=padding, + name="im2col") + feed_dict = {image_placeholder: image} + self.assertAllClose(patches, out_tensor.eval(feed_dict=feed_dict)) + + def testKsize1x1Stride1x1Rate1x1(self): + """Verifies that for 1x1 kernel the output equals the input.""" + # [2, 3, 4, 5] + image = np.reshape(range(120), [2, 3, 4, 5]) + # [2, 3, 4, 5] + patches = np.reshape(range(120), [2, 3, 4, 5]) + for padding in ["VALID", "SAME"]: + self._VerifyValues( + image, + ksizes=[1, 1], + strides=[1, 1], + rates=[1, 1], + padding=padding, + patches=patches) + + def testKsize1x1Stride2x3Rate1x1(self): + """Test for 1x1 kernel and strides.""" + # [2, 4, 5, 3] + image = np.reshape(range(120), [2, 4, 5, 3]) + # [2, 2, 2, 3] + patches = image[:, ::2, ::3, :] + for padding in ["VALID", "SAME"]: + self._VerifyValues( + image, + ksizes=[1, 1], + strides=[2, 3], + rates=[1, 1], + padding=padding, + patches=patches) + + def testKsize2x2Stride1x1Rate1x1Valid(self): + """Test for 2x2 kernel with VALID padding.""" + # [1, 2, 2, 1] + image = [[[[1], [2]], [[3], [4]]]] + # [1, 1, 1, 4] + patches = [[[[1, 2, 3, 4]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[1, 1], + padding="VALID", + patches=patches) + + def testKsize2x2Stride1x1Rate1x1Same(self): + """Test for 2x2 kernel with SAME padding.""" + # [1, 2, 2, 1] + image = [[[[1], [2]], [[3], [4]]]] + # [1, 2, 2, 4] + patches = [[[[1, 2, 3, 4], [2, 0, 4, 0]], [[3, 4, 0, 0], [4, 0, 0, 0]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[1, 1], + padding="SAME", + patches=patches) + + def testKsize2x2Stride1x1Rate2x2Valid(self): + """Test for 2x2 kernel with 2x2 dilation.""" + # [1, 2, 2, 1] + image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32) + # [1, 2, 2, 4] + patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]], + [[4, 6, 12, 14], [5, 7, 13, 15]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[2, 2], + padding="VALID", + patches=patches) + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py new file mode 100644 index 00000000000..29394f9ea51 --- 
/dev/null +++ b/tensorflow/compiler/tests/matrix_band_part_test.py @@ -0,0 +1,64 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class MatrixBandPartTest(XLATestCase): + + def _testMatrixBandPart(self, dtype, shape): + with self.test_session(): + batch_shape = shape[:-2] + mat = np.ones(shape).astype(dtype) + batch_mat = np.tile(mat, batch_shape + [1, 1]) + for lower in -1, 0, 1, shape[-2] - 1: + for upper in -1, 0, 1, shape[-1] - 1: + band_np = mat + if lower >= 0: + band_np = np.triu(band_np, -lower) + if upper >= 0: + band_np = np.tril(band_np, upper) + if batch_shape: + band_np = np.tile(band_np, batch_shape + [1, 1]) + + placeholder = array_ops.placeholder(dtype) + with self.test_scope(): + band = array_ops.matrix_band_part( + placeholder, + constant_op.constant(lower, dtype=dtypes.int32), + constant_op.constant(upper, dtype=dtypes.int32)) + feed_dict = {placeholder: batch_mat} + self.assertAllEqual(band_np, band.eval(feed_dict=feed_dict)) + + def testMatrixBandPart(self): + for dtype in self.float_types: + for batch_shape in [[], [2,], [1, 3, 2]]: + for rows in 1, 2, 7: + for cols in 1, 2, 7: + self._testMatrixBandPart(dtype, batch_shape + [rows, cols]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py new file mode 100644 index 00000000000..cccb7f5789d --- /dev/null +++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py @@ -0,0 +1,130 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensorflow.ops.tf.MatrixTriangularSolve.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +def MakePlaceholder(x): + return array_ops.placeholder(dtypes.as_dtype(x.dtype), shape=x.shape) + + +class MatrixTriangularSolveOpTest(XLATestCase): + + def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca, + placeholder_b, a, clean_a, b, verification, + atol): + feed_dict = {placeholder_a: a, placeholder_ca: clean_a, placeholder_b: b} + verification_np = sess.run(verification, feed_dict) + self.assertAllClose(b, verification_np, atol=atol) + + def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol): + clean_a = np.tril(a) if lower else np.triu(a) + with self.test_session() as sess: + placeholder_a = MakePlaceholder(a) + placeholder_ca = MakePlaceholder(clean_a) + placeholder_b = MakePlaceholder(b) + with self.test_scope(): + x = linalg_ops.matrix_triangular_solve( + placeholder_a, placeholder_b, lower=lower, adjoint=adjoint) + verification = math_ops.matmul(placeholder_ca, x, adjoint_a=adjoint) + self._VerifyTriangularSolveBase(sess, placeholder_a, placeholder_ca, + placeholder_b, a, clean_a, b, + verification, atol) + + def _VerifyTriangularSolveCombo(self, a, b, atol=1e-4): + transp = lambda x: np.swapaxes(x, -1, -2) + for lower, adjoint in itertools.product([True, False], repeat=2): + self._VerifyTriangularSolve( + a if lower else transp(a), b, lower, adjoint, atol) + + def testBasic(self): + rng = np.random.RandomState(0) + a = np.tril(rng.randn(5, 5)) + b = rng.randn(5, 7) + for dtype in self.float_types: + self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype)) + + def testBasicNotActuallyTriangular(self): + rng = np.random.RandomState(0) + a = rng.randn(5, 5) # the `a` matrix is not lower-triangular + b = rng.randn(5, 7) + for dtype in self.float_types: + self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype)) + + def testBasicComplexDtypes(self): + rng = np.random.RandomState(0) + a = np.tril(rng.randn(5, 5) + rng.randn(5, 5) * 1j) + b = rng.randn(5, 7) + rng.randn(5, 7) * 1j + for dtype in self.complex_types: + self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype)) + + def testBatch(self): + rng = np.random.RandomState(0) + shapes = [((4, 3, 3), (4, 3, 5)), ((1, 2, 2), (1, 2, 1)), + ((1, 1, 1), (1, 1, 2)), ((2, 3, 4, 4), (2, 3, 4, 1))] + tuples = itertools.product(self.float_types, shapes) + for dtype, (a_shape, b_shape) in tuples: + n = a_shape[-1] + a = np.tril(rng.rand(*a_shape) - 0.5) / (2.0 * n) + np.eye(n) + b = rng.randn(*b_shape) + self._VerifyTriangularSolveCombo( + a.astype(dtype), b.astype(dtype), atol=1e-3) + + def testLarge(self): + n = 1024 + rng = np.random.RandomState(0) + a = np.tril(rng.rand(n, n) - 0.5) / (2.0 * n) + np.eye(n) + b = rng.randn(n, n) + self._VerifyTriangularSolve( + a.astype(np.float32), b.astype(np.float32), True, False, 1e-4) + + def testNonSquareCoefficientMatrix(self): + rng = np.random.RandomState(0) + for dtype in self.float_types: + a = rng.randn(3, 
4).astype(dtype) + b = rng.randn(4, 4).astype(dtype) + with self.assertRaises(ValueError): + linalg_ops.matrix_triangular_solve(a, b) + with self.assertRaises(ValueError): + linalg_ops.matrix_triangular_solve(a, b) + + def testWrongDimensions(self): + randn = np.random.RandomState(0).randn + for dtype in self.float_types: + lhs = constant_op.constant(randn(3, 3), dtype=dtype) + rhs = constant_op.constant(randn(4, 3), dtype=dtype) + with self.assertRaises(ValueError): + linalg_ops.matrix_triangular_solve(lhs, rhs) + with self.assertRaises(ValueError): + linalg_ops.matrix_triangular_solve(lhs, rhs) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/reverse_sequence_op_test.py b/tensorflow/compiler/tests/reverse_sequence_op_test.py new file mode 100644 index 00000000000..1a5d05094e5 --- /dev/null +++ b/tensorflow/compiler/tests/reverse_sequence_op_test.py @@ -0,0 +1,93 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.reverse_sequence_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class ReverseSequenceTest(XLATestCase): + + def _testReverseSequence(self, + x, + batch_axis, + seq_axis, + seq_lengths, + truth, + expected_err_re=None): + with self.test_session(): + p = array_ops.placeholder(dtypes.as_dtype(x.dtype)) + lengths = array_ops.placeholder(dtypes.as_dtype(seq_lengths.dtype)) + with self.test_scope(): + ans = array_ops.reverse_sequence( + p, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=lengths) + if expected_err_re is None: + tf_ans = ans.eval(feed_dict={p: x, lengths: seq_lengths}) + self.assertAllClose(tf_ans, truth, atol=1e-10) + else: + with self.assertRaisesOpError(expected_err_re): + ans.eval(feed_dict={p: x, lengths: seq_lengths}) + + def testSimple(self): + x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32) + expected = np.array([[1, 2, 3], [6, 5, 4], [8, 7, 9]], dtype=np.int32) + self._testReverseSequence( + x, + batch_axis=0, + seq_axis=1, + seq_lengths=np.array([1, 3, 2], np.int32), + truth=expected) + + def _testBasic(self, dtype, len_dtype): + x = np.asarray( + [[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], [13, 14, 15, 16]], + [[17, 18, 19, 20], [21, 22, 23, 24]]], + dtype=dtype) + x = x.reshape(3, 2, 4, 1, 1) + x = x.transpose([2, 1, 0, 3, 4]) # permute axes 0 <=> 2 + + # reverse dim 2 up to (0:3, none, 0:4) along dim=0 + seq_lengths = np.asarray([3, 0, 4], dtype=len_dtype) + + truth_orig = np.asarray( + [ + [[3, 2, 1, 4], [7, 6, 5, 8]], # reverse 0:3 + [[9, 10, 11, 12], [13, 14, 15, 16]], # reverse none + [[20, 19, 18, 17], [24, 23, 22, 21]] + ], # 
reverse 0:4 (all) + dtype=dtype) + truth_orig = truth_orig.reshape(3, 2, 4, 1, 1) + truth = truth_orig.transpose([2, 1, 0, 3, 4]) # permute axes 0 <=> 2 + + seq_axis = 0 # permute seq_axis and batch_axis (originally 2 and 0, resp.) + batch_axis = 2 + self._testReverseSequence(x, batch_axis, seq_axis, seq_lengths, truth) + + def testSeqLength(self): + for dtype in self.all_types: + for seq_dtype in self.int_types: + self._testBasic(dtype, seq_dtype) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 8e4b8a38336..3d3e112f482 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -154,6 +154,21 @@ class UnaryOpsTest(XLATestCase): def testFloatOps(self): for dtype in self.float_types: + x = np.arange(-0.90, 0.90, 0.25) + self._assertOpOutputMatchesExpected( + math_ops.acos, + x.astype(dtype), + expected=np.arccos(x).astype(dtype)) + self._assertOpOutputMatchesExpected( + math_ops.asin, + x.astype(dtype), + expected=np.arcsin(x).astype(dtype)) + x = np.arange(-3, 3).reshape(1, 3, 2) + self._assertOpOutputMatchesExpected( + math_ops.atan, + x.astype(dtype), + expected=np.arctan(x).astype(dtype)) + self._assertOpOutputMatchesExpected( math_ops.acosh, np.array([1, 2, 3, 4], dtype=dtype), diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 1d9e0fb33ee..bf304102ede 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -427,16 +427,36 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame, // identity nodes are values used by the loop body or condition. // The Identity node may have the wrong device so copy the device from // one of its outputs instead. + std::deque possible_exit; for (const Edge* edge : arg.switch_node->out_edges()) { - if (edge->src_output() == 0 && IsExit(edge->dst())) { + if (edge->src_output() == 0) { + possible_exit.push_back(edge); + } + if (IsIdentity(edge->dst())) { + TF_RETURN_IF_ERROR( + SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true)); + } + } + // TODO(b/67425339): Allow general graph between switch and exit. 
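A rough Python sketch of the worklist traversal added below (the helpers `is_exit`, `is_identity`, and `successors` are illustrative stand-ins, not TensorFlow APIs):

from collections import deque

def find_frame_exit(switch_output_edges, is_exit, is_identity, successors):
  """Follows chains of Identity nodes from a Switch until the Exit is found."""
  exit_node = None
  work = deque(switch_output_edges)
  while work:
    dst = work.popleft()
    if is_exit(dst):
      if exit_node is not None:
        raise ValueError("Duplicate Exit successors")
      exit_node = dst
    elif is_identity(dst):
      work.extend(successors(dst))  # keep walking through Identity nodes
    else:
      raise NotImplementedError("General graph between Switch and Exit")
  return exit_node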
+ while (!possible_exit.empty()) { + const Edge* edge = possible_exit.front(); + possible_exit.pop_front(); + if (IsExit(edge->dst())) { if (arg.exit != nullptr) { return errors::InvalidArgument("Duplicate Exit successors to ", arg.switch_node->name()); } arg.exit = edge->dst(); - } else if (StringPiece(edge->dst()->type_string()) == "Identity") { - TF_RETURN_IF_ERROR( - SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true)); + } else { + if (!IsIdentity(edge->dst())) { + return errors::Unimplemented("General graph between switch (", + arg.switch_node->name(), + ") and exit node of frame ", + frame->name, " not supported yet."); + } + for (const Edge* out : edge->dst()->out_edges()) { + possible_exit.push_back(out); + } } } } diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md index 82b3b46a2f1..91351421bca 100644 --- a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md +++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md @@ -6,6 +6,9 @@ Operator | Type Constraint `Acosh` | `T={complex64,double,float}` `Add` | `T={complex64,double,float,int32,int64}` `AddN` | `T={complex64,double,float,int32,int64,uint32,uint64}` +`AdjustContrastv2` | +`AdjustHue` | +`AdjustSaturation` | `All` | `Tidx={int32,int64}` `Angle` | `Tout={double,float}`
`T={complex64}` `Any` | `Tidx={int32,int64}` @@ -34,7 +37,7 @@ Operator | Type Constraint `BroadcastGradientArgs` | `T={int32,int64}` `Cast` | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`
`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}` `Ceil` | `T={double,float}` -`Cholesky` | `T={complex64,double,float}` +`Cholesky` | `T={double,float}` `Complex` | `Tout={complex64}`
`T={double,float}` `ComplexAbs` | `Tout={double,float}`
`T={complex64}` `Concat` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` @@ -68,7 +71,11 @@ Operator | Type Constraint `Exp` | `T={complex64,double,float}` `ExpandDims` | `Tdim={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Expm1` | `T={complex64,double,float}` -`Fill` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`ExtractImagePatches` | `T={double,float,int32,int64,uint32,uint64}` +`FFT` | +`FFT2D` | +`FFT3D` | +`Fill` | `index_type={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Floor` | `T={double,float}` `FloorDiv` | `T={complex64,double,float,int32,int64}` `FloorMod` | `T={double,float,int32,int64}` @@ -80,6 +87,13 @@ Operator | Type Constraint `GatherV2` | `Taxis={int32,int64}`
`Tindices={int32,int64}`
`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}` `Greater` | `T={double,float,int32,int64,uint32,uint64}` `GreaterEqual` | `T={double,float,int32,int64,uint32,uint64}` +`HSVToRGB` | `T={double,float}` +`IFFT` | +`IFFT2D` | +`IFFT3D` | +`IRFFT` | +`IRFFT2D` | +`IRFFT3D` | `Identity` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `IdentityN` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Imag` | `Tout={double,float}`
`T={complex64}` @@ -105,11 +119,14 @@ Operator | Type Constraint `MatMul` | `T={complex64,double,float}` `MatrixDiag` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `MatrixDiagPart` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`MatrixTriangularSolve` | `T={complex64,double,float}` `Max` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `MaxPool` | `T={double,float,int32,int64}` `MaxPool3D` | `T={float}` `MaxPool3DGrad` | `TInput={float}`
`T={float}` `MaxPoolGrad` | `T={double,float,int32,int64,uint32,uint64}` +`MaxPoolGradV2` | `T={double,float,int32,int64,uint32,uint64}` +`MaxPoolV2` | `T={double,float,int32,int64}` `Maximum` | `T={double,float,int32,int64}` `Mean` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `Min` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` @@ -131,6 +148,10 @@ Operator | Type Constraint `PreventGradient` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Prod` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `QuantizeAndDequantizeV2` | `T={double,float}` +`RFFT` | +`RFFT2D` | +`RFFT3D` | +`RGBToHSV` | `T={double,float}` `RandomStandardNormal` | `dtype={float}` `RandomUniform` | `T={int32,int64}`
`dtype={double,float}` `RandomUniformInt` | `T={int32,int64}`
`Tout={int32,int64}` @@ -146,6 +167,8 @@ Operator | Type Constraint `Relu6Grad` | `T={double,float,int32,int64,uint32,uint64}` `ReluGrad` | `T={double,float,int32,int64,uint32,uint64}` `Reshape` | `Tshape={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`ResizeBilinear` | `T={double,float,int32,int64}` +`ResizeBilinearGrad` | `T={double,float}` `ResourceApplyAdagrad` | `T={double,float}` `ResourceApplyAdam` | `T={double,float}` `ResourceApplyFtrl` | `T={double,float}` @@ -156,6 +179,7 @@ Operator | Type Constraint `ResourceGather` | `Tindices={int32,int64}`
`dtype={complex64,double,float,int32,int64,uint32,uint64}` `ResourceStridedSliceAssign` | `Index={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Reverse` | `T={bool,complex64,double,float,int32,int64}` +`ReverseSequence` | `Tlen={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `ReverseV2` | `T={bool,complex64,double,float,int32,int64}`
`Tidx={int32,int64}` `RightShift` | `T={int32,int64,uint32,uint64}` `Rint` | `T={double,float}` diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md index d4b7621ad28..b9bdb829d77 100644 --- a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md +++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md @@ -6,6 +6,9 @@ Operator | Type Constraint `Acosh` | `T={complex64,double,float}` `Add` | `T={complex64,double,float,int32,int64}` `AddN` | `T={complex64,double,float,int32,int64,uint32,uint64}` +`AdjustContrastv2` | +`AdjustHue` | +`AdjustSaturation` | `All` | `Tidx={int32,int64}` `Angle` | `Tout={double,float}`
`T={complex64}` `Any` | `Tidx={int32,int64}` @@ -34,7 +37,7 @@ Operator | Type Constraint `BroadcastGradientArgs` | `T={int32,int64}` `Cast` | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`
`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}` `Ceil` | `T={double,float}` -`Cholesky` | `T={complex64,double,float}` +`Cholesky` | `T={double,float}` `Complex` | `Tout={complex64}`
`T={double,float}` `ComplexAbs` | `Tout={double,float}`
`T={complex64}` `Concat` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` @@ -68,7 +71,11 @@ Operator | Type Constraint `Exp` | `T={complex64,double,float}` `ExpandDims` | `Tdim={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Expm1` | `T={complex64,double,float}` -`Fill` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`ExtractImagePatches` | `T={double,float,int32,int64,uint32,uint64}` +`FFT` | +`FFT2D` | +`FFT3D` | +`Fill` | `index_type={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Floor` | `T={double,float}` `FloorDiv` | `T={complex64,double,float,int32,int64}` `FloorMod` | `T={double,float,int32,int64}` @@ -80,6 +87,13 @@ Operator | Type Constraint `GatherV2` | `Taxis={int32,int64}`
`Tindices={int32,int64}`
`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}` `Greater` | `T={double,float,int32,int64,uint32,uint64}` `GreaterEqual` | `T={double,float,int32,int64,uint32,uint64}` +`HSVToRGB` | `T={double,float}` +`IFFT` | +`IFFT2D` | +`IFFT3D` | +`IRFFT` | +`IRFFT2D` | +`IRFFT3D` | `Identity` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `IdentityN` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Imag` | `Tout={double,float}`
`T={complex64}` @@ -105,11 +119,14 @@ Operator | Type Constraint `MatMul` | `T={complex64,double,float}` `MatrixDiag` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `MatrixDiagPart` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`MatrixTriangularSolve` | `T={complex64,double,float}` `Max` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `MaxPool` | `T={double,float,int32,int64}` `MaxPool3D` | `T={float}` `MaxPool3DGrad` | `TInput={float}`
`T={float}` `MaxPoolGrad` | `T={double,float,int32,int64,uint32,uint64}` +`MaxPoolGradV2` | `T={double,float,int32,int64,uint32,uint64}` +`MaxPoolV2` | `T={double,float,int32,int64}` `Maximum` | `T={double,float,int32,int64}` `Mean` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `Min` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` @@ -131,6 +148,10 @@ Operator | Type Constraint `PreventGradient` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Prod` | `Tidx={int32,int64}`
`T={complex64,double,float,int32,int64,uint32,uint64}` `QuantizeAndDequantizeV2` | `T={double,float}` +`RFFT` | +`RFFT2D` | +`RFFT3D` | +`RGBToHSV` | `T={double,float}` `Range` | `Tidx={double,float,int32,int64}` `Rank` | `T={bool,complex64,double,float,int32,int64,uint32,uint64}` `ReadVariableOp` | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}` @@ -143,6 +164,8 @@ Operator | Type Constraint `Relu6Grad` | `T={double,float,int32,int64,uint32,uint64}` `ReluGrad` | `T={double,float,int32,int64,uint32,uint64}` `Reshape` | `Tshape={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` +`ResizeBilinear` | `T={double,float,int32,int64}` +`ResizeBilinearGrad` | `T={double,float}` `ResourceApplyAdagrad` | `T={double,float}` `ResourceApplyAdam` | `T={double,float}` `ResourceApplyFtrl` | `T={double,float}` @@ -153,6 +176,7 @@ Operator | Type Constraint `ResourceGather` | `Tindices={int32,int64}`
`dtype={complex64,double,float,int32,int64,uint32,uint64}` `ResourceStridedSliceAssign` | `Index={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `Reverse` | `T={bool,complex64,double,float,int32,int64}` +`ReverseSequence` | `Tlen={int32,int64}`
`T={bool,complex64,double,float,int32,int64,uint32,uint64}` `ReverseV2` | `T={bool,complex64,double,float,int32,int64}`
`Tidx={int32,int64}` `RightShift` | `T={int32,int64,uint32,uint64}` `Rint` | `T={double,float}` diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index 02215b5112d..1418d95956e 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -60,9 +60,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, for (int i = 0; i < args->size(); ++i) { XlaCompiler::Argument& arg = (*args)[i]; arg.type = ctx->input_type(i); - - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape)); + arg.shape = ctx->InputShape(i); if (arg.type == DT_RESOURCE) { return errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 5e1b01878b7..e9be6f8476d 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -31,6 +31,7 @@ tf_kernel_library( "diag_op.cc", "dynamic_stitch_op.cc", "elu_op.cc", + "extract_image_patches_op.cc", "fft_ops.cc", "fill_op.cc", "function_ops.cc", @@ -43,6 +44,9 @@ tf_kernel_library( "l2loss_op.cc", "lrn_ops.cc", "matmul_op.cc", + "matrix_band_part_op.cc", + "matrix_set_diag_op.cc", + "matrix_triangular_solve_op.cc", "mirror_pad_op.cc", "no_op.cc", "one_hot_op.cc", @@ -58,6 +62,7 @@ tf_kernel_library( "reshape_op.cc", "retval_op.cc", "reverse_op.cc", + "reverse_sequence_op.cc", "scan_ops.cc", "segment_reduction_ops.cc", "select_op.cc", @@ -92,6 +97,7 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/lib:batch_dot", "//tensorflow/compiler/tf2xla/lib:cholesky", + "//tensorflow/compiler/tf2xla/lib:triangular_solve", "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/tf2xla/ops:sendrecv_ops", "//tensorflow/compiler/xla:array4d", diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index a015b8e0e89..b0ba25b9983 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -28,8 +28,9 @@ class BatchMatMulOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - auto result = - BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), adj_x_, adj_y_); + auto result = BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), + /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_, + /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_); OP_REQUIRES_OK(ctx, result.status()); ctx->SetOutput(0, result.ValueOrDie()); } diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc index 87d858f7635..fe6651793dc 100644 --- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc @@ -33,7 +33,7 @@ class CholeskyOp : public XlaOpKernel { } }; -REGISTER_XLA_OP(Name("Cholesky"), CholeskyOp); +REGISTER_XLA_OP(Name("Cholesky").TypeConstraint("T", kFloatTypes), CholeskyOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc new file mode 100644 index 00000000000..b2970eae20a --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -0,0 +1,169 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/util/tensor_format.h" + +namespace tensorflow { + +namespace { + +class ExtractImagePatchesOp : public XlaOpKernel { + public: + explicit ExtractImagePatchesOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("ksizes", &ksizes_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("rates", &dilations_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + const TensorFormat data_format = FORMAT_NHWC; + const int num_dims = ksizes_.size(); + + OP_REQUIRES( + ctx, num_dims >= 3, + errors::InvalidArgument("Kernel size must have at least 3 dimensions")); + const int num_spatial_dims = num_dims - 2; + + OP_REQUIRES(ctx, strides_.size() == num_dims, + errors::InvalidArgument("Sliding window strides field must " + "specify ", + num_dims, " dimensions")); + OP_REQUIRES(ctx, dilations_.size() == num_dims, + errors::InvalidArgument("Dilations field must " + "specify ", + num_dims, " dimensions")); + + int batch_dim = GetTensorBatchDimIndex(num_dims, data_format); + int feature_dim = GetTensorFeatureDimIndex(num_dims, data_format); + OP_REQUIRES( + ctx, ksizes_[batch_dim] == 1 && ksizes_[feature_dim] == 1, + errors::Unimplemented("Current implementation does not yet support " + "kernel sizes > 1 in the batch and depth " + "dimensions.")); + OP_REQUIRES( + ctx, strides_[batch_dim] == 1 && strides_[feature_dim] == 1, + errors::Unimplemented("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES( + ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1, + errors::Unimplemented("Current implementation does not support " + "dilations in the batch and depth dimensions.")); + + for (int i = 0; i < num_spatial_dims; ++i) { + int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i); + OP_REQUIRES( + ctx, ksizes_[input_dim] >= 0, + errors::Unimplemented("Kernel size values must be non-negative; ", i, + "th spatial dimension had dilation ", + dilations_[input_dim])); + OP_REQUIRES(ctx, strides_[input_dim] >= 1, + errors::Unimplemented("Stride values must be positive; ", i, + "th spatial dimension had dilation ", + dilations_[input_dim])); + OP_REQUIRES(ctx, dilations_[input_dim] >= 1, + errors::Unimplemented("Dilation values must be positive; ", i, + "th spatial dimension had dilation ", + dilations_[input_dim])); + } + + xla::PrimitiveType type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(0), &type)); + + const TensorShape input_shape = ctx->InputShape(0); + OP_REQUIRES( + ctx, input_shape.dims() == num_dims, + 
errors::InvalidArgument("input must be ", num_dims, "-dimensional", + input_shape.DebugString())); + const int64 depth = input_shape.dim_size(feature_dim); + + xla::ComputationBuilder* builder = ctx->builder(); + + // The following code is equivalent to: + // eye = np.eye(kH * kW * D).reshape([kH, kW, D, kH * kW * kD]) + int64 kernel_size = 1; + std::vector lhs_shape(num_dims, 1); + for (int i = 0; i < num_spatial_dims; ++i) { + int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i); + lhs_shape[i] = ksizes_[input_dim]; + kernel_size *= ksizes_[input_dim]; + } + lhs_shape[num_spatial_dims] = depth; + lhs_shape[num_spatial_dims + 1] = 1; + + // Builds an identity matrix as a broadcast equality of iotas. + // iota = np.arange(np.prod(ksize), depth) + // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32) + xla::ComputationDataHandle iota; + TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, + kernel_size * depth, &iota)); + + auto lhs = builder->Reshape(iota, lhs_shape); + auto filter = builder->ConvertElementType( + builder->Eq(lhs, iota, {num_spatial_dims + 1}), type); + + xla::ConvolutionDimensionNumbers dims; + std::vector window_strides(num_spatial_dims); + std::vector lhs_dilation(num_spatial_dims, 1); + std::vector rhs_dilation(num_spatial_dims); + std::vector> padding(num_spatial_dims); + + dims.set_input_batch_dimension(batch_dim); + dims.set_output_batch_dimension(batch_dim); + dims.set_input_feature_dimension(feature_dim); + dims.set_output_feature_dimension(feature_dim); + dims.set_kernel_input_feature_dimension(num_spatial_dims); + dims.set_kernel_output_feature_dimension(num_spatial_dims + 1); + + for (int i = 0; i < num_spatial_dims; ++i) { + const int64 dim = GetTensorSpatialDimIndex(num_dims, data_format, i); + dims.add_input_spatial_dimensions(dim); + dims.add_kernel_spatial_dimensions(i); + dims.add_output_spatial_dimensions(dim); + window_strides[i] = strides_.at(dim); + rhs_dilation[i] = dilations_.at(dim); + + int64 unused_output_size; + OP_REQUIRES_OK( + ctx, GetWindowedOutputSizeVerboseV2( + input_shape.dim_size(dim), ksizes_[dim], rhs_dilation[i], + window_strides[i], padding_, &unused_output_size, + &padding[i].first, &padding[i].second)); + } + + xla::ComputationDataHandle conv = + builder->ConvGeneralDilated(ctx->Input(0), filter, window_strides, + padding, lhs_dilation, rhs_dilation, dims); + ctx->SetOutput(0, conv); + } + + protected: + std::vector ksizes_; + std::vector dilations_; + std::vector strides_; + Padding padding_; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(ExtractImagePatchesOp); +}; + +REGISTER_XLA_OP(Name("ExtractImagePatches"), ExtractImagePatchesOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc new file mode 100644 index 00000000000..faa415a97b0 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc @@ -0,0 +1,98 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace { + +class MatrixBandPartOp : public XlaOpKernel { + public: + explicit MatrixBandPartOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + // Preliminary validation of sizes. + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), + errors::InvalidArgument( + "input must be at least 2-dim, received shape: ", + input_shape.DebugString())); + + const TensorShape num_lower_in_shape = context->InputShape(1); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_lower_in_shape), + errors::InvalidArgument("num_lower must be scalar, got shape ", + num_lower_in_shape.DebugString())); + + const TensorShape num_upper_in_shape = context->InputShape(2); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in_shape), + errors::InvalidArgument("num_upper must be scalar, got shape ", + num_upper_in_shape.DebugString())); + + xla::ComputationBuilder* builder = context->builder(); + xla::ComputationDataHandle input = context->Input(0); + xla::ComputationDataHandle num_lower = context->Input(1); + xla::ComputationDataHandle num_upper = context->Input(2); + DataType input_type = context->input_type(0); + DataType index_type = context->input_type(1); + + TensorShape batch_shape = input_shape; + batch_shape.RemoveLastDims(2); + const int64 m = input_shape.dim_size(input_shape.dims() - 2); + const int64 n = input_shape.dim_size(input_shape.dims() - 1); + + // Compute 'offset', which is how many diagonals we are above/below the + // diagonal. + xla::ComputationDataHandle iota_m; + OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, m, &iota_m)); + + xla::ComputationDataHandle iota_n; + OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, n, &iota_n)); + + auto offset = builder->Sub(builder->Broadcast(iota_n, {m}), iota_m, + /*broadcast_dimensions=*/{0}); + + // If num_lower or num_upper are negative, include all lower/upper + // diagonals. 
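In NumPy terms, the band selection this kernel builds is roughly the following sketch (illustrative only; `band_part` is a stand-in name, not the TF kernel):

import numpy as np

def band_part(x, num_lower, num_upper):
  """Keeps entries whose diagonal offset (col - row) is in [-num_lower, num_upper]."""
  m, n = x.shape[-2:]
  num_lower = m if num_lower < 0 else num_lower  # negative bound = keep all
  num_upper = n if num_upper < 0 else num_upper
  offset = np.arange(n) - np.arange(m)[:, None]  # offset[i, j] = j - i
  keep = (-num_lower <= offset) & (offset <= num_upper)
  return np.where(keep, x, np.zeros_like(x))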
+ auto zero_index = XlaHelpers::Zero(builder, index_type); + num_lower = builder->Select( + builder->Lt(num_lower, zero_index), + XlaHelpers::IntegerLiteral(builder, index_type, m), num_lower); + num_upper = builder->Select( + builder->Lt(num_upper, zero_index), + XlaHelpers::IntegerLiteral(builder, index_type, n), num_upper); + + auto indicator = builder->And(builder->Le(builder->Neg(num_lower), offset), + builder->Le(offset, num_upper)); + indicator = builder->Broadcast(indicator, batch_shape.dim_sizes()); + + auto zero_input = XlaHelpers::Zero(builder, input_type); + auto output = builder->Select( + indicator, input, + builder->Broadcast(zero_input, input_shape.dim_sizes())); + + context->SetOutput(0, output); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(MatrixBandPartOp); +}; +REGISTER_XLA_OP(Name("MatrixBandPart"), MatrixBandPartOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc new file mode 100644 index 00000000000..b2940bdcff7 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc @@ -0,0 +1,93 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { + +class MatrixSetDiagOp : public XlaOpKernel { + public: + explicit MatrixSetDiagOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const TensorShape diag_shape = context->InputShape(1); + + const int rank = input_shape.dims(); + + // Preliminary validation of sizes. + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), + errors::InvalidArgument( + "input must be at least 2-dim, received shape: ", + input_shape.DebugString())); + + // Check to make sure the last dimension of diag is equal to the smaller of + // the last two dimensions of input. 
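    // For an input of shape batch_shape + [m, n] the diagonal must have shape
    // batch_shape + [min(m, n)]; e.g. a [2, 3, 4] input takes a [2, 3] diagonal.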
+ const int64 m = input_shape.dim_size(rank - 2); + const int64 n = input_shape.dim_size(rank - 1); + const int64 min_dim = std::min(m, n); + + TensorShape batch_shape = input_shape; + batch_shape.RemoveLastDims(2); + + TensorShape expected_diag_shape = batch_shape; + expected_diag_shape.AddDim(min_dim); + OP_REQUIRES(context, expected_diag_shape == diag_shape, + errors::InvalidArgument( + "must have diagonal.shape == input.shape[:-2] + " + "min(input.shape[-2:]), but received input shape: ", + input_shape.DebugString(), + " and diagonal shape: ", diag_shape.DebugString())); + + xla::ComputationBuilder* builder = context->builder(); + xla::ComputationDataHandle input = context->Input(0); + xla::ComputationDataHandle diag = context->Input(1); + + auto zero = XlaHelpers::Zero(builder, context->input_type(0)); + + // Create an indicator tensor that is true only on the diagonal. + xla::ComputationDataHandle iota_m; + OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, m, &iota_m)); + xla::ComputationDataHandle iota_n; + OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, n, &iota_n)); + auto indicator = builder->Eq(iota_m, + builder->Broadcast(iota_n, {m}), + /*broadcast_dimensions=*/{0}); + indicator = builder->Broadcast(indicator, batch_shape.dim_sizes()); + + // Broadcast diag up to the input shape. Use an implicit broadcast (Add) + // because we need to broadcast on the right. + std::vector diag_broadcast_dims(rank - 1); + std::iota(diag_broadcast_dims.begin(), diag_broadcast_dims.end(), 0); + if (min_dim != m) { + diag_broadcast_dims.back() = rank - 1; + } + diag = builder->Add(diag, builder->Broadcast(zero, input_shape.dim_sizes()), + /*broadcast_dimensions=*/diag_broadcast_dims); + + auto output = builder->Select(indicator, diag, input); + context->SetOutput(0, output); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp); +}; + +REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp); + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc new file mode 100644 index 00000000000..eaed9314646 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" + +namespace tensorflow { +namespace { + +class MatrixTriangularSolveOp : public XlaOpKernel { + public: + explicit MatrixTriangularSolveOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("lower", &lower_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint", &adjoint_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + auto result = TriangularSolve( + ctx->builder(), ctx->Input(0), ctx->Input(1), /*left_side=*/true, + /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_); + if (!result.ok()) { + ctx->SetStatus(result.status()); + return; + } + ctx->SetOutput(0, result.ValueOrDie()); + } + + private: + bool lower_; + bool adjoint_; +}; + +REGISTER_XLA_OP(Name("MatrixTriangularSolve"), MatrixTriangularSolveOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index 0b5a38967ae..d4fb5dd4e06 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -37,21 +37,23 @@ class PoolingOp : public XlaOpKernel { public: PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims) : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { - std::vector ksize_int; - std::vector stride_int; - OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int)); - OP_REQUIRES(ctx, ksize_int.size() == num_dims(), - errors::InvalidArgument("Sliding window ksize field must " - "specify ", - num_dims(), " dimensions")); - OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int)); - OP_REQUIRES(ctx, stride_int.size() == num_dims(), - errors::InvalidArgument("Sliding window stride field must " - "specify ", - num_dims(), " dimensions")); - for (int i = 0; i < num_dims(); ++i) { - ksize_.push_back(ksize_int[i]); - stride_.push_back(stride_int[i]); + if (ctx->num_inputs() == 1) { + std::vector ksize_int; + std::vector stride_int; + OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int)); + OP_REQUIRES(ctx, ksize_int.size() == num_dims(), + errors::InvalidArgument("Sliding window ksize field must " + "specify ", + num_dims(), " dimensions")); + OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int)); + OP_REQUIRES(ctx, stride_int.size() == num_dims(), + errors::InvalidArgument("Sliding window stride field must " + "specify ", + num_dims(), " dimensions")); + for (int i = 0; i < num_dims(); ++i) { + ksize_.push_back(ksize_int[i]); + stride_.push_back(stride_int[i]); + } } Padding padding; OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding)); @@ -77,6 +79,33 @@ class PoolingOp : public XlaOpKernel { xla::ComputationDataHandle input = ctx->Input(0); const TensorShape input_shape = ctx->InputShape(0); + std::vector ksize = ksize_; + std::vector stride = stride_; + if (ctx->num_inputs() != 1) { + const TensorShape ksize_shape = ctx->InputShape(1); + // Validate input sizes. 
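      // (In the MaxPoolV2 form the window dimensions are not attrs; ksize and
      // strides arrive as compile-time constant inputs 1 and 2 and are read here.)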
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape), + errors::InvalidArgument("ksize must be a vector, not shape ", + ksize_shape.DebugString())); + OP_REQUIRES(ctx, ksize_shape.num_elements() == num_dims(), + errors::InvalidArgument("Sliding window ksize field must " + "specify ", + num_dims(), " dimensions")); + ksize.clear(); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize)); + + const TensorShape stride_shape = ctx->InputShape(2); + // Validate input sizes. + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape), + errors::InvalidArgument("stride must be a vector, not shape ", + stride_shape.DebugString())); + OP_REQUIRES(ctx, stride_shape.num_elements() == num_dims(), + errors::InvalidArgument("Sliding window stride field must " + "specify ", + num_dims(), " dimensions")); + stride.clear(); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride)); + } OP_REQUIRES(ctx, input_shape.dims() == num_dims(), errors::InvalidArgument("Input to ", type_string(), " operator must have ", num_dims(), @@ -84,8 +113,8 @@ class PoolingOp : public XlaOpKernel { const DataType type = input_type(0); xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow( - input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize_, - stride_, padding_); + input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize, + stride, padding_); ctx->SetOutput(0, PostProcessOutput(ctx, pooled, type, input_shape)); } @@ -130,6 +159,10 @@ class MaxPool2DOp : public MaxPoolOp { } }; REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp); +REGISTER_XLA_OP(Name("MaxPoolV2") + .CompileTimeConstInput("ksize") + .CompileTimeConstInput("strides"), + MaxPool2DOp); class MaxPool3DOp : public MaxPoolOp { public: @@ -243,22 +276,44 @@ class MaxPoolGradOp : public XlaOpKernel { public: MaxPoolGradOp(OpKernelConstruction* ctx, int num_spatial_dims) : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_)); - OP_REQUIRES(ctx, ksize_.size() == num_dims(), - errors::InvalidArgument("Sliding window ksize field must " - "specify ", - num_dims(), " dimensions")); - OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_)); - OP_REQUIRES(ctx, stride_.size() == num_dims(), - errors::InvalidArgument("Sliding window strides field must " - "specify ", - num_dims(), " dimensions")); + if (ctx->num_inputs() == 3) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_)); + } OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_)); } int num_dims() const { return num_spatial_dims_ + 2; } void Compile(XlaOpKernelContext* ctx) override { + if (ctx->num_inputs() != 3) { + OP_REQUIRES( + ctx, ctx->num_inputs() == 5, + errors::InvalidArgument("Must supply ksize and stride arguments.")); + const TensorShape ksize_shape = ctx->InputShape(3); + // Validate input sizes. + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape), + errors::InvalidArgument("ksize must be a vector, not shape ", + ksize_shape.DebugString())); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(3, &ksize_)); + + const TensorShape stride_shape = ctx->InputShape(4); + // Validate input sizes. 
+ OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape), + errors::InvalidArgument("stride must be a vector, not shape ", + stride_shape.DebugString())); + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(4, &stride_)); + } + + OP_REQUIRES(ctx, ksize_.size() == num_dims(), + errors::InvalidArgument("Sliding window ksize field must " + "specify ", + num_dims(), " dimensions")); + OP_REQUIRES(ctx, stride_.size() == num_dims(), + errors::InvalidArgument("Sliding window strides field must " + "specify ", + num_dims(), " dimensions")); + const TensorShape tensor_in_shape = ctx->InputShape(0); const TensorShape tensor_out_shape = ctx->InputShape(1); const TensorShape out_backprop_shape = ctx->InputShape(2); @@ -315,6 +370,10 @@ class MaxPool2DGradOp : public MaxPoolGradOp { } }; REGISTER_XLA_OP(Name("MaxPoolGrad"), MaxPool2DGradOp); +REGISTER_XLA_OP(Name("MaxPoolGradV2") + .CompileTimeConstInput("ksize") + .CompileTimeConstInput("strides"), + MaxPool2DGradOp); class MaxPool3DGradOp : public MaxPoolGradOp { public: diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc new file mode 100644 index 00000000000..6bc5d3adb09 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc @@ -0,0 +1,182 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace { + +class ReverseSequenceOp : public XlaOpKernel { + public: + explicit ReverseSequenceOp(OpKernelConstruction* context) + : XlaOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("batch_dim", &batch_dim_)); + OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_)); + } + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const TensorShape seq_lens_shape = context->InputShape(1); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens_shape), + errors::InvalidArgument("seq_lens input must be 1-dim, not ", + seq_lens_shape.dims())); + OP_REQUIRES(context, batch_dim_ != seq_dim_, + errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim_)); + OP_REQUIRES( + context, seq_dim_ < input_shape.dims(), + errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + seq_dim_, " vs. ", input_shape.dims(), ")")); + OP_REQUIRES( + context, batch_dim_ < input_shape.dims(), + errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + batch_dim_, " vs. 
", input_shape.dims(), ")")); + OP_REQUIRES( + context, + seq_lens_shape.num_elements() == input_shape.dim_size(batch_dim_), + errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim_, + "), ", "(", seq_lens_shape.num_elements(), + " vs. ", input_shape.dim_size(batch_dim_))); + + xla::ComputationBuilder* builder = context->builder(); + const auto input = context->Input(0); + const auto seq_lens = context->Input(1); + + const int64 batch_size = input_shape.dim_size(batch_dim_); + + const DataType input_type = context->input_type(0); + const DataType seq_lens_type = context->input_type(1); + const int64 max_seq_len = input_shape.dim_size(seq_dim_); + + xla::Shape input_xla_shape; + OP_REQUIRES_OK(context, TensorShapeToXLAShape(input_type, input_shape, + &input_xla_shape)); + xla::Shape seq_lens_xla_shape; + OP_REQUIRES_OK(context, TensorShapeToXLAShape(seq_lens_type, seq_lens_shape, + &seq_lens_xla_shape)); + + const auto tuple_shape = xla::ShapeUtil::MakeTupleShape({ + xla::ShapeUtil::MakeShape(seq_lens_xla_shape.element_type(), {}), + seq_lens_xla_shape, + input_xla_shape, + }); + + // For each entry in the batch, reverse the sequence. + // TODO(b/65689298): generalize the Map() operator to non-scalar cases and + // use it here, instead of a While loop. + + // Condition: lambda (i, _, _): i < batch_size + auto condition_builder = + builder->CreateSubBuilder("reverse_sequence_condition"); + { + auto param = condition_builder->Parameter(0, tuple_shape, "param"); + auto i = condition_builder->GetTupleElement(param, 0); + condition_builder->Lt( + i, XlaHelpers::IntegerLiteral(condition_builder.get(), seq_lens_type, + batch_size)); + } + auto condition = condition_builder->Build(); + OP_REQUIRES_OK(context, condition.status()); + + auto body_builder = builder->CreateSubBuilder("reverse_sequence_body"); + { + auto param = body_builder->Parameter(0, tuple_shape, "param"); + auto i = body_builder->GetTupleElement(param, 0); + auto seq_lens = body_builder->GetTupleElement(param, 1); + auto output = body_builder->GetTupleElement(param, 2); + + // seq_len is the sequence length of the current batch element (rank 1) + auto seq_len = body_builder->DynamicSlice( + seq_lens, body_builder->Reshape(i, {1}), {1}); + + // Indices is the offset of the batch element in the input. + auto indices = body_builder->Broadcast( + XlaHelpers::Zero(body_builder.get(), seq_lens_type), + {input_shape.dims()}); + indices = body_builder->DynamicUpdateSlice( + indices, body_builder->Reshape(i, {1}), + body_builder->Reshape( + XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, + batch_dim_), + {1})); + + // slice_indices is the offset of the start of the reversed sequence in + // the input. + auto slice_indices = body_builder->DynamicUpdateSlice( + indices, + body_builder->Sub(XlaHelpers::IntegerLiteral( + body_builder.get(), seq_lens_type, max_seq_len), + seq_len), + body_builder->Reshape( + XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, + seq_dim_), + {1})); + + // Slice out the reversed sequence. The slice will overflow the end of the + // sequence, and the contents of the overflow are implementation-defined. + // However, we will mask off these elements and replace them with elements + // from the original input so their values do not matter. + TensorShape slice_shape = input_shape; + slice_shape.set_dim(batch_dim_, 1); + auto slice = body_builder->DynamicSlice(output, slice_indices, + slice_shape.dim_sizes()); + + // Shift the reversed sequence to the left. 
+ output = body_builder->DynamicUpdateSlice(output, slice, indices); + + body_builder->Tuple( + {body_builder->Add( + i, XlaHelpers::One(body_builder.get(), seq_lens_type)), + seq_lens, output}); + } + auto body = body_builder->Build(); + OP_REQUIRES_OK(context, body.status()); + + auto loop_output = builder->While( + condition.ValueOrDie(), body.ValueOrDie(), + builder->Tuple({XlaHelpers::Zero(builder, seq_lens_type), seq_lens, + builder->Rev(input, {seq_dim_})})); + auto output = builder->GetTupleElement(loop_output, 2); + + // Mask out elements after the sequence length. + xla::ComputationDataHandle iota; + OP_REQUIRES_OK( + context, XlaHelpers::Iota(builder, seq_lens_type, max_seq_len, &iota)); + std::vector dims(input_shape.dims(), 1); + dims[batch_dim_] = batch_size; + auto mask = builder->Lt(iota, builder->Reshape(seq_lens, dims), {seq_dim_}); + + // Broadcast the mask up to the input shape. + mask = + builder->Or(mask, builder->Broadcast(builder->ConstantR0(false), + input_shape.dim_sizes())); + + output = builder->Select(mask, output, input); + context->SetOutput(0, output); + } + + private: + int32 batch_dim_; + int32 seq_dim_; +}; + +REGISTER_XLA_OP(Name("ReverseSequence"), ReverseSequenceOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index d77fb768ef4..1a78c7ab9be 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -77,10 +77,8 @@ Status MaybeInitializeStack(xla::ComputationBuilder* builder, // Stack has not been initialized. xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, resource->type()); - TF_RETURN_IF_ERROR(resource->SetValue( - dtype, - builder->Tuple({builder->Broadcast(zero, stack_shape.dim_sizes()), - builder->ConstantR0(0)}))); + TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape)); + TF_RETURN_IF_ERROR(resource->SetZeroValue(builder)); } else { // Checks the expected shape matches the actual shape. TensorShape actual_shape; @@ -119,8 +117,8 @@ class StackOp : public XlaOpKernel { string name = strings::StrCat("Stack: ", stack_name_); OP_REQUIRES_OK( ctx, xc.CreateResource(XlaResource::kStack, -1, std::move(name), dtype_, - value, &resource)); - resource->set_tensor_array_size(size); + TensorShape(), value, /*tensor_array_size=*/size, + /*tensor_array_gradients=*/{}, &resource)); ctx->SetResourceOutput(0, resource); } @@ -164,11 +162,9 @@ class StackPushOp : public XlaOpKernel { // TODO(phawkins): We don't check the index is in bounds --- there is no // error mechanism in XLA. - OP_REQUIRES_OK( - ctx, - resource->SetValue( - dtype_, b->Tuple({b->DynamicUpdateSlice(ta, update, start_indices), - b->Add(index, b->ConstantR0(1))}))); + OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple( + {b->DynamicUpdateSlice(ta, update, start_indices), + b->Add(index, b->ConstantR0(1))}))); ctx->SetOutput(0, value); } @@ -208,7 +204,7 @@ class StackPopOp : public XlaOpKernel { xla::ComputationDataHandle index = b->GetTupleElement(state, 1); index = b->Sub(index, b->ConstantR0(1)); - OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, b->Tuple({ta, index}))); + OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple({ta, index}))); // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. 
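The ReverseSequence kernel above reverses the whole input along seq_dim, copies each batch element's reversed prefix back into place with dynamic slices, and finally masks out positions beyond the sequence length. A plain NumPy sketch of the op's intended semantics (not of the XLA while loop), with illustrative names:

    import numpy as np

    def reverse_sequence_ref(x, seq_lens, seq_dim=1, batch_dim=0):
        # Reverse only the first seq_lens[b] entries of each batch element
        # along seq_dim; entries past the sequence length are left unchanged.
        out = np.array(x, copy=True)
        for b, length in enumerate(seq_lens):
            idx = [slice(None)] * x.ndim
            idx[batch_dim] = slice(b, b + 1)   # keep the batch axis
            idx[seq_dim] = slice(0, length)    # only the valid prefix
            out[tuple(idx)] = np.flip(x[tuple(idx)], axis=seq_dim)
        return out

    x = np.arange(12).reshape(2, 6)            # batch of 2, max_seq_len of 6
    print(reverse_sequence_ref(x, seq_lens=[3, 5]))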
auto start_indices = diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index f0525a5fb86..91c169428c7 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -231,6 +231,7 @@ class StridedSliceAssignOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("new_axis_mask", &new_axis_mask_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("shrink_axis_mask", &shrink_axis_mask_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("Index", &index_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); } void Compile(XlaOpKernelContext* ctx) override { @@ -252,9 +253,9 @@ class StridedSliceAssignOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_, &strides_tensor)); - DataType lhs_type; TensorShape lhs_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &lhs_type, &lhs_shape)); + xla::ComputationDataHandle lhs; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs)); const TensorShape rhs_shape = ctx->InputShape(4); @@ -282,9 +283,6 @@ class StridedSliceAssignOp : public XlaOpKernel { " does not match r-value shape ", rhs_shape.DebugString(), ". Automatic broadcasting not yet implemented.")); - xla::ComputationDataHandle lhs; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &lhs)); - xla::ComputationDataHandle rhs = ctx->Input(4); gtl::InlinedVector dimensions_to_reverse; @@ -320,13 +318,14 @@ class StridedSliceAssignOp : public XlaOpKernel { lhs, rhs, ctx->builder()->ConstantR1(slice_begin)); } - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, lhs_type, lhs)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs)); } private: int32 begin_mask_, end_mask_; int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_; DataType index_type_; + DataType dtype_; }; REGISTER_XLA_OP(Name("ResourceStridedSliceAssign") diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 9224072a3cb..7cf9b796b91 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -62,15 +62,13 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder, TF_RET_CHECK(resource->tensor_array_size() >= 0) << resource->name() << " size " << resource->tensor_array_size(); - TensorShape ta_shape; - ta_shape.AddDim(resource->tensor_array_size()); - ta_shape.AppendShape(elem_shape); if (!resource->initialized()) { xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, resource->type()); - TF_RETURN_IF_ERROR(resource->SetValue( - dtype, builder->Broadcast(zero, ta_shape.dim_sizes()))); + + TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape)); + TF_RETURN_IF_ERROR(resource->SetZeroValue(builder)); } else { // Checks the elem_shape matches the TensorArray shape. 
auto shape_or_status = builder->GetShape(resource->value()); @@ -80,6 +78,10 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder, TensorShape shape; TF_RETURN_IF_ERROR( XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape)); + + TensorShape ta_shape; + ta_shape.AddDim(resource->tensor_array_size()); + ta_shape.AppendShape(elem_shape); if (ta_shape != shape) { return errors::InvalidArgument( "Mismatched TensorArray sizes: ", ta_shape.DebugString(), " vs ", @@ -114,10 +116,8 @@ Status CheckTensorArrayIsInitialized(const string& op_name, Status GetTensorArrayShape(const XlaResource* resource, xla::ComputationBuilder* builder, TensorShape* shape) { - TF_RETURN_IF_ERROR(resource->GetShape(builder, shape)); - if (shape->dims() < 1) { - return errors::InvalidArgument("TensorArray rank must be >= 1"); - } + *shape = resource->shape(); + shape->InsertDim(0, resource->tensor_array_size()); return Status::OK(); } @@ -160,8 +160,8 @@ class TensorArrayOp : public XlaOpKernel { // Initializes the TensorArray value if we know the element shape. // Otherwise, defer initialization to the first write. xla::ComputationDataHandle value; + TensorShape shape; if (element_shape_.IsFullyDefined()) { - TensorShape shape; CHECK(element_shape_.AsTensorShape(&shape)); TensorShape ta_shape; ta_shape.AddDim(size); @@ -175,8 +175,8 @@ class TensorArrayOp : public XlaOpKernel { string name = strings::StrCat("TensorArray: ", tensor_array_name_); OP_REQUIRES_OK( ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name), - dtype_, value, &var)); - var->set_tensor_array_size(size); + dtype_, shape, value, /*tensor_array_size=*/size, + /*tensor_array_gradients=*/{}, &var)); ctx->SetResourceOutput(0, var); Tensor flow(DT_FLOAT, TensorShape({})); @@ -230,7 +230,7 @@ class TensorArrayWriteOp : public XlaOpKernel { xla::ComputationDataHandle written = DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices); - OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, written)); + OP_REQUIRES_OK(ctx, resource->SetValue(written)); ctx->SetOutput(0, flow); } @@ -421,7 +421,7 @@ class TensorArrayScatterOp : public XlaOpKernel { } } - OP_REQUIRES_OK(ctx, resource->SetValue(dtype_, ta)); + OP_REQUIRES_OK(ctx, resource->SetValue(ta)); ctx->SetOutput(0, flow); } @@ -525,9 +525,8 @@ class TensorArraySplitOp : public XlaOpKernel { value_shape.DebugString(), " vs. 
", ta_shape.DebugString())); - OP_REQUIRES_OK( - ctx, resource->SetValue( - dtype_, b->Add(ta, b->Reshape(value, ta_shape.dim_sizes())))); + OP_REQUIRES_OK(ctx, resource->SetValue(b->Add( + ta, b->Reshape(value, ta_shape.dim_sizes())))); ctx->SetOutput(0, flow); } diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc index 5534d1bfa13..f750f7003be 100644 --- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc @@ -32,9 +32,24 @@ class ResourceApplyGradientDescent : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { xla::ComputationDataHandle handle; xla::ComputationBuilder* b = ctx->builder(); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + DataType type = ctx->input_type(1); + TensorShape var_shape; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &handle)); + + TensorShape alpha_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha_shape), + errors::InvalidArgument("alpha is not a scalar: ", + alpha_shape.DebugString())); + + TensorShape delta_shape = ctx->InputShape(2); + OP_REQUIRES( + ctx, var_shape.IsSameSize(delta_shape), + errors::InvalidArgument("var and delta do not have the same shape: ", + var_shape.DebugString(), " vs ", + delta_shape.DebugString())); + handle = b->Sub(handle, b->Mul(ctx->Input(1), ctx->Input(2))); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; REGISTER_XLA_OP( @@ -52,18 +67,10 @@ class ResourceApplyMomentum : public XlaOpKernel { DataType type = ctx->input_type(2); - DataType var_type, accum_type; TensorShape var_shape, accum_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); - - OP_REQUIRES( - ctx, type == var_type && type == accum_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyMomentum must match: ", - DataTypeString(type), " vs. ", DataTypeString(var_type), " and ", - DataTypeString(accum_type))); + xla::ComputationDataHandle var, accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum)); OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), errors::InvalidArgument( @@ -86,10 +93,6 @@ class ResourceApplyMomentum : public XlaOpKernel { errors::InvalidArgument("momentum is not a scalar: ", momentum_shape.DebugString())); - xla::ComputationDataHandle var, accum; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); - xla::ComputationDataHandle lr = ctx->Input(2); xla::ComputationDataHandle grad = ctx->Input(3); xla::ComputationDataHandle momentum = ctx->Input(4); @@ -122,18 +125,10 @@ class ResourceApplyAdagrad : public XlaOpKernel { DataType type = ctx->input_type(2); - DataType var_type, accum_type; TensorShape var_shape, accum_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); - - OP_REQUIRES( - ctx, type == var_type && type == accum_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyAdagrad must match: ", - DataTypeString(type), " vs. 
", DataTypeString(var_type), " and ", - DataTypeString(accum_type))); + xla::ComputationDataHandle var, accum; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum)); OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), errors::InvalidArgument( @@ -151,9 +146,6 @@ class ResourceApplyAdagrad : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::ComputationDataHandle var, accum; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); xla::ComputationDataHandle lr = ctx->Input(2); xla::ComputationDataHandle grad = ctx->Input(3); @@ -175,18 +167,11 @@ class ResourceApplyAdam : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - DataType var_type, m_type, v_type; TensorShape var_shape, m_shape, v_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &m_type, &m_shape)); - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &v_type, &v_shape)); - - OP_REQUIRES( - ctx, dtype_ == var_type && dtype_ == m_type && dtype_ == v_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyRMSProp must match: ", - DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " vs. ", - DataTypeString(m_type), " vs. ", DataTypeString(v_type))); + xla::ComputationDataHandle var, m, v; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype_, &m_shape, &m)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype_, &v_shape, &v)); TensorShape beta1_power_shape = ctx->InputShape(3); TensorShape beta2_power_shape = ctx->InputShape(4); @@ -228,10 +213,6 @@ class ResourceApplyAdam : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::ComputationDataHandle var, m, v; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &m)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &v)); xla::ComputationDataHandle beta1_power = ctx->Input(3); xla::ComputationDataHandle beta2_power = ctx->Input(4); xla::ComputationDataHandle lr = ctx->Input(5); @@ -278,18 +259,11 @@ class ResourceApplyRMSProp : public XlaOpKernel { DataType type = ctx->input_type(3); - DataType var_type, ms_type, mom_type; TensorShape var_shape, ms_shape, mom_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &ms_type, &ms_shape)); - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &mom_type, &mom_shape)); - - OP_REQUIRES( - ctx, type == var_type && type == ms_type && type == mom_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyRMSProp must match: ", - DataTypeString(type), " vs. ", DataTypeString(var_type), " vs. ", - DataTypeString(ms_type), " vs. 
", DataTypeString(mom_type))); + xla::ComputationDataHandle var, ms, mom; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &ms_shape, &ms)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, type, &mom_shape, &mom)); TensorShape lr_shape = ctx->InputShape(3); OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape), @@ -323,10 +297,6 @@ class ResourceApplyRMSProp : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::ComputationDataHandle var, ms, mom; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &ms)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &mom)); xla::ComputationDataHandle lr = ctx->Input(3); xla::ComputationDataHandle rho = ctx->Input(4); xla::ComputationDataHandle momentum = ctx->Input(5); @@ -373,20 +343,11 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, bool has_l2_shrinkage) { xla::ComputationBuilder* b = ctx->builder(); - DataType var_type, accum_type, linear_type; TensorShape var_shape, accum_shape, linear_shape; - OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape)); - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape)); - OP_REQUIRES_OK(ctx, - ctx->GetVariableTypeAndShape(2, &linear_type, &linear_shape)); - - OP_REQUIRES( - ctx, dtype == var_type && dtype == accum_type && dtype == linear_type, - errors::InvalidArgument( - "Types of variable arguments to ResourceApplyFtrlV2 must match: ", - DataTypeString(dtype), " vs. ", DataTypeString(var_type), " and ", - DataTypeString(accum_type), " and ", DataTypeString(linear_type))); + xla::ComputationDataHandle var, accum, linear; + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype, &var_shape, &var)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype, &accum_shape, &accum)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype, &linear_shape, &linear)); OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape), errors::InvalidArgument( @@ -438,10 +399,6 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, errors::InvalidArgument("lr_power is not a scalar: ", lr_power_shape.DebugString())); - xla::ComputationDataHandle var, accum, linear; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum)); - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &linear)); xla::ComputationDataHandle grad = ctx->Input(3); xla::ComputationDataHandle lr = ctx->Input(4); xla::ComputationDataHandle l1 = ctx->Input(5); diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index a266e9013c4..0c5ad9e5255 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -50,18 +50,41 @@ XLAJIT_MAKE_UNARY(Conj, b->Conj(x)); // Return x if x>0, otherwise -x. 
XLAJIT_MAKE_UNARY(Abs, b->Abs(x)); +// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) +XLAJIT_MAKE_UNARY( + Acos, + b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0), + b->Atan2(b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)), + b->Mul(x, x)), + XlaHelpers::FloatLiteral(b, input_type(0), 0.5)), + b->Add(XlaHelpers::One(b, input_type(0)), x)))); + // acosh(x) = log(x + sqrt(x^2 - 1)) XLAJIT_MAKE_UNARY( Acosh, b->Log(b->Add(x, b->Pow(b->Sub(b->Mul(x, x), XlaHelpers::One(b, input_type(0))), XlaHelpers::FloatLiteral(b, input_type(0), 0.5))))); + +// asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2))) +XLAJIT_MAKE_UNARY( + Asin, + b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0), + b->Atan2(x, b->Add(XlaHelpers::One(b, input_type(0)), + b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)), + b->Mul(x, x)), + XlaHelpers::FloatLiteral(b, input_type(0), + 0.5)))))); + // asinh(x) = log(x + sqrt(x^2 + 1)) XLAJIT_MAKE_UNARY( Asinh, b->Log(b->Add(x, b->Pow(b->Add(b->Mul(x, x), XlaHelpers::One(b, input_type(0))), XlaHelpers::FloatLiteral(b, input_type(0), 0.5))))); + +XLAJIT_MAKE_UNARY(Atan, b->Atan2(x, XlaHelpers::One(b, input_type(0)))); + // atanh(x) = 0.5 * log((1 + x) / (1 - x)) XLAJIT_MAKE_UNARY( Atanh, b->Mul(b->Log(b->Div(b->Add(XlaHelpers::One(b, input_type(0)), x), diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index 68847ae7a2c..e4079ebf0b8 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -33,21 +33,29 @@ class VarIsInitializedOp : public XlaOpKernel { public: explicit VarIsInitializedOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle handle; - bool initialized = ctx->ReadVariableInput(0, &handle).ok(); - ctx->SetOutput(0, ctx->builder()->ConstantR0(initialized)); + XlaResource* variable; + OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &variable)); + ctx->SetOutput(0, + ctx->builder()->ConstantR0(variable->initialized())); } }; REGISTER_XLA_OP(Name("VarIsInitializedOp"), VarIsInitializedOp); class ReadVariableOp : public XlaOpKernel { public: - explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + void Compile(XlaOpKernelContext* ctx) override { xla::ComputationDataHandle handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + OP_REQUIRES_OK( + ctx, ctx->ReadVariableInput(0, dtype_, /*shape=*/nullptr, &handle)); ctx->SetOutput(0, handle); } + + private: + DataType dtype_; }; REGISTER_XLA_OP(Name("ReadVariableOp"), ReadVariableOp); @@ -65,10 +73,12 @@ class AssignAddVariableOp : public XlaOpKernel { public: explicit AssignAddVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { + DataType type = ctx->input_type(1); xla::ComputationDataHandle handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle)); handle = ctx->builder()->Add(handle, ctx->Input(1)); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; REGISTER_XLA_OP( @@ -79,10 +89,12 @@ class AssignSubVariableOp : public XlaOpKernel { public: explicit AssignSubVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void 
Compile(XlaOpKernelContext* ctx) override { + DataType type = ctx->input_type(1); xla::ComputationDataHandle handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle)); + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle)); handle = ctx->builder()->Sub(handle, ctx->Input(1)); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle)); + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle)); } }; REGISTER_XLA_OP( @@ -95,28 +107,19 @@ class ResourceGatherOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { xla::ComputationBuilder* builder = ctx->builder(); - // Get the shape of the resource tensor. + DataType type = ctx->expected_output_dtype(0); + TensorShape resource_shape; - DataType resource_dtype; - OP_REQUIRES_OK( - ctx, ctx->GetVariableTypeAndShape(0, &resource_dtype, &resource_shape)); - - DataType expected_output_dtype = ctx->expected_output_dtype(0); - OP_REQUIRES(ctx, resource_dtype == expected_output_dtype, - errors::InvalidArgument( - "Variable dtype is ", DataTypeString(resource_dtype), - " but expected output dtype is ", - DataTypeString(expected_output_dtype), ".")); - xla::ComputationDataHandle resource_handle; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &resource_handle)); + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &resource_shape, + &resource_handle)); auto indices = ctx->Input(1); auto indices_shape = ctx->InputShape(1); DataType index_type = ctx->input_type(1); xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice( - ctx, resource_handle, resource_shape, indices, indices_shape, 0, - resource_dtype, index_type, builder); + ctx, resource_handle, resource_shape, indices, indices_shape, 0, type, + index_type, builder); ctx->SetOutput(0, gather); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 4a711e4d9b7..0ff1b65ae91 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -58,9 +58,8 @@ Status MakeXlaCompilerArgumentsFromInputs( } arg.type = resource->type(); - if (arg.initialized) { - TF_RETURN_IF_ERROR(resource->PackedShape(ctx->builder(), &arg.shape)); - } else { + arg.shape = resource->shape(); + if (!arg.initialized) { *has_uninitialized_vars = true; } arg.tensor_array_size = resource->tensor_array_size(); @@ -70,14 +69,13 @@ Status MakeXlaCompilerArgumentsFromInputs( arg.name = resource->name(); VLOG(2) << " resource " << resource->name() << " type: " << DataTypeString(arg.type) - << " shape: " << xla::ShapeUtil::HumanString(arg.shape) + << " shape: " << arg.shape.DebugString() << " initialized: " << arg.initialized; } else { arg.kind = XlaCompiler::Argument::kParameter; arg.type = ctx->input_type(i); - TF_RETURN_IF_ERROR( - TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape)); + arg.shape = ctx->InputShape(i); } } return Status::OK(); @@ -154,17 +152,14 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { XlaCompiler::Argument& arg = arguments[update.input_index]; if (!arg.initialized) { VLOG(2) << "Update shape for argument " << update.input_index << " " - << xla::ShapeUtil::HumanString(update.shape); + << update.shape.DebugString(); arg.initialized = true; - xla::Shape shape = update.shape; - if (!update.tensor_array_gradients_accessed.empty()) { - shape = xla::ShapeUtil::GetTupleElementShape(shape, 0); - } - std::unique_ptr zero = - xla::Literal::CreateFromShape(shape); - OP_REQUIRES_OK(ctx, resource->SetValue( - 
update.type, builder->ConstantLiteral(*zero))); + arg.shape = update.shape; + OP_REQUIRES_OK(ctx, + resource->SetTypeAndShape(update.type, update.shape)); + + OP_REQUIRES_OK(ctx, resource->SetZeroValue(builder)); } // Add any TensorArray gradients touched by the body to the enclosing @@ -182,9 +177,6 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { for (const auto& gradient : resource->tensor_array_gradients()) { arg.tensor_array_gradients.insert(gradient.first); } - - // Recompute the argument shape. - OP_REQUIRES_OK(ctx, resource->PackedShape(ctx->builder(), &arg.shape)); } // Recompile the body with the "correct" resource shapes. VLOG(1) << "Recompiling body with corrected resource shapes"; @@ -292,13 +284,12 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { OP_REQUIRES_OK(ctx, resource->SetFromPack( arguments[update.input_index].tensor_array_gradients, - builder->GetTupleElement(while_result, pos), - /*reset_initial_values=*/false, builder)); + builder->GetTupleElement(while_result, pos), builder)); } VLOG(2) << "Loop-carried variable: pos: " << update.input_index << " name: " << resource->name() << " modified: " << update.modified << " type: " << DataTypeString(update.type) - << " shape: " << xla::ShapeUtil::HumanString(update.shape); + << " shape: " << update.shape.DebugString(); // Copies the identity of the resource variable from input to output // unchanged, even if the variable was not modified. ctx->op_kernel_context()->set_output( diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index 21ad21f7373..d184f59e017 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -60,6 +60,8 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc index 9b0e6174475..798f0fa7805 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc @@ -25,11 +25,10 @@ limitations under the License. namespace tensorflow { -// The current implementation simply unrolls the computation along the batch -// dimension. 
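The Acos/Asin/Atan rewrites added to unary_ops.cc above rely on standard half-angle identities expressed through atan2. They can be checked numerically; a short sketch in which NumPy stands in for the XLA Atan2/Pow primitives:

    import numpy as np

    x = np.linspace(-0.999, 0.999, 9)
    # acos(x) = 2 * atan2(sqrt(1 - x^2), 1 + x)
    acos_rewrite = 2.0 * np.arctan2(np.sqrt(1.0 - x * x), 1.0 + x)
    # asin(x) = 2 * atan2(x, 1 + sqrt(1 - x^2))
    asin_rewrite = 2.0 * np.arctan2(x, 1.0 + np.sqrt(1.0 - x * x))
    # atan(x) = atan2(x, 1)
    atan_rewrite = np.arctan2(x, np.ones_like(x))

    assert np.allclose(acos_rewrite, np.arccos(x))
    assert np.allclose(asin_rewrite, np.arcsin(x))
    assert np.allclose(atan_rewrite, np.arctan(x))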
xla::StatusOr BatchDot( xla::ComputationBuilder* builder, xla::ComputationDataHandle x, - xla::ComputationDataHandle y, bool transpose_x, bool transpose_y) { + xla::ComputationDataHandle y, bool transpose_x, bool transpose_y, + bool conjugate_x, bool conjugate_y) { TF_ASSIGN_OR_RETURN(std::unique_ptr x_shape, builder->GetShape(x)); TF_ASSIGN_OR_RETURN(std::unique_ptr y_shape, @@ -89,10 +88,10 @@ xla::StatusOr BatchDot( dimensions); } - if (x_shape->element_type() == xla::C64 && transpose_x) { + if (x_shape->element_type() == xla::C64 && conjugate_x) { x = builder->Conj(x); } - if (y_shape->element_type() == xla::C64 && transpose_y) { + if (y_shape->element_type() == xla::C64 && conjugate_y) { y = builder->Conj(y); } diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h index b46bc7417d2..b230e885f10 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.h +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h @@ -27,7 +27,10 @@ namespace tensorflow { // viewed as an element of a batch), and arranges the individual results // in a single output tensor of the same batch size. Each of the // individual slices can optionally be transposed before multiplication by -// setting the `transpose_x` or `transpose_y` flag to `true`. +// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each +// can be elementwise-complex-conjugated by setting the `conjugate_x` or +// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both +// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`. // // The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]` // and `[..., r_y, c_y]`. @@ -40,11 +43,10 @@ namespace tensorflow { // It is computed as: // // output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) -// TODO(phawkins): add an option to take the complex conjugate of the LHS or -// RHS. 
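As a reference for the contract documented above, the following NumPy sketch mirrors BatchDot's transpose/conjugate flags on 3-D inputs (one batch dimension); batch_dot_ref is an illustrative name, not part of the library:

    import numpy as np

    def batch_dot_ref(x, y, transpose_x=False, transpose_y=False,
                      conjugate_x=False, conjugate_y=False):
        # Optionally conjugate and/or swap the two innermost dimensions of
        # each operand, then multiply matching batch slices.
        if conjugate_x:
            x = np.conj(x)
        if conjugate_y:
            y = np.conj(y)
        if transpose_x:
            x = np.swapaxes(x, -1, -2)
        if transpose_y:
            y = np.swapaxes(y, -1, -2)
        return np.matmul(x, y)

    a = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4)
    b = np.random.randn(2, 3, 5) + 1j * np.random.randn(2, 3, 5)
    # Hermitian adjoint of `a`: set both transpose_x and conjugate_x to True.
    print(batch_dot_ref(a, b, transpose_x=True, conjugate_x=True).shape)  # (2, 4, 5)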
xla::StatusOr BatchDot( xla::ComputationBuilder* builder, xla::ComputationDataHandle x, - xla::ComputationDataHandle y, bool transpose_x, bool transpose_y); + xla::ComputationDataHandle y, bool transpose_x, bool transpose_y, + bool conjugate_x = false, bool conjugate_y = false); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc index b3cc489adf6..e795701181d 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.cc +++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc @@ -71,11 +71,14 @@ xla::StatusOr CholeskyUnblocked( SliceInMinorDims(builder, l, {j + 1, 0}, {n, j})); TF_ASSIGN_OR_RETURN(auto r_squared, BatchDot(builder, r, r, /*transpose_x=*/false, - /*transpose_y=*/true)); + /*transpose_y=*/true, /*conjugate_x=*/false, + /*conjugate_y=*/false)); new_d_squared = builder->Sub(new_d_squared, r_squared); TF_ASSIGN_OR_RETURN(br, BatchDot(builder, b, r, /*transpose_x=*/false, - /*transpose_y=*/true)); + /*transpose_y=*/true, + /*conjugate_x=*/false, + /*conjugate_y=*/false)); } auto new_d_inv = builder->Pow( new_d_squared, FloatLiteral(builder, shape->element_type(), -0.5)); @@ -134,7 +137,8 @@ xla::StatusOr Cholesky( SliceInMinorDims(builder, l, {i, 0}, {i + k, i})); TF_ASSIGN_OR_RETURN(auto delta, BatchDot(builder, lhs, rhs, /*transpose_x=*/false, - /*transpose_y=*/true)); + /*transpose_y=*/true, /*conjugate_x=*/false, + /*conjugate_y=*/false)); TF_ASSIGN_OR_RETURN(auto before, SliceInMinorDims(builder, a, {i, i}, {n, i + k})); TF_ASSIGN_OR_RETURN( @@ -155,6 +159,10 @@ xla::StatusOr Cholesky( SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); TF_ASSIGN_OR_RETURN(auto update, TriangularSolve(builder, factorized, panel, + /*left_side=*/false, + /*lower=*/true, + /*transpose_a=*/true, + /*conjugate_a=*/false, /*block_size=*/8)); TF_ASSIGN_OR_RETURN( l, UpdateSliceInMinorDims(builder, l, update, {i + k, i})); diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h index 2bead7359ba..e083a383be4 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.h +++ b/tensorflow/compiler/tf2xla/lib/cholesky.h @@ -29,6 +29,7 @@ namespace tensorflow { // the block size to use. // TODO(phawkins): check for negative values on the diagonal and return an // error, instead of silently yielding NaNs. +// TODO(mattjj): handle the complex Hermitian case xla::StatusOr Cholesky( xla::ComputationBuilder* builder, xla::ComputationDataHandle a, int64 block_size = 256); diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index 579944c3a38..7f72a6073df 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -24,13 +24,15 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { xla::StatusOr TriangularSolve( xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, - xla::ComputationDataHandle b, int64 block_size) { + xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a, + bool conjugate_a, int64 block_size) { TF_ASSIGN_OR_RETURN(std::unique_ptr a_shape, builder->GetShape(a)); TF_ASSIGN_OR_RETURN(std::unique_ptr b_shape, @@ -60,14 +62,15 @@ xla::StatusOr TriangularSolve( batch_dimensions.push_back(a_size); } - const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1); - const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2); - if (n != xla::ShapeUtil::GetDimension(*a_shape, -2)) { + if (xla::ShapeUtil::GetDimension(*a_shape, -1) != + xla::ShapeUtil::GetDimension(*a_shape, -2)) { return errors::InvalidArgument( "The 'a' arguments to TriangularSolve must be square matrices: ", xla::ShapeUtil::HumanString(*a_shape)); } - if (n != xla::ShapeUtil::GetDimension(*b_shape, -1)) { + const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1); + if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(*a_shape, -1)) { return errors::InvalidArgument( "Arguments to TriangularSolve have incompatible matrix shapes: ", xla::ShapeUtil::HumanString(*a_shape), " vs ", @@ -89,6 +92,14 @@ xla::StatusOr TriangularSolve( return output; }; + // Applies a complex conjugation operation if `a` is complex and `conjugate_a` + // is true, otherwise returns its argument. + auto maybe_conj = [&](xla::ComputationBuilder* builder, + xla::ComputationDataHandle x) { + auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a; + return perform_conj ? builder->Conj(x) : x; + }; + std::map base_computations; auto get_base_triangular_solve = [&](int k) -> xla::StatusOr { @@ -103,19 +114,35 @@ xla::StatusOr TriangularSolve( prepend_batch_dims({k, k})), "a"); + std::array b_lastd; + if (left_side) { + b_lastd = {k, n}; + } else { + b_lastd = {m, k}; + } auto b_param = sub->Parameter(1, xla::ShapeUtil::MakeShape(b_shape->element_type(), - prepend_batch_dims({m, k})), + prepend_batch_dims(b_lastd)), "b"); - // TODO(phawkins): it might make sense to use a while loop here, rather - // than unrolling. - // TODO(phawkins): the left-looking variant of the algorithm might be more - // efficient at block size 1. - TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param, - /*block_size=*/1) - .status()); + // We use a left-looking subroutine on the block diagonal in some common + // cases, while falling back to a recursive call in unsupported cases. The + // left-looking subroutine is written with a While loop and so yields much + // faster compile times. Moreover, the left-looking variant can give + // higher performance on smaller (sub)problems. + if (left_side && lower) { + TF_RETURN_IF_ERROR(TriangularSolveLeftLooking(sub.get(), a_param, + b_param, transpose_a, + conjugate_a) + .status()); + } else { + TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param, + left_side, lower, transpose_a, + conjugate_a, + /*block_size=*/1) + .status()); + } TF_ASSIGN_OR_RETURN(computation, sub->Build()); } @@ -129,47 +156,396 @@ xla::StatusOr TriangularSolve( // Goto, Kazushige, and Robert Van De Geijn. 
"High-performance implementation // of the level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1 // (2008): 4. - for (int64 i = 0; i < n; i += block_size) { - int64 k = std::min(block_size, n - i); - // if k > 1: - // output[..., :, i:i+k] = triangular_solve( - // a[..., i:i+k, ..., i:i+k], b[..., :, i:i+k], side='Right', - // kind='Lower', transpose=True, block_size=1) - // else: - // output[..., :, i] = b[..., :, i] / a[..., i, i] - TF_ASSIGN_OR_RETURN(auto a_slice, - SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); - TF_ASSIGN_OR_RETURN(auto b_slice, - SliceInMinorDims(builder, b, {0, i}, {m, i + k})); - xla::ComputationDataHandle update; - if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::Computation * solve, - get_base_triangular_solve(k)); - update = builder->Call(*solve, {a_slice, b_slice}); - } else { - update = builder->Div(b_slice, a_slice); + // In the code comments below, T = lambda x: np.swapaxes(x, -1, -2) if + // conjugate_a is False, or T = lambda x: np.conj(np.swapaxes(x, -1, -2)) if + // conjugate_a is True. + + if (!left_side && lower == transpose_a) { + // for i in range(0, a.shape[-1], block_size): + for (int64 i = 0; i < n; i += block_size) { + int64 k = std::min(block_size, n - i); + + // output[..., :, i:i+k] = triangular_solve( + // a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1) + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {0, i}, {m, i + k})); + xla::ComputationDataHandle update; + if (k > 1) { + TF_ASSIGN_OR_RETURN(xla::Computation * solve, + get_base_triangular_solve(k)); + update = builder->Call(*solve, {a_slice, b_slice}); + } else { + update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + } + TF_ASSIGN_OR_RETURN( + output, UpdateSliceInMinorDims(builder, output, update, {0, i})); + + // if i + k < a.shape[-1]: + // a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:] + // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 + // b[..., :, i+k:] -= np.matmul(output[..., :, i:i+k], a_slice_2) + if (i + k < n) { + xla::ComputationDataHandle a_slice_2; + if (lower) { + TF_ASSIGN_OR_RETURN( + a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); + } else { + TF_ASSIGN_OR_RETURN( + a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, n})); + } + + TF_ASSIGN_OR_RETURN(auto b_update, + BatchDot(builder, update, a_slice_2, + /*transpose_x=*/false, + /*transpose_y=*/transpose_a, + /*conjugate_x=*/false, + /*conjugate_y=*/conjugate_a)); + TF_ASSIGN_OR_RETURN(auto b_slice_2, + SliceInMinorDims(builder, b, {0, i + k}, {m, n})); + b_update = builder->Sub(b_slice_2, b_update); + TF_ASSIGN_OR_RETURN( + b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k})); + } } - TF_ASSIGN_OR_RETURN( - output, UpdateSliceInMinorDims(builder, output, update, {0, i})); - // b[..., :, i+k:] -= np.dot(output[..., :, i:i+k], - // np.transpose(..., a[i+k:, i:i+k])) - if (i + k < n) { - TF_ASSIGN_OR_RETURN(auto a_slice_2, - SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); - TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, update, a_slice_2, - /*transpose_x=*/false, - /*transpose_y=*/true)); + } else if (left_side && lower != transpose_a) { + // for i in range(0, a.shape[-1], block_size): + for (int64 i = 0; i < m; i += block_size) { + int64 k = std::min(block_size, m - i); - TF_ASSIGN_OR_RETURN(auto b_slice_2, - SliceInMinorDims(builder, b, {0, i + k}, {m, n})); - b_update = builder->Sub(b_slice_2, 
b_update); + // output[..., i:i+k, :] = triangular_solve( + // a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1) + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {i, 0}, {i + k, n})); + xla::ComputationDataHandle update; + if (k > 1) { + TF_ASSIGN_OR_RETURN(xla::Computation * solve, + get_base_triangular_solve(k)); + update = builder->Call(*solve, {a_slice, b_slice}); + } else { + update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + } TF_ASSIGN_OR_RETURN( - b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k})); + output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); + + // if i + k < a.shape[-1]: + // a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:] + // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 + // b[..., i+k:, :] -= np.matmul(a_slice_2, output[..., i:i+k, :]) + if (i + k < m) { + xla::ComputationDataHandle a_slice_2; + if (lower) { + TF_ASSIGN_OR_RETURN( + a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {m, i + k})); + } else { + TF_ASSIGN_OR_RETURN( + a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, m})); + } + + TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update, + /*transpose_x=*/transpose_a, + /*transpose_y=*/false, + /*conjugate_x=*/conjugate_a, + /*conjugate_y=*/false)); + TF_ASSIGN_OR_RETURN(auto b_slice_2, + SliceInMinorDims(builder, b, {i + k, 0}, {m, n})); + b_update = builder->Sub(b_slice_2, b_update); + TF_ASSIGN_OR_RETURN( + b, UpdateSliceInMinorDims(builder, b, b_update, {i + k, 0})); + } + } + } else if (!left_side && lower != transpose_a) { + // for i in reversed(range(0, a.shape[-1], block_size)): + const int64 last_blk_ix = xla::RoundUpToNearest(n, block_size) - block_size; + for (int64 i = last_blk_ix; i >= 0; i -= block_size) { + int64 k = std::min(block_size, n - i); + + // output[..., :, i:i+k] triangular_solve( + // a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1) + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {0, i}, {m, i + k})); + xla::ComputationDataHandle update; + if (k > 1) { + TF_ASSIGN_OR_RETURN(xla::Computation * solve, + get_base_triangular_solve(k)); + update = builder->Call(*solve, {a_slice, b_slice}); + } else { + update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + } + TF_ASSIGN_OR_RETURN( + output, UpdateSliceInMinorDims(builder, output, update, {0, i})); + + // if i - k >= 0: + // a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k] + // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 + // b[..., :, :i] -= np.matmul(out[..., :, i:i+k], a_slice_2) + if (i - k >= 0) { + xla::ComputationDataHandle a_slice_2; + if (lower) { + TF_ASSIGN_OR_RETURN(a_slice_2, + SliceInMinorDims(builder, a, {i, 0}, {i + k, i})); + } else { + TF_ASSIGN_OR_RETURN(a_slice_2, + SliceInMinorDims(builder, a, {0, i}, {i, i + k})); + } + + TF_ASSIGN_OR_RETURN(auto b_update, + BatchDot(builder, update, a_slice_2, + /*transpose_x=*/false, + /*transpose_y=*/transpose_a, + /*conjugate_x=*/false, + /*conjugate_y=*/conjugate_a)); + TF_ASSIGN_OR_RETURN(auto b_slice_2, + SliceInMinorDims(builder, b, {0, 0}, {m, i})); + b_update = builder->Sub(b_slice_2, b_update); + TF_ASSIGN_OR_RETURN( + b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0})); + } + } + } else { // left_side && lower == transpose_a + // for i in 
reversed(range(0, a.shape[-1], block_size)): + const int64 last_blk_ix = xla::RoundUpToNearest(m, block_size) - block_size; + for (int64 i = last_blk_ix; i >= 0; i -= block_size) { + int64 k = std::min(block_size, m - i); + + // output[..., i:i+k, :] triangular_solve( + // a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1) + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {i, 0}, {i + k, n})); + xla::ComputationDataHandle update; + if (k > 1) { + TF_ASSIGN_OR_RETURN(xla::Computation * solve, + get_base_triangular_solve(k)); + update = builder->Call(*solve, {a_slice, b_slice}); + } else { + update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + } + TF_ASSIGN_OR_RETURN( + output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); + + // if i - k >= 0: + // a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k] + // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 + // b[..., :i, :] -= np.matmul(a_slice_2, out[..., i:i+k, :]) + if (i - k >= 0) { + xla::ComputationDataHandle a_slice_2; + if (lower) { + TF_ASSIGN_OR_RETURN(a_slice_2, + SliceInMinorDims(builder, a, {i, 0}, {i + k, i})); + } else { + TF_ASSIGN_OR_RETURN(a_slice_2, + SliceInMinorDims(builder, a, {0, i}, {i, i + k})); + } + + TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update, + /*transpose_x=*/transpose_a, + /*transpose_y=*/false, + /*conjugate_x=*/conjugate_a, + /*conjugate_y=*/false)); + TF_ASSIGN_OR_RETURN(auto b_slice_2, + SliceInMinorDims(builder, b, {0, 0}, {i, n})); + b_update = builder->Sub(b_slice_2, b_update); + TF_ASSIGN_OR_RETURN( + b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0})); + } } } + return output; } +xla::StatusOr TriangularSolveLeftLooking( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, + const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a) { + TF_ASSIGN_OR_RETURN(std::unique_ptr a_shape, + builder->GetShape(a)); + TF_ASSIGN_OR_RETURN(std::unique_ptr b_shape, + builder->GetShape(b)); + const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1); + const int64 ndims = xla::ShapeUtil::Rank(*a_shape); + + std::vector batch_dimensions; + for (int i = 0; i < ndims - 2; ++i) { + int64 a_size = a_shape->dimensions(i); + batch_dimensions.push_back(a_size); + } + + auto prepend_batch_dims = [&](std::array indices) { + std::vector output(ndims); + std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin()); + std::copy(indices.begin(), indices.end(), + output.begin() + batch_dimensions.size()); + return output; + }; + + auto maybe_conj = [&](xla::ComputationBuilder* builder, + xla::ComputationDataHandle x) { + auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a; + return perform_conj ? builder->Conj(x) : x; + }; + + // The main computation is performed in a While loop. + + // Allocate the output and set its first or last row, + // output = np.zeros_like(b) + // if transpose_a: + // output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:] + // else: + // output[..., :1, :] = b[..., :1, :] / a[..., :1, :1] + xla::ComputationDataHandle output = Zeros(builder, *b_shape); + { + auto i = transpose_a ? 
m - 1 : 0; + TF_ASSIGN_OR_RETURN(auto a_slice, + SliceInMinorDims(builder, a, {i, i}, {i + 1, i + 1})); + TF_ASSIGN_OR_RETURN(auto b_slice, + SliceInMinorDims(builder, b, {i, 0}, {i + 1, n})); + auto update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + TF_ASSIGN_OR_RETURN( + output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); + } + + // Construct the initial loop carry tuple, + // if transpose_a: + // init = (m-2, output, a, b) + // else: + // init = (1, output, a, b) + std::vector tuple_shapes = { + // The loop iteration counter is a scalar, incremented each iteration. + xla::ShapeUtil::MakeShape(xla::S32, {}), + // The output has the shape of b, with one row updated each iteration. + *b_shape, + // The coefficient matrix a is a loop invariant. + *a_shape, + // The right-hand-side matrix b is a loop invariant. + *b_shape}; + xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes); + auto init_i = builder->ConstantR0(transpose_a ? m - 2 : 1); + auto init = builder->Tuple({init_i, output, a, b}); + + // Construct the loop condition function, + // def cond_fun(loop_carry): + // i, output, a, b = loop_carry + // return i >= 0 if transpose_a else i < m + std::unique_ptr condb = + builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond"); + { + auto i = condb->GetTupleElement( + condb->Parameter(0, tuple_shape, + "TriangularSolveLeftLookingWhileTuple"), + 0); + if (transpose_a) { + condb->Ge(i, condb->ConstantR0(0)); + } else { + condb->Lt(i, condb->ConstantR0(m)); + } + } + TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); + + // Construct the loop body function, + // def body_fun(loop_carry): + // i, output, a, b = loop_carry + // if transpose_a: + // a_row = np.swapaxes(a[..., i+1:, i:i+1], -1 -2) + // else: + // a_row = a[..., i:i+1, :i] + // result_row = b[..., i:i+1, :] - np.matmul(a_row, output[..., :, :]) + // output[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] + // if transpose_a: + // return (i - 1, output, a, b) + // else: + // return (i + 1, output, a, b) + // We have to do some extra FLOPs propagating zeros in the matrix multiply + // because we can't have the size of its arguments depend on the loop counter. + std::unique_ptr bodyb = + builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody"); + { + auto input_tuple = bodyb->Parameter(0, tuple_shape, + "TriangularSolveLeftLookingWhileTuple"); + + // i, output, a, b = loop_carry + auto i = bodyb->GetTupleElement(input_tuple, 0); + auto body_out = bodyb->GetTupleElement(input_tuple, 1); + auto body_a = bodyb->GetTupleElement(input_tuple, 2); + auto body_b = bodyb->GetTupleElement(input_tuple, 3); + auto zero = bodyb->ConstantR0(0); + + // Set up some helper functions. 
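Stripped of the fixed-shape padding trick, the loop body sketched in the comments above is ordinary forward substitution. A plain NumPy version of the non-transposed, lower-triangular case (illustrative names; this models the algorithm, not the XLA While loop):

    import numpy as np

    def solve_left_looking_ref(a, b):
        # Row i of the output depends only on rows < i, as in body_fun above.
        m = b.shape[0]
        out = np.zeros_like(b)
        out[0:1, :] = b[0:1, :] / a[0, 0]
        for i in range(1, m):
            result_row = b[i:i + 1, :] - a[i:i + 1, :i] @ out[:i, :]
            out[i:i + 1, :] = result_row / a[i, i]
        return out

    a = np.tril(np.random.rand(5, 5)) + 2.0 * np.eye(5)   # lower triangular
    b = np.random.rand(5, 2)
    assert np.allclose(a @ solve_left_looking_ref(a, b), b)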
+ auto prepend_zeros = [&](std::array starts) { + auto zero = bodyb->Reshape(bodyb->ConstantR0(0), {1}); + std::vector padded_starts(ndims, zero); + padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1}); + padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1}); + return bodyb->ConcatInDim(padded_starts, 0); + }; + + auto dynamic_slice = [&](xla::ComputationDataHandle x, + std::array starts, + std::array sizes) { + auto padded_starts = prepend_zeros(starts); + auto padded_sizes = prepend_batch_dims(sizes); + return bodyb->DynamicSlice(x, padded_starts, padded_sizes); + }; + + auto update = [&](xla::ComputationDataHandle x, + xla::ComputationDataHandle update, + std::array starts) { + auto padded_starts = prepend_zeros(starts); + return bodyb->DynamicUpdateSlice(x, update, padded_starts); + }; + + // We'd like to implement this: + // if transpose_a: + // a_row = T(a[..., i+1:, i:i+1]) + // result_row = (b[..., i:i+1, :] + // - np.matmul(a_row, body_out[..., i+1:, :])) + // else: + // result_row = (b[..., i:i+1, :] + // - np.matmul(a[..., i:i+1, :i], body_out[..., :i, :])) + // But since we can't have intermediate array sizes depend on the loop + // counter, we instead exploit the fact that we initialized the output to + // all zeros and use that as zero-padding (doing unnecessary FLOPs). + xla::ComputationDataHandle a_row; + if (transpose_a) { + a_row = dynamic_slice(body_a, {zero, i}, {m, 1}); + } else { + a_row = dynamic_slice(body_a, {i, zero}, {1, m}); + } + TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out, + /*transpose_x=*/transpose_a, + /*transpose_y=*/false, + /*conjugate_x=*/conjugate_a, + /*conjugate_y=*/false)); + auto result_row = + bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update); + + // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] + auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1}); + auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt)); + body_out = update(body_out, div_result, {i, zero}); + + // if transpose_a: + // return (i - 1, body_out, a, b) + // else: + // return (i + 1, body_out, a, b) + auto next_i = bodyb->Add(i, bodyb->ConstantR0(transpose_a ? -1 : 1)); + bodyb->Tuple({next_i, body_out, body_a, body_b}); + } + TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); + + // Construct the While loop and return the result, + // return while_loop(cond_fun, body_fun, init)[1] + auto triangular_solve_left_looking_while = builder->While(cond, body, init); + return builder->GetTupleElement(triangular_solve_left_looking_while, 1); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h index 501d026411c..e32223bfddd 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h @@ -21,25 +21,50 @@ limitations under the License. namespace tensorflow { -// Solves systems of linear equations with upper or lower triangular matrices by -// backsubstitution. +// Solves systems of linear equations with lower or upper triangular coefficient +// matrices by forward- or back-substitution. Broadcasting along leading +// dimensions, this routine solves one of the matrix systems +// `op(a) * x = b`, or `x * op(a) = b`, +// for the variable `x` given `a` and `b`, where `op(a)` is either +// `op(a) = a`, or `op(a) = transpose(a)`, or `op(a) = conj(transpose(a))`. 
+// That is, the innermost matrices in the output satisfy a scalar system +// depending on the value of (left_side, transpose_a, conjugate_a) +// according to: +// (F, F, F) => `output[..., i, k] a[..., k, j] = b[..., i, j]`, +// (F, F, T) => `output[..., i, k] a*[..., k, j] = b[..., i, j]`, +// (F, T, F) => `output[..., i, k] a[..., j, k] = b[..., i, j]`, +// (F, T, T) => `output[..., i, k] a*[..., j, k] = b[..., i, j]`, +// (T, F, F) => ` a[..., i, k] output[..., k, j] = b[..., i, j]`, +// (T, F, T) => `a*[..., i, k] output[..., k, j] = b[..., i, j]`, +// (T, T, F) => ` a[..., i, k] output[..., j, k] = b[..., i, j]`, +// (T, T, T) => `a*[..., i, k] output[..., j, k] = b[..., i, j]`, +// where * denotes complex conjugation and where the index `k` is summed over. // -// `a` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form -// square matrices. The strictly upper triangular part of each inner-most matrix -// is assumed to be zero and not accessed. -// `b` is a tensor of shape `[..., M, K]`. -// -// The innermost matrices in the output satisfy matrix equations -// `output[..., i, j] * adjoint(a[..., k, j]) = b[..., i, k]`. +// `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form +// square matrices. If lower is true (false), then the strictly upper (lower) +// triangular part of each innermost matrix in `a` is assumed to be zero and is +// not accessed. +// `b` is a tensor of shape `[..., M, K]` if left_side is true, otherwise a +// tensor of shape `[..., K, M]`. +// `left_side` is a boolean, indicating whether to solve a system of the form +// op(a) * x = b (true) or x * op(a) = b (false). +// `lower` is a boolean, indicating whether the argument `a` is lower-triangular +// (true) or upper-triangular (false). +// `transpose_a` is a boolean indicating whether the matrix `a` is transposed. +// `conjugate_a` is a boolean indicating whether the entries of `a` are complex +// conjugated (independently of whether they are transposed), so that when both +// transpose_a and conjugate_a are true the effect is a Hermitian adjoint. // // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no // blocking is used. -// TODO(phawkins): equivalent to the BLAS TRSM routine with side=right, -// kind=lower, and transposed_a=true. Implement the other possible combinations -// of side, kind and transposed_a. xla::StatusOr<xla::ComputationDataHandle> TriangularSolve( xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, - xla::ComputationDataHandle b, int64 block_size = 256); + xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a, + bool conjugate_a, int64 block_size = 256); + +xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, + const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc index 671d9aa4fe0..66170706291 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc @@ -27,32 +27,68 @@ limitations under the License.
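As a concrete reference for the semantics documented above, the following standalone sketch (not part of this change; plain C++ with no XLA dependencies) performs the scalar forward substitution corresponding to left_side=true, lower=true, transpose_a=false, conjugate_a=false. It is also the recurrence that TriangularSolveLeftLooking unrolls into an XLA While loop. The inputs match the SimpleLeftLowerNotranspose test below, so the printed values should agree with that test's expected output.

#include <cstdio>
#include <vector>

// Solves a * x = b for a lower-triangular m x m matrix a and an m x n
// right-hand side b, row by row (left-looking forward substitution).
std::vector<std::vector<double>> ForwardSubstitution(
    const std::vector<std::vector<double>>& a,
    const std::vector<std::vector<double>>& b) {
  const size_t m = b.size(), n = b[0].size();
  std::vector<std::vector<double>> x(m, std::vector<double>(n, 0.0));
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < n; ++j) {
      double acc = b[i][j];
      // Subtract the contributions of the already-solved rows (k < i).
      for (size_t k = 0; k < i; ++k) acc -= a[i][k] * x[k][j];
      x[i][j] = acc / a[i][i];  // Divide by the diagonal entry.
    }
  }
  return x;
}

int main() {
  // Same values as AValsLower() and BValsLeft() in the tests below.
  const std::vector<std::vector<double>> a = {
      {2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}};
  const std::vector<std::vector<double>> b = {
      {1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
  for (const auto& row : ForwardSubstitution(a, b)) {
    for (double v : row) std::printf("%.8f ", v);
    std::printf("\n");
  }
  // Expected (to within rounding): 0.5 1 1.5 / 0.41666667 0.33333333 0.25 /
  // 0.23148148 0.18518519 0.13888889 / 0.16835017 0.13468013 0.10101010
  return 0;
}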
#include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace tensorflow { namespace { using TriangularSolveTest = xla::ClientLibraryTestBase; +using TriangularSolveLeftLookingTest = xla::ClientLibraryTestBase; +using complex64 = xla::complex64; -XLA_TEST_F(TriangularSolveTest, Simple) { +xla::Array2D AValsLower() { + return {{2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}}; +} + +xla::Array2D AValsUpper() { + return {{2, 3, 4, 5}, {0, 6, 7, 8}, {0, 0, 9, 10}, {0, 0, 0, 11}}; +} + +xla::Array2D BValsRight() { + return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; +} + +xla::Array2D BValsLeft() { + return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}}; +} + +xla::Array2D AValsLowerComplex() { + return {{2, 0, 0, 0}, + {complex64(3, 1), 6, 0, 0}, + {4, complex64(7, 2), 9, 0}, + {5, 8, complex64(10, 3), 11}}; +} + +xla::Array2D AValsUpperComplex() { + return {{2, 3, complex64(4, 3), 5}, + {0, 6, complex64(7, 2), 8}, + {0, 0, complex64(9, 1), 10}, + {0, 0, 0, 11}}; +} + +xla::Array2D BValsRightComplex() { + return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}}; +} + +xla::Array2D BValsLeftComplex() { + return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}}; +} + +xla::Array2D AValsFull() { + return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}}; +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) { xla::ComputationBuilder builder(client_, TestName()); - xla::Array2D a_vals({ - {2, 0, 0, 0}, - {3, 6, 0, 0}, - {4, 7, 9, 0}, - {5, 8, 10, 11}, - }); - xla::Array2D b_vals({ - {1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - }); - xla::ComputationDataHandle a, b; - auto a_data = CreateR2Parameter(a_vals, 0, "a", &builder, &a); - auto b_data = CreateR2Parameter(b_vals, 1, "b", &builder, &b); - auto result = TriangularSolve(&builder, a, b, /*block_size=*/2); + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); TF_ASSERT_OK(result.status()); xla::Array2D expected({ @@ -62,7 +98,267 @@ XLA_TEST_F(TriangularSolveTest, Simple) { }); ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, - xla::ErrorSpec(2e-3, 2e-3)); + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {-0.16414141, -0.06902357, -0.07070707, 0.36363636}, + {0.64393939, 0.06565657, -0.03030303, 0.72727273}, + {1.4520202, 0.2003367, 0.01010101, 1.09090909}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto 
a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {-0.16414141, -0.06902357, -0.07070707, 0.36363636}, + {0.64393939, 0.06565657, -0.03030303, 0.72727273}, + {1.4520202, 0.2003367, 0.01010101, 1.09090909}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/false, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 0.08333334, 0.04629629, 0.03367003}, + {2.5, -0.25, -0.1388889, -0.1010101}, + {4.5, -0.58333331, -0.32407406, -0.23569024}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {-0.89646465, -0.69444444, -0.49242424}, + {-0.27441077, -0.24074074, -0.20707071}, + {-0.23232323, -0.22222222, -0.21212121}, + {0.90909091, 1., 1.09090909}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/true, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + {0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + 
{0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/false, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {-0.89646465, -0.69444444, -0.49242424}, + {-0.27441077, -0.24074074, -0.20707071}, + {-0.23232323, -0.22222222, -0.21212121}, + {0.90909091, 1., 1.09090909}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = + CreateR2Parameter(AValsLowerComplex(), 0, "a", &builder, &a); + auto b_data = + CreateR2Parameter(BValsRightComplex(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/false, /*lower=*/true, + /*transpose_a=*/true, /*conjugate_a=*/true, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, complex64(0.08333333, 0.08333333), + complex64(0.02777778, -0.0462963), complex64(0.06313131, -0.01094276)}, + {2.5, complex64(-0.25, 0.41666667), complex64(-0.23148148, -0.37962963), + complex64(0.08670034, -0.02104377)}, + {4.5, complex64(-0.58333333, 0.75), complex64(-0.49074074, -0.71296296), + complex64(0.11026936, -0.03114478)}, + }); + + ComputeAndCompareR2(&builder, expected, + {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = + CreateR2Parameter(AValsUpperComplex(), 0, "a", &builder, &a); + auto b_data = + CreateR2Parameter(BValsLeftComplex(), 1, "b", &builder, &b); + auto result = TriangularSolve(&builder, a, b, + /*left_side=*/true, /*lower=*/false, + /*transpose_a=*/true, /*conjugate_a=*/false, + /*block_size=*/2); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1., 1.5}, + {0.41666667, 0.33333333, 0.25}, + {complex64(0.20020325, -2.81504065e-01), + complex64(0.13821138, -4.22764228e-01), + complex64(0.07621951, -5.64024390e-01)}, + {complex64(0.19678492, 2.55912786e-01), + complex64(0.17738359, 3.84331116e-01), + complex64(0.15798226, 5.12749446e-01)}, + }); + + ComputeAndCompareR2(&builder, expected, + {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolveLeftLooking(&builder, a, b, + /*transpose_a=*/false, + /*conjugate_a=*/false); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + {0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, 
{a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); +} + +XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) { + xla::ComputationBuilder builder(client_, TestName()); + + xla::ComputationDataHandle a, b; + auto a_data = CreateR2Parameter(AValsFull(), 0, "a", &builder, &a); + auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); + auto result = TriangularSolveLeftLooking(&builder, a, b, + /*transpose_a=*/false, + /*conjugate_a=*/false); + TF_ASSERT_OK(result.status()); + + xla::Array2D expected({ + {0.5, 1.0, 1.5}, + {0.41666667, 0.33333333, 0.25}, + {0.23148148, 0.18518519, 0.13888889}, + {0.16835017, 0.13468013, 0.1010101}, + }); + + ComputeAndCompareR2(&builder, expected, {a_data.get(), b_data.get()}, + xla::ErrorSpec(1e-2, 1e-2)); } } // namespace diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc index ce24b61b5dc..9b7492f8cf6 100644 --- a/tensorflow/compiler/tf2xla/lib/util.cc +++ b/tensorflow/compiler/tf2xla/lib/util.cc @@ -107,4 +107,15 @@ xla::StatusOr UpdateSliceInMinorDims( return UpdateSlice(builder, x, update, padded_start); } +xla::StatusOr TransposeInMinorDims( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x) { + TF_ASSIGN_OR_RETURN(std::unique_ptr shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(*shape); + TF_RET_CHECK(n_dims >= 2); + std::vector permutation(n_dims); + std::iota(permutation.begin(), permutation.end(), 0); + std::swap(permutation[n_dims - 1], permutation[n_dims - 2]); + return builder->Transpose(x, permutation); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h index fb138b4f736..7f93102ee78 100644 --- a/tensorflow/compiler/tf2xla/lib/util.h +++ b/tensorflow/compiler/tf2xla/lib/util.h @@ -49,6 +49,10 @@ xla::StatusOr UpdateSliceInMinorDims( xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, const xla::ComputationDataHandle& update, gtl::ArraySlice start); +// Transposes a stack of matrices `x` by swapping the last two dimensions. 
+xla::StatusOr TransposeInMinorDims( + xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 906f2290433..6051d7dffd7 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -241,9 +241,7 @@ Status CreateXlaArgs(const Graph& graph, XlaCompiler::Argument arg; arg.kind = XlaCompiler::Argument::kParameter; TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type)); - TensorShape shape; - TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &shape)); - TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, &arg.shape)); + TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape)); TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name)); xla_args->push_back(arg); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 69b265436bb..c5b4ec5b15f 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -66,13 +66,14 @@ Status CheckSignature(const DataTypeVector& types, bool XlaCompiler::Argument::operator==( const XlaCompiler::Argument& other) const { - if (std::tie(kind, resource_kind, type, name, tensor_array_size, + if (std::tie(kind, resource_kind, type, name, initialized, tensor_array_size, tensor_array_gradients) != std::tie(other.kind, other.resource_kind, other.type, other.name, - other.tensor_array_size, other.tensor_array_gradients)) { + other.initialized, other.tensor_array_size, + other.tensor_array_gradients)) { return false; } - if (!xla::ShapeUtil::Equal(shape, other.shape)) { + if (shape != other.shape) { return false; } if (constant_value.shape() != other.constant_value.shape()) { @@ -230,6 +231,64 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, return Status::OK(); } +// Computes the XLA shape for argument 'arg'. 
+/*static*/ Status XlaCompiler::XLAShapeForArgument( + const XlaCompiler::Argument& arg, xla::Shape* xla_shape) { + switch (arg.kind) { + case XlaCompiler::Argument::kConstant: + return TensorShapeToXLAShape(arg.type, arg.constant_value.shape(), + xla_shape); + case XlaCompiler::Argument::kParameter: + return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape); + case XlaCompiler::Argument::kResource: { + TF_RET_CHECK(arg.initialized); + + switch (arg.resource_kind) { + case XlaResource::kVariable: + return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape); + case XlaResource::kTensorArray: { + if (arg.tensor_array_size < 0) { + return errors::InvalidArgument( + "Negative tensor_array_size in XLAShapeForArgument"); + } + TensorShape shape; + shape.AddDim(arg.tensor_array_size); + shape.AppendShape(arg.shape); + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape)); + + if (!arg.tensor_array_gradients.empty()) { + std::vector tuple_shape( + arg.tensor_array_gradients.size() + 1, *xla_shape); + *xla_shape = xla::ShapeUtil::MakeTupleShape(tuple_shape); + } + return Status::OK(); + } + case XlaResource::kStack: { + if (arg.tensor_array_size < 0) { + return errors::InvalidArgument( + "Negative tensor_array_size in XLAShapeForArgument"); + } + TensorShape shape; + shape.AddDim(arg.tensor_array_size); + shape.AppendShape(arg.shape); + xla::Shape buffer_shape; + TF_RETURN_IF_ERROR( + TensorShapeToXLAShape(arg.type, shape, &buffer_shape)); + *xla_shape = xla::ShapeUtil::MakeTupleShape( + {buffer_shape, xla::ShapeUtil::MakeShape(xla::S32, {})}); + return Status::OK(); + } + + case XlaResource::kInvalid: + return errors::Internal( + "Invalid resource type in XLAShapeForArgument()"); + } + } + case XlaCompiler::Argument::kInvalid: + return errors::Internal("Invalid argument type in XLAShapeForArgument()"); + } +} + namespace { Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, @@ -275,8 +334,9 @@ Status BuildArguments(const Graph& graph, // Argument numbers of arguments and resources that are to be passed to the // XLA computation as runtime parameters. - std::vector parameters, resources; - parameters.reserve(args.size()); + input_mapping->clear(); + input_mapping->reserve(args.size()); + std::vector resources; resources.reserve(args.size()); // Fills in constant arguments, and computes non-constant argument order. @@ -290,18 +350,20 @@ Status BuildArguments(const Graph& graph, // TODO(phawkins): this code assumes that resource arguments do not // alias. XlaResource* resource; - TF_RETURN_IF_ERROR( - context->CreateResource(arg.resource_kind, i, arg.name, arg.type, - xla::ComputationDataHandle(), &resource)); - resource->set_tensor_array_size(arg.tensor_array_size); + TF_RETURN_IF_ERROR(context->CreateResource( + arg.resource_kind, i, arg.name, arg.type, arg.shape, + xla::ComputationDataHandle(), + /*tensor_array_size=*/arg.tensor_array_size, + /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource)); arg_expression.set_resource(resource); if (arg.initialized) { resources.push_back(i); } break; - case XlaCompiler::Argument::kParameter: - parameters.push_back(i); + case XlaCompiler::Argument::kParameter: { + input_mapping->push_back(i); break; + } case XlaCompiler::Argument::kConstant: arg_expression.set_constant_value(arg.constant_value); break; @@ -312,19 +374,17 @@ Status BuildArguments(const Graph& graph, // Append parameters containing variable values after the other runtime // parameters. 
- parameters.insert(parameters.end(), resources.begin(), resources.end()); - if (parameters.empty()) { + input_mapping->insert(input_mapping->end(), resources.begin(), + resources.end()); + if (input_mapping->empty()) { return Status::OK(); } - std::vector arg_shapes; - arg_shapes.reserve(parameters.size()); - input_mapping->resize(parameters.size()); - for (std::vector::size_type i = 0; i < parameters.size(); ++i) { - const XlaCompiler::Argument& arg = args[parameters[i]]; + std::vector arg_shapes(input_mapping->size()); + for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { // Computes the shapes of non-constant arguments. - arg_shapes.push_back(arg.shape); - (*input_mapping)[i] = parameters[i]; + TF_RETURN_IF_ERROR(XlaCompiler::XLAShapeForArgument( + args[(*input_mapping)[i]], &arg_shapes[i])); } if (use_tuple_arg) { @@ -354,13 +414,13 @@ Status BuildArguments(const Graph& graph, } // Build parameter handles for non-constant arguments. - std::vector arg_handles(parameters.size()); + std::vector arg_handles(input_mapping->size()); if (use_tuple_arg) { xla::ComputationDataHandle tuple; if (is_entry_computation) { xla::OpSharding tuple_sharding; tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE); - for (int64 parameter : parameters) { + for (int64 parameter : *input_mapping) { const int core = (*arg_cores)[parameter]; const int root_device = 0; *tuple_sharding.add_tuple_shardings() = @@ -373,16 +433,16 @@ Status BuildArguments(const Graph& graph, } else { tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple"); } - for (std::vector::size_type i = 0; i < parameters.size(); ++i) { - const int core = (*arg_cores)[parameters[i]]; + for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { + const int core = (*arg_cores)[input_mapping->at(i)]; xla::ScopedShardingAssignment assign_sharding( builder, core == -1 ? tensorflow::gtl::optional() : xla::sharding_builder::AssignDevice(core)); arg_handles[i] = builder->GetTupleElement(tuple, i); } } else { - for (std::vector::size_type i = 0; i < parameters.size(); ++i) { - const int core = (*arg_cores)[parameters[i]]; + for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { + const int core = (*arg_cores)[input_mapping->at(i)]; xla::ScopedShardingAssignment assign_sharding( builder, core == -1 ? tensorflow::gtl::optional() : xla::sharding_builder::AssignDevice(core)); @@ -393,19 +453,18 @@ Status BuildArguments(const Graph& graph, // Fill in the handles in non-constant arguments. 
VLOG(2) << "XLA computation inputs:"; - for (std::vector::size_type i = 0; i < parameters.size(); ++i) { - const XlaCompiler::Argument& arg = args[parameters[i]]; + for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { + const XlaCompiler::Argument& arg = args[input_mapping->at(i)]; VLOG(2) << " XLA arg " << i << " shape: " << xla::ShapeUtil::HumanString(arg_shapes[i]) - << " name: " << arg.name << " TF arg " << parameters[i]; - XlaExpression& arg_expression = (*arg_expressions)[parameters[i]]; + << " name: " << arg.name << " TF arg " << input_mapping->at(i); + XlaExpression& arg_expression = (*arg_expressions)[input_mapping->at(i)]; switch (arg.kind) { case XlaCompiler::Argument::kResource: { TF_RET_CHECK(arg.initialized); XlaResource* resource = arg_expression.resource(); - TF_RETURN_IF_ERROR( - resource->SetFromPack(arg.tensor_array_gradients, arg_handles[i], - /*reset_initial_values=*/true, builder)); + TF_RETURN_IF_ERROR(resource->SetFromPack(arg.tensor_array_gradients, + arg_handles[i], builder)); VLOG(2) << " resource: num_gradients: " << arg.tensor_array_gradients.size(); break; @@ -486,6 +545,7 @@ Status BuildComputation( XlaCompiler::ResourceUpdate& update = resource_updates->back(); update.input_index = resource->arg_num(); update.type = resource->type(); + update.shape = resource->shape(); update.modified = modified; for (const auto& grad : resource->tensor_array_gradients()) { update.tensor_array_gradients_accessed.insert(grad.first); @@ -616,13 +676,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, ++computation_output; } } - - for (std::vector::size_type i = 0; - i < result->resource_updates.size(); ++i) { - result->resource_updates[i].shape = xla::ShapeUtil::GetTupleElementShape( - result->xla_output_shape, computation_output); - ++computation_output; - } return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 6a46e54f61c..b86c82c0ab5 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -104,9 +104,17 @@ class XlaCompiler { // is the type of the variable's value, not DT_RESOURCE. DataType type; - // The shape of the argument. If the argument is a resource, this is the - // shape of the resource's value. - xla::Shape shape; + // The shape of the argument. For: + // * a parameter: the shape of the parameter. + // * a constant: ignored; the shape given by constant_value is used + // instead. + // * an uninitialized resource: ignored. We don't yet know the shape of an + // uninitialized resource (otherwise we would have initialized it!) + // * an initialized variable: the shape of the variable's value. + // * an initialized TensorArray or Stack resource: the shape of an entry in + // the TensorArray/Stack. Note this is the size of a single entry, not the + // XLA data structure that represents the complete stack/array. + TensorShape shape; // The value of the argument, if it is a compile-time constant. Must be a // host-memory tensor. @@ -175,8 +183,9 @@ class XlaCompiler { int input_index; // Type and shape of the tensor to be written back. + // The `shape` field has the same meaning as the Argument::shape field. DataType type; - xla::Shape shape; + TensorShape shape; // Was the value of the variable modified by the computation? // (Always true, unless `return_updated_values_for_all_resources` is true.) 
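To illustrate the shape convention documented above (hypothetical sizes; plain C++ rather than the TensorShape/ShapeUtil calls used by XLAShapeForArgument earlier in this change): for an initialized TensorArray argument, `shape` is the per-entry shape, the buffer dimensions are `[tensor_array_size]` followed by that entry shape, and carrying gradients turns the parameter into a (1 + num_gradients)-element tuple of such buffers.

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical argument: a TensorArray of 2 entries, each of shape [3, 4],
  // with one gradient ("grad1") attached.
  const long long tensor_array_size = 2;
  const std::vector<long long> entry_shape = {3, 4};  // Argument::shape
  const int num_gradients = 1;  // Argument::tensor_array_gradients.size()

  // Buffer shape: [tensor_array_size] followed by the entry shape.
  std::vector<long long> buffer_dims = {tensor_array_size};
  buffer_dims.insert(buffer_dims.end(), entry_shape.begin(), entry_shape.end());

  std::printf("buffer dims:");
  for (long long d : buffer_dims) std::printf(" %lld", d);  // 2 3 4
  // With gradients present, the XLA parameter is a tuple of identical buffers.
  std::printf("\ntuple arity: %d\n", 1 + num_gradients);  // 2
  return 0;
}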
@@ -235,6 +244,19 @@ class XlaCompiler { // device is created, and can be used to create metadata objects // that can be accessed by XLA op kernels. std::function* populate_resource_manager = nullptr; + + // If not nullptr, this memory allocator can be used by the compiler for + // temporary allocations it might want to make during compilation. + // + // For example, the compiler may want to try out different algorithms and + // choose the fastest one, and it might run those algorithms over buffers + // created using this allocator. + // + // The compiler can function correctly without an explicit allocator given + // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly + // allocate most or all available memory on the device, leaving none for the + // compiler to access, unless it can use TensorFlow's allocator. + xla::DeviceMemoryAllocator* device_allocator = nullptr; }; explicit XlaCompiler(Options options); @@ -253,11 +275,10 @@ class XlaCompiler { const std::vector& args, CompilationResult* result); - Status PrepareArguments(xla::ComputationBuilder* builder, NameAttrList func, - const std::vector& types, - const std::vector& shapes, - const std::vector& expressions, - std::vector* args); + // Returns the shape of the XLA parameter for an argument 'arg'. + // See the class comment for more details about the argument passing + // convention. + static Status XLAShapeForArgument(const Argument& arg, xla::Shape* xla_shape); // Retrieves the channel handle associated with `key`. Allocates // a new channel handle if none exists. diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 7ebe4b75bc1..65de4dbad75 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -191,10 +191,10 @@ TEST_F(XlaCompilerTest, Simple) { std::vector args(2); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); args[1].kind = XlaCompiler::Argument::kParameter; args[1].type = DT_INT32; - args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[1].shape = TensorShape({2}); // Compiles the graph. XlaCompiler compiler(DefaultOptions()); @@ -242,10 +242,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) { std::vector args(2); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); args[1].kind = XlaCompiler::Argument::kParameter; args[1].type = DT_INT32; - args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[1].shape = TensorShape({2}); // Compiles the graph. 
XlaCompiler compiler(DefaultOptions()); @@ -281,7 +281,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { std::vector args(1); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); XlaCompiler::Options options = DefaultOptions(); XlaCompiler compiler(options); @@ -373,7 +373,7 @@ TEST_F(XlaCompilerTest, ResourceManager) { std::vector args(1); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); DummyResourceForTest* resource = new DummyResourceForTest(); @@ -420,7 +420,7 @@ TEST_F(XlaCompilerTest, DeterministicCompilation) { std::vector args(1); args[0].kind = XlaCompiler::Argument::kParameter; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2}); + args[0].shape = TensorShape({2}); // Compiles the graph. auto options = DefaultOptions(); @@ -472,9 +472,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { args[0].resource_kind = XlaResource::kTensorArray; args[0].initialized = true; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::S32, {2}), - xla::ShapeUtil::MakeShape(xla::S32, {2})}); + args[0].shape = TensorShape({}); args[0].tensor_array_size = 2; args[0].tensor_array_gradients = {"grad2"}; @@ -540,9 +538,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) { args[0].resource_kind = XlaResource::kTensorArray; args[0].initialized = true; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::S32, {2}), - xla::ShapeUtil::MakeShape(xla::S32, {2})}); + args[0].shape = TensorShape({}); args[0].tensor_array_size = 2; args[0].tensor_array_gradients = {"grad1"}; @@ -574,9 +570,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) { args[0].resource_kind = XlaResource::kTensorArray; args[0].initialized = true; args[0].type = DT_INT32; - args[0].shape = xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::S32, {2}), - xla::ShapeUtil::MakeShape(xla::S32, {2})}); + args[0].shape = TensorShape({}); args[0].tensor_array_size = 2; args[0].tensor_array_gradients = {"grad1"}; diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index e8d17e2e0a1..73878955e3f 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -103,12 +103,14 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype, xla::ComputationBuilder* XlaContext::builder() { return builder_; } -Status XlaContext::CreateResource(XlaResource::Kind kind, int arg_num, - string name, DataType type, - const xla::ComputationDataHandle& handle, - XlaResource** resource) { +Status XlaContext::CreateResource( + XlaResource::Kind kind, int arg_num, string name, DataType type, + TensorShape shape, const xla::ComputationDataHandle& handle, + int64 tensor_array_size, const std::set& tensor_array_gradients, + XlaResource** resource) { resources_.emplace_back( - new XlaResource(kind, arg_num, std::move(name), type, handle)); + new XlaResource(kind, arg_num, std::move(name), type, std::move(shape), + handle, tensor_array_size, tensor_array_gradients)); *resource = resources_.back().get(); return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/xla_context.h 
b/tensorflow/compiler/tf2xla/xla_context.h index 1a7dafe8cdb..fac0352ae81 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -71,11 +71,15 @@ class XlaContext : public ResourceBase { Status AddConstRetval(int retval_index, DataType dtype, const xla::Literal& literal); - // Creates a resource with resource `kind` and initial type `type` and - // value `handle`. `name` is a descriptive name for use in error messages. + // Creates a resource with resource `kind` and initial value `handle`. `name` + // is a descriptive name for use in error messages. See the `XlaResource` + // constructor for a description of the remaining arguments. // Fails if the resource already exists. Status CreateResource(XlaResource::Kind kind, int arg_num, string name, - DataType type, const xla::ComputationDataHandle& handle, + DataType type, TensorShape shape, + const xla::ComputationDataHandle& handle, + int64 tensor_array_size, + const std::set& tensor_array_gradients, XlaResource** resource); const std::vector>& resources() { diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index ee0aed672e1..ee29158646f 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -286,7 +286,8 @@ Status XlaOpKernelContext::ConstantInputList( } Status XlaOpKernelContext::ReadVariableInput( - int index, xla::ComputationDataHandle* value) { + int index, DataType type, TensorShape* shape, + xla::ComputationDataHandle* value) { const Tensor& tensor = context_->input(index); const XlaExpression* expression = CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); @@ -296,7 +297,15 @@ Status XlaOpKernelContext::ReadVariableInput( return errors::InvalidArgument("Read of uninitialized variable ", variable->name()); } + if (variable->type() != type) { + return errors::InvalidArgument( + "Type mismatch for read of variable ", variable->name(), ". 
Expected ", + DataTypeString(type), "; got ", DataTypeString(variable->type())); + } *value = variable->value(); + if (shape) { + *shape = variable->shape(); + } return Status::OK(); } @@ -312,12 +321,7 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, variable->name()); } *type = variable->type(); - auto shape_or_status = builder()->GetShape(variable->value()); - if (!shape_or_status.ok()) { - return shape_or_status.status(); - } - TF_RETURN_IF_ERROR( - XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), shape)); + *shape = variable->shape(); return Status::OK(); } @@ -405,7 +409,17 @@ Status XlaOpKernelContext::AssignVariable( XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); TF_RET_CHECK(variable->kind() == XlaResource::kVariable); - return variable->SetValue(type, handle); + + auto shape_or_status = builder()->GetShape(handle); + if (!shape_or_status.ok()) { + return shape_or_status.status(); + } + TensorShape shape; + TF_RETURN_IF_ERROR( + XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape)); + + TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape)); + return variable->SetValue(handle); } XlaCompiler* XlaOpKernelContext::compiler() const { diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 6d3b6db2289..e1fd0f55c6d 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -164,11 +164,16 @@ class XlaOpKernelContext { TensorShape* shape) const; // Reads the current value of the resouce variable referred to by input - // 'index'. - Status ReadVariableInput(int index, xla::ComputationDataHandle* value); + // 'index'. If `shape` is not nullptr, sets `*shape` to the shape of the + // variable. Returns an error if the variable has not been initialized, or if + // its type does not match `type`. + Status ReadVariableInput(int index, DataType type, TensorShape* shape, + xla::ComputationDataHandle* value); // Assigns the value `handle` to the variable referenced by input - // `input_index`. Marks the operator as having side effects. + // `input_index`. The variable must be of `type`. Returns an error if the + // variable has been initialized with a different type or with a + // different shape. Status AssignVariable(int input_index, DataType type, const xla::ComputationDataHandle& handle); diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index 9abac8bdaa7..c2075b44b82 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -25,51 +25,99 @@ limitations under the License. 
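The updated variable accessors above change how kernels read and assign resource variables: ReadVariableInput now takes the expected element type (and an optional shape out-parameter), and AssignVariable derives the shape from the assigned handle before calling SetTypeAndShape. A hypothetical kernel body using them might look like the sketch below (illustration only; the surrounding op registration and includes are omitted, and these names are not part of this change).

// Inside some XlaOpKernel::Compile(XlaOpKernelContext* ctx):
// read the variable at input 0, add input 1 to it, and write it back.
void CompileSketch(XlaOpKernelContext* ctx) {
  TensorShape var_shape;
  xla::ComputationDataHandle value;
  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(/*index=*/0, DT_FLOAT,
                                             &var_shape, &value));
  xla::ComputationDataHandle updated =
      ctx->builder()->Add(value, ctx->Input(1));
  OP_REQUIRES_OK(ctx,
                 ctx->AssignVariable(/*input_index=*/0, DT_FLOAT, updated));
}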
namespace tensorflow { -XlaResource::XlaResource(Kind kind, int arg_num, string name, - DataType initial_type, - const xla::ComputationDataHandle& initial_value) +XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, + TensorShape shape, + const xla::ComputationDataHandle& initial_value, + int64 tensor_array_size, + const std::set& tensor_array_gradients) : kind_(kind), arg_num_(arg_num), name_(std::move(name)), - type_(initial_type), + type_(type), + shape_(std::move(shape)), value_(initial_value), - initial_value_(initial_value) { + initial_value_(initial_value), + tensor_array_size_(tensor_array_size) { CHECK(kind_ != kInvalid); + + for (const string& gradient : tensor_array_gradients) { + tensor_array_gradients_[gradient].reset( + new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1, + /*name=*/strings::StrCat("TensorArrayGrad: ", name_), + type_, shape_, xla::ComputationDataHandle(), + tensor_array_size_, /*tensor_array_gradients=*/{})); + } } -Status XlaResource::SetValue(DataType type, - const xla::ComputationDataHandle& value) { - if (type_ == DT_INVALID && type == DT_INVALID) { - return errors::InvalidArgument("Attempted to initialized resource ", name_, - " to an invalid type"); +Status XlaResource::SetTypeAndShape(DataType type, const TensorShape& shape) { + if (type == DT_INVALID) { + return errors::InvalidArgument("Attempted to set type of resource '", name_, + "'' to an invalid type"); } - if (type_ != DT_INVALID && type_ != type) { + if (initialized() && type_ != type) { return errors::InvalidArgument("Type of resource ", name_, " cannot be changed after initialization: " "old type was ", DataTypeString(type_), ", new type is ", DataTypeString(type)); } + if (initialized() && shape_ != shape) { + return errors::InvalidArgument("Shape of resource ", name_, + " cannot be changed after initialization: " + "old shape was ", + shape_.DebugString(), ", new shape is ", + shape.DebugString()); + } type_ = type; + shape_ = shape; + return Status::OK(); +} + +Status XlaResource::SetValue(const xla::ComputationDataHandle& value) { + if (type_ == DT_INVALID) { + return errors::InvalidArgument( + "Resource '", name_, + "' must be initialized with a valid type before use."); + } value_ = value; return Status::OK(); } -Status XlaResource::GetXlaShape(xla::ComputationBuilder* builder, - xla::Shape* shape) const { - auto shape_or_status = builder->GetShape(value_); - if (!shape_or_status.ok()) { - return shape_or_status.status(); +Status XlaResource::SetZeroValue(xla::ComputationBuilder* builder) { + if (type_ == DT_INVALID) { + return errors::InvalidArgument( + "Resource '", name_, + "' must be initialized with a valid type before use."); } - *shape = *shape_or_status.ValueOrDie(); - return Status::OK(); -} + switch (kind_) { + case kVariable: { + value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_), + shape_.dim_sizes()); + break; + } + case kTensorArray: { + TensorShape ta_shape; + ta_shape.AddDim(tensor_array_size_); + ta_shape.AppendShape(shape_); + value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_), + ta_shape.dim_sizes()); + break; + } + case kStack: { + TensorShape ta_shape; + ta_shape.AddDim(tensor_array_size_); + ta_shape.AppendShape(shape_); + value_ = + builder->Tuple({builder->Broadcast(XlaHelpers::Zero(builder, type_), + ta_shape.dim_sizes()), + builder->ConstantR0(0)}); + break; + } -Status XlaResource::GetShape(xla::ComputationBuilder* builder, - TensorShape* shape) const { - xla::Shape xla_shape; - 
TF_RETURN_IF_ERROR(GetXlaShape(builder, &xla_shape)); - TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, shape)); + case kInvalid: + default: + LOG(FATAL) << "Invalid resource type"; + } return Status::OK(); } @@ -82,36 +130,20 @@ Status XlaResource::GetOrCreateTensorArrayGradient( std::unique_ptr& gradient = tensor_array_gradients_[source]; if (!gradient) { TensorShape ta_shape; - TF_RETURN_IF_ERROR(GetShape(builder, &ta_shape)); + ta_shape.AddDim(tensor_array_size_); + ta_shape.AppendShape(shape_); xla::ComputationDataHandle gradient_value = builder->Broadcast( XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes()); gradient.reset( new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1, /*name=*/strings::StrCat("TensorArrayGrad: ", name_), - type_, gradient_value)); - gradient->tensor_array_size_ = tensor_array_size_; + type_, shape_, gradient_value, tensor_array_size_, + /*tensor_array_gradients=*/{})); } *gradient_out = gradient.get(); return Status::OK(); } -Status XlaResource::PackedShape(xla::ComputationBuilder* builder, - xla::Shape* packed_shape) const { - if (tensor_array_gradients_.empty()) { - return GetXlaShape(builder, packed_shape); - } - TF_RET_CHECK(kind_ == kTensorArray); - std::vector elem_shapes(1 + tensor_array_gradients_.size()); - int pos = 0; - TF_RETURN_IF_ERROR(GetXlaShape(builder, &elem_shapes[pos++])); - for (const auto& gradient : tensor_array_gradients_) { - TF_RETURN_IF_ERROR( - gradient.second->GetXlaShape(builder, &elem_shapes[pos++])); - } - *packed_shape = xla::ShapeUtil::MakeTupleShape(elem_shapes); - return Status::OK(); -} - Status XlaResource::Pack(xla::ComputationDataHandle* pack, xla::ComputationBuilder* builder) const { if (tensor_array_gradients_.empty()) { @@ -130,27 +162,32 @@ Status XlaResource::Pack(xla::ComputationDataHandle* pack, Status XlaResource::SetFromPack(const std::set& gradient_sources, const xla::ComputationDataHandle& pack, - bool reset_initial_values, xla::ComputationBuilder* builder) { if (gradient_sources.empty()) { + if (!initialized()) { + initial_value_ = pack; + } value_ = pack; } else { TF_RET_CHECK(kind_ == kTensorArray); int pos = 0; - value_ = builder->GetTupleElement(pack, pos++); + auto v = builder->GetTupleElement(pack, pos++); + if (!initialized()) { + initial_value_ = v; + } + value_ = v; + for (const auto& source : gradient_sources) { XlaResource* gradient; TF_RETURN_IF_ERROR( GetOrCreateTensorArrayGradient(source, builder, &gradient)); - gradient->value_ = builder->GetTupleElement(pack, pos++); - if (reset_initial_values) { - gradient->initial_value_ = gradient->value_; + auto v = builder->GetTupleElement(pack, pos++); + if (!gradient->initialized()) { + gradient->initial_value_ = v; } + gradient->value_ = v; } } - if (reset_initial_values) { - initial_value_ = value_; - } return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h index 6b46089e4f5..1bb2c7274ec 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.h +++ b/tensorflow/compiler/tf2xla/xla_resource.h @@ -36,8 +36,11 @@ class XlaResource { kStack, }; - XlaResource(Kind kind, int arg_num, string name, DataType initial_type, - const xla::ComputationDataHandle& initial_value); + XlaResource(Kind kind, int arg_num, string name, DataType type, + TensorShape shape, + const xla::ComputationDataHandle& initial_value, + int64 tensor_array_size, + const std::set& tensor_array_gradients); XlaResource(const XlaResource&) = delete; XlaResource(XlaResource&&) = delete; @@ -60,6 +63,12 @@ class 
XlaResource { // a resource is first initialized we do not yet know its type, so we keep // track of its type dynamically. DataType type() const { return type_; } + + // Shape of the resource. For an uninitialized resource, this is ignored. + // For a Variable, this is the shape of the value. For a TensorArray or Stack + // this is the shape of each entry in the TensorArray/Stack. + const TensorShape& shape() const { return shape_; } + const xla::ComputationDataHandle& value() const { return value_; } // Value of the resource at computation entry. Used to detect which @@ -68,17 +77,19 @@ class XlaResource { return initial_value_; } + // A variable is initialized if it has a value. bool initialized() const { return value_.handle() > 0; } - // Sets the current type/value of the resource. - Status SetValue(DataType type, const xla::ComputationDataHandle& value); + // Sets the type and shape of the resource. The type and shape of a resource + // must not change once the variable has been initialized. + Status SetTypeAndShape(DataType type, const TensorShape& shape); - // Returns the shape of the resource as an xla::Shape. - Status GetXlaShape(xla::ComputationBuilder* builder, xla::Shape* shape) const; + // Sets the current value of the resource. Returns an error if the type is not + // set to a valid value. + Status SetValue(const xla::ComputationDataHandle& value); - // Returns the shape of the resource as an TensorShape. Fails if the shape is - // not representable as a TensorShape. - Status GetShape(xla::ComputationBuilder* builder, TensorShape* shape) const; + // Sets the current value of the resource to an all-zero value. + Status SetZeroValue(xla::ComputationBuilder* builder); // Looks up the gradient for `source`, or creates it if it does not already // exist. The call target must be an initialized TensorArray resource. A @@ -96,10 +107,6 @@ class XlaResource { Status Pack(xla::ComputationDataHandle* pack, xla::ComputationBuilder* builder) const; - // Returns the shape of the `pack` value computed by `Pack()`. - Status PackedShape(xla::ComputationBuilder* builder, - xla::Shape* packed_shape) const; - // Updates the resource with values from `pack`. If `gradient_sources` is // non-empty, treats `pack` as a tuple that represents a TensorArray and // its gradients, and unpacks and updates the gradient resources. @@ -108,14 +115,14 @@ class XlaResource { // Opposite of Pack(). Status SetFromPack(const std::set& gradient_sources, const xla::ComputationDataHandle& pack, - bool reset_initial_values, xla::ComputationBuilder* builder); - // TensorArray-specific fields + // TensorArray and Stack specific fields // 'tensor_array_size' stores the expected size of the TensorArray or Stack. // We need to store this since sometimes TensorArrays must be initialized // lazily since we do not know the element shape at construction time. + // Used by both TensorArrays and Stacks. 
int64 tensor_array_size() const { return tensor_array_size_; } void set_tensor_array_size(int64 size) { tensor_array_size_ = size; } @@ -136,6 +143,7 @@ class XlaResource { const string name_; DataType type_; + TensorShape shape_; xla::ComputationDataHandle value_; xla::ComputationDataHandle initial_value_; diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index c22fd37129c..34e733bc8d8 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -88,7 +88,6 @@ cc_library( visibility = [":friends"], deps = [ "//tensorflow/core:framework_lite", - "//tensorflow/core:lib", "//third_party/eigen3", ], ) diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 952109dde2d..02356699a25 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -80,6 +80,18 @@ cc_library( ], ) +cc_library( + name = "executable_build_options", + srcs = ["executable_build_options.cc"], + hdrs = ["executable_build_options.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:device_memory_allocator", + "//tensorflow/core:lib", + ], +) + cc_library( name = "local_client", srcs = ["local_client.cc"], @@ -87,6 +99,7 @@ cc_library( deps = [ ":client", ":computation", + ":executable_build_options", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc new file mode 100644 index 00000000000..804e34f5e75 --- /dev/null +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -0,0 +1,79 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/executable_build_options.h" + +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/lib/strings/stringprintf.h" + +namespace xla { + +ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator( + DeviceMemoryAllocator* allocator) { + device_allocator_ = allocator; + return *this; +} + +DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const { + return device_allocator_; +} + +ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal( + int device_ordinal) { + CHECK_GE(device_ordinal, 0); + device_ordinal_ = device_ordinal; + return *this; +} + +int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; } + +ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout( + const Shape& shape_with_layout) { + result_layout_set_ = true; + result_layout_ = shape_with_layout; + return *this; +} + +const Shape* ExecutableBuildOptions::result_layout() const { + return result_layout_set_ ? 
&result_layout_ : nullptr; +} + +string ExecutableBuildOptions::ToString() const { + string result_layout = "nullopt"; + if (result_layout_set_) { + result_layout = ShapeUtil::HumanStringWithLayout(result_layout_); + } + string generate_hlo_graph = "nullopt"; + if (generate_hlo_graph_.has_value()) { + generate_hlo_graph = generate_hlo_graph_.value(); + } + return tensorflow::strings::Printf( + "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, " + "generate_hlo_graph=%s}", + device_ordinal_, result_layout.c_str(), generate_hlo_graph.c_str()); +} + +ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph( + string regex) { + generate_hlo_graph_ = std::move(regex); + return *this; +} + +const tensorflow::gtl::optional& +ExecutableBuildOptions::generate_hlo_graph() const { + return generate_hlo_graph_; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h new file mode 100644 index 00000000000..3a52dbac9ad --- /dev/null +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -0,0 +1,74 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ + +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/gtl/optional.h" + +namespace xla { + +// Class containing options for building an LocalExecutable with +// LocalClient::Compile. +class ExecutableBuildOptions { + public: + // If set, this is the device to build the computation for. Valid + // device_ordinal values are: 0 to # of devices - 1. These values are + // identical to the device ordinal values used by StreamExecutor. The built + // executable will be executable on any device equivalent to the specified + // device as determined by Backend::devices_equivalent(). A value of -1 + // indicates this option has not been set. + ExecutableBuildOptions& set_device_ordinal(int device_ordinal); + int device_ordinal() const; + + // If set, this specifies the layout of the result of the computation. If not + // set, the service will chose the layout of the result. A Shape is used to + // store the layout to accommodate tuple result shapes. A value of nullptr + // indicates the option has not been set. + ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout); + const Shape* result_layout() const; + + // If set, this specifies an allocator that can be used to allocate temporary + // space on the device during compilation. For example, the compiler might + // want to run various algorithms on the device and pick the fastest one -- it + // might allocate buffers for use by these algorithms using this allocator. 
+ // + // This does not need to be the same as the DeviceMemoryAllocator passed when + // running the executable. + ExecutableBuildOptions& set_device_allocator( + DeviceMemoryAllocator* allocator); + DeviceMemoryAllocator* device_allocator() const; + + // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions). + ExecutableBuildOptions& set_generate_hlo_graph(string regex); + const tensorflow::gtl::optional& generate_hlo_graph() const; + + // Returns a string representation of the build options, suitable for + // debugging. + string ToString() const; + + private: + int device_ordinal_ = -1; + Shape result_layout_; + bool result_layout_set_ = false; + tensorflow::gtl::optional generate_hlo_graph_; + DeviceMemoryAllocator* device_allocator_ = nullptr; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index fbeedfcecdd..ef98dbb6403 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -30,25 +30,6 @@ using xla::source_map_util::InvalidParameterArgument; namespace xla { -ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal( - int device_ordinal) { - device_ordinal_ = device_ordinal; - return *this; -} - -int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; } - -ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout( - const Shape& shape_with_layout) { - result_layout_set_ = true; - result_layout_ = shape_with_layout; - return *this; -} - -const Shape* ExecutableBuildOptions::result_layout() const { - return result_layout_set_ ? &result_layout_ : nullptr; -} - namespace { StatusOr BorrowStreamForDevice(int device_ordinal, Backend* backend) { @@ -60,16 +41,18 @@ StatusOr BorrowStreamForDevice(int device_ordinal, } // namespace LocalExecutable::LocalExecutable(std::unique_ptr executable, - Backend* backend, int device_ordinal, - const ExecutableBuildOptions& build_options) + Backend* backend, + ExecutableBuildOptions build_options) : executable_(std::move(executable)), backend_(backend), - build_device_ordinal_(device_ordinal), - build_options_(build_options) {} + build_options_(std::move(build_options)) { + CHECK_GE(build_options_.device_ordinal(), 0) + << "Must have a valid device ordinal that the executable was built for."; +} tensorflow::Status LocalExecutable::ValidateExecutionOptions( const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options, const Backend& backend) { + const ExecutableRunOptions& run_options, const Backend& backend) { const ComputationLayout& computation_layout = executable_->module_config().entry_computation_layout(); @@ -93,14 +76,14 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( } } - if (options.stream() != nullptr) { - if (!options.stream()->ok()) { + if (run_options.stream() != nullptr) { + if (!run_options.stream()->ok()) { return InvalidArgument("stream is uninitialized or in an error state"); } // Check stream matches service platform. const se::Platform* stream_platform = - options.stream()->parent()->platform(); + run_options.stream()->parent()->platform(); if (stream_platform != backend_->platform()) { return InvalidArgument( "stream is for platform %s, but service targets platform %s", @@ -110,7 +93,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( // Cannot specify device_ordinal with a stream. 
The stream determines these // values. - if (options.device_ordinal() != -1) { + if (run_options.device_ordinal() != -1) { return InvalidArgument( "cannot set both device ordinal and stream options in " "ExecutableRunOptions; the stream determines the device ordinal"); @@ -119,34 +102,34 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( // Verify that the device the executable was built for is equivalent to the // device it will run on. - int run_device_ordinal = options.device_ordinal() == -1 + int run_device_ordinal = run_options.device_ordinal() == -1 ? backend_->default_device_ordinal() - : options.device_ordinal(); - TF_ASSIGN_OR_RETURN( - bool devices_equivalent, - backend_->devices_equivalent(run_device_ordinal, build_device_ordinal_)); + : run_options.device_ordinal(); + TF_ASSIGN_OR_RETURN(bool devices_equivalent, + backend_->devices_equivalent( + run_device_ordinal, build_options_.device_ordinal())); if (!devices_equivalent) { TF_ASSIGN_OR_RETURN(se::StreamExecutor * run_executor, backend_->stream_executor(run_device_ordinal)); TF_ASSIGN_OR_RETURN(se::StreamExecutor * build_executor, - backend_->stream_executor(build_device_ordinal_)); + backend_->stream_executor(build_device_ordinal())); return InvalidArgument( "executable is built for device %s of type \"%s\"; cannot run it on " "device %s of type \"%s\"", - backend_->device_name(build_device_ordinal_).c_str(), + backend_->device_name(build_device_ordinal()).c_str(), build_executor->GetDeviceDescription().name().c_str(), backend_->device_name(run_device_ordinal).c_str(), run_executor->GetDeviceDescription().name().c_str()); } - if (!options.allocator()) { + if (!run_options.allocator()) { return InvalidArgument("an allocator must be provided to ExecuteLocally"); } - if (options.allocator()->platform() != backend.platform()) { + if (run_options.allocator()->platform() != backend.platform()) { return InvalidArgument( "allocator platform (%s) does not match service platform (%s)", - options.allocator()->platform()->Name().c_str(), + run_options.allocator()->platform()->Name().c_str(), backend.platform()->Name().c_str()); } @@ -155,23 +138,22 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( StatusOr> LocalExecutable::Run( const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options) { - TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_)); - - ExecutableRunOptions actual_options = options; + ExecutableRunOptions run_options) { + TF_RETURN_IF_ERROR( + ValidateExecutionOptions(arguments, run_options, *backend_)); Backend::StreamPtr stream; - if (options.stream() == nullptr) { + if (run_options.stream() == nullptr) { // NB! The lifetime of `stream` needs to match the lifetime of // `actual_options` (otherwise we will end up using a returned stream in // ExecuteOnStreamWrapper), which is why it isn't declared in the inner "if" // scope. 
TF_ASSIGN_OR_RETURN( - stream, BorrowStreamForDevice(options.device_ordinal(), backend_)); - actual_options.set_stream(stream.get()); + stream, BorrowStreamForDevice(run_options.device_ordinal(), backend_)); + run_options.set_stream(stream.get()); } - if (options.allocator() == nullptr) { - actual_options.set_allocator(backend_->memory_allocator()); + if (run_options.allocator() == nullptr) { + run_options.set_allocator(backend_->memory_allocator()); } // For local client execution on CPU backends: @@ -180,7 +162,7 @@ StatusOr> LocalExecutable::Run( // *) The thread pool used for XLA CPU ops is from // backend_->eigen_intra_op_thread_pool(). ServiceExecutableRunOptions service_options( - actual_options, backend_->StreamBorrower(), + run_options, backend_->StreamBorrower(), backend_->eigen_intra_op_thread_pool()); if (executable_->dumping()) { @@ -189,9 +171,8 @@ StatusOr> LocalExecutable::Run( TF_ASSIGN_OR_RETURN( std::unique_ptr result, executable_->ExecuteOnStreamWrapper( - &service_options, options.execution_profile(), arguments)); - return ScopedShapedBuffer::MakeScoped(result.get(), - actual_options.allocator()); + &service_options, run_options.execution_profile(), arguments)); + return ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator()); } StatusOr> LocalExecutable::ExecuteAndDump( @@ -267,16 +248,19 @@ StatusOr> LocalClient::Compile( const Computation& computation, const tensorflow::gtl::ArraySlice argument_layouts, const ExecutableBuildOptions& options) { - int device_ordinal = options.device_ordinal() == -1 - ? default_device_ordinal() - : options.device_ordinal(); - TF_ASSIGN_OR_RETURN(std::unique_ptr executable, - local_service_->CompileExecutable( - computation.handle(), argument_layouts, - options.result_layout(), device_ordinal)); + ExecutableBuildOptions updated_options = options; + if (options.device_ordinal() == -1) { + updated_options.set_device_ordinal(default_device_ordinal()); + VLOG(3) << "Set device ordinal to default value of: " + << updated_options.device_ordinal(); + } + TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, + local_service_->CompileExecutable(computation.handle(), argument_layouts, + updated_options)); return WrapUnique(new LocalExecutable(std::move(executable), local_service_->mutable_backend(), - device_ordinal, options)); + updated_options)); } StatusOr> diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 19fd14f76bc..b52a30f5a0b 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/computation.h" +#include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -33,39 +34,13 @@ limitations under the License. namespace xla { -// Class containing options for building an LocalExecutable with -// LocalClient::Compile. -class ExecutableBuildOptions { - public: - // If set, this is the device to build the computation for. Valid - // device_ordinal values are: 0 to # of devices - 1. These values are - // identical to the device ordinal values used by StreamExecutor. 
The built - // executable will be executable on any device equivalent to the specified - // device as determined by Backend::devices_equivalent(). A value of -1 - // indicates this option has not been set. - ExecutableBuildOptions& set_device_ordinal(int device_ordinal); - int device_ordinal() const; - - // If set, this specifies the layout of the result of the computation. If not - // set, the service will chose the layout of the result. A Shape is used to - // store the layout to accommodate tuple result shapes. A value of nullptr - // indicates the option has not been set. - ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout); - const Shape* result_layout() const; - - private: - int device_ordinal_ = -1; - Shape result_layout_; - bool result_layout_set_ = false; -}; - class LocalExecutable { public: // Run the compiled computation with the given arguments and options and // return the result. StatusOr> Run( const tensorflow::gtl::ArraySlice arguments, - const ExecutableRunOptions& options); + ExecutableRunOptions run_options); // Return the layout (contained in a shape) of the result produced by the // computation. @@ -88,8 +63,7 @@ class LocalExecutable { // Constructor invoked by LocalClient. LocalExecutable(std::unique_ptr executable, Backend* backend, - int device_ordinal, - const ExecutableBuildOptions& build_options); + ExecutableBuildOptions build_options); // Validates that the given arguments and options satisfy various constraints // of the computation. @@ -117,19 +91,19 @@ class LocalExecutable { StatusOr> LiteralFromShapedBuffer( const ShapedBuffer& shaped_buffer); + // The ordinal of the device which this executable was compiled for. The + // executable can run on all equivalent devices (as determined by + // Backend::devices_equivalent). + int build_device_ordinal() const { return build_options_.device_ordinal(); } + // Compiled computation. std::unique_ptr executable_; // Execution backend. - Backend* backend_; - - // The ordinal of the device which this executable was compiled for. The - // executable can run on all equivalent devices (as determined by - // Backend::devices_equivalent). - int build_device_ordinal_; + Backend* backend_ = nullptr; // Options used to build the executable. 
- const ExecutableBuildOptions& build_options_; + const ExecutableBuildOptions build_options_; }; // An XLA Client specialization for use when the client and service run in diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc index fe3a4d2f6df..c8ed3e3a2b0 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc @@ -221,13 +221,19 @@ void AllocateFlags() { flag_values->xla_gpu_disable_multi_streaming(), "If true, multi-streaming in the GPU backend is disabled."), tensorflow::Flag( - "xla_dump_hlo_proto_to", flag_values->mutable_xla_dump_hlo_proto_to(), - "Dump compilation artifacts as proto binary into this directory."), + "xla_dump_optimized_hlo_proto_to", + flag_values->mutable_xla_dump_optimized_hlo_proto_to(), + "Dump Hlo after all hlo passes are executed as proto binary into " + "this directory."), tensorflow::Flag( - "xla_dump_prepass_hlo_proto_to", - flag_values->mutable_xla_dump_prepass_hlo_proto_to(), - "Dump compilation artifacts, before hlo passes are executed, as " - "proto binary into this directory."), + "xla_dump_unoptimized_hlo_proto_to", + flag_values->mutable_xla_dump_unoptimized_hlo_proto_to(), + "Dump HLO before any hlo passes are executed as proto binary into " + "this directory."), + tensorflow::Flag("xla_dump_per_pass_hlo_proto_to", + flag_values->mutable_xla_dump_per_pass_hlo_proto_to(), + "Dump HLO after each pass as an HloProto in binary file " + "format into this directory."), tensorflow::Flag( "xla_test_all_output_layouts", bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts), diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index e0196509a74..2b68b8f177d 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -486,6 +486,7 @@ class Literal { std::vector> elements); // Returns a string representation of the literal value. + // Warning: this function can take minutes for multi-million element Literals. 
string ToString(bool print_layout = false) const; // Invokes the "per cell" callback for each element in the provided diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index a8ca0e3ea01..e2972f06016 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -49,6 +49,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client:executable_build_options", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:framework_lite", diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 37f1eada2bc..8386acf0cd4 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -98,15 +98,25 @@ const std::unique_ptr& LocalShapedBuffer::shaped_buffer() return shaped_buffer_; } +static StatusOr> ToBuffer( + LocalClient* client, int device_ordinal, const Literal& arg) { + return client->LiteralToShapedBuffer(arg, device_ordinal, + client->backend().memory_allocator()); +} + /* static */ -LocalShapedBuffer* LocalShapedBuffer::FromLiteral(const Literal& argument) { +LocalShapedBuffer* LocalShapedBuffer::FromLiteral( + const Literal& argument, + const tensorflow::gtl::optional& shape_with_layout) { LocalClient* client = GetOrCreateLocalClient(); - std::unique_ptr buf = - client - ->LiteralToShapedBuffer(argument, - /*device_ordinal=*/0, - client->backend().memory_allocator()) - .ConsumeValueOrDie(); + std::unique_ptr buf; + if (shape_with_layout) { + std::unique_ptr relaid = + argument.Relayout(shape_with_layout.value()); + buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie(); + } else { + buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie(); + } return new LocalShapedBuffer(std::move(buf)); } @@ -120,7 +130,8 @@ CompiledLocalComputation::CompiledLocalComputation( : executable_(std::move(executable)) {} StatusOr> CompiledLocalComputation::Execute( - const std::vector& arguments) { + const std::vector& arguments, + const std::vector>& shapes_with_layout) { LocalClient* client = GetOrCreateLocalClient(); VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas."; @@ -133,7 +144,8 @@ StatusOr> CompiledLocalComputation::Execute( GetReplicaCount()); for (int replica = 0; replica < GetReplicaCount(); ++replica) { - pool.Schedule([this, client, replica, &arguments, &results] { + pool.Schedule([this, client, replica, &arguments, &shapes_with_layout, + &results] { StatusOr device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(replica); if (!device_ordinal_status.ok()) { @@ -144,18 +156,28 @@ StatusOr> CompiledLocalComputation::Execute( VLOG(3) << "Replica " << replica << " mapped to device ordinal for execution: " << device_ordinal; + // Transfer arguments in std::vector> scoped_buffers; scoped_buffers.reserve(arguments.size()); - for (const Literal& argument : arguments) { - StatusOr> pushed = - client->LiteralToShapedBuffer( - argument, device_ordinal, - client->backend().memory_allocator()); + for (int i = 0; i < arguments.size(); ++i) { + const Literal& argument = arguments[i]; + const tensorflow::gtl::optional& shape_with_layout = + shapes_with_layout[i]; + + StatusOr> pushed; + if (shape_with_layout) { + 
std::unique_ptr relaid = + argument.Relayout(shape_with_layout.value()); + pushed = ToBuffer(client, device_ordinal, *relaid); + } else { + pushed = ToBuffer(client, device_ordinal, argument); + } if (!pushed.ok()) { results[replica] = pushed.status(); return; } + scoped_buffers.push_back(std::move(pushed).ValueOrDie()); } @@ -233,7 +255,8 @@ LocalComputation::LocalComputation(Computation computation) : computation_(std::move(computation)) {} StatusOr LocalComputation::Compile( - const std::vector& argument_shapes) { + const std::vector& argument_shapes, + const ExecutableBuildOptions* build_options) { std::vector argument_shape_pointers; argument_shape_pointers.reserve(argument_shapes.size()); for (auto& argument_shape : argument_shapes) { @@ -242,6 +265,9 @@ StatusOr LocalComputation::Compile( LocalClient* client = GetOrCreateLocalClient(); ExecutableBuildOptions options; + if (build_options != nullptr) { + options = *build_options; + } TF_ASSIGN_OR_RETURN( auto local_executable, client->Compile(computation_, argument_shape_pointers, options)); @@ -363,12 +389,6 @@ LocalComputationBuilder::SelectAndScatterWithGeneralPadding( source, init_value, scatter.computation()); } -ComputationDataHandle LocalComputationBuilder::Select( - const ComputationDataHandle& pred, const ComputationDataHandle& on_true, - const ComputationDataHandle& on_false) { - return builder_.Select(pred, on_true, on_false); -} - ComputationDataHandle LocalComputationBuilder::Tuple( tensorflow::gtl::ArraySlice elements) { return builder_.Tuple(elements); @@ -384,6 +404,12 @@ ComputationDataHandle LocalComputationBuilder::Dot( return builder_.Dot(lhs, rhs); } +ComputationDataHandle LocalComputationBuilder::DotGeneral( + const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + const DotDimensionNumbers& dimension_numbers) { + return builder_.DotGeneral(lhs, rhs, dimension_numbers); +} + ComputationDataHandle LocalComputationBuilder::ConvGeneralDilated( const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, tensorflow::gtl::ArraySlice window_strides, @@ -483,6 +509,15 @@ ComputationDataHandle LocalComputationBuilder::While( tensorflow::gtl::ArraySlice broadcast_dimensions), \ (lhs, rhs, broadcast_dimensions)) +#define _FORWARD_TRIOP(method_name) \ + _FORWARD( \ + method_name, ComputationDataHandle, \ + (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ + const ComputationDataHandle& ehs), \ + (lhs, rhs, ehs)) + +_FORWARD_TRIOP(Select) +_FORWARD_TRIOP(Clamp) _FORWARD_BINOP(Eq) _FORWARD_BINOP(Ne) _FORWARD_BINOP(Ge) @@ -503,6 +538,7 @@ _FORWARD_UNOP(Abs) _FORWARD_UNOP(Exp) _FORWARD_UNOP(Floor) _FORWARD_UNOP(Ceil) +_FORWARD_UNOP(Round) _FORWARD_UNOP(Log) _FORWARD_UNOP(Sign) _FORWARD_UNOP(Cos) @@ -519,6 +555,7 @@ _FORWARD_UNOP(Sort) #undef _FORWARD #undef _FORWARD_UNOP #undef _FORWARD_BINOP +#undef _FORWARD_TRIOP void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) { delete local_shaped_buffer; diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index e5503cd52fa..f39d15cff70 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -58,7 +59,9 @@ StatusOr > TransferFromOutfeedLocalReplica( // client. class LocalShapedBuffer { public: - static LocalShapedBuffer* FromLiteral(const Literal& argument); + static LocalShapedBuffer* FromLiteral( + const Literal& argument, + const tensorflow::gtl::optional& shape_with_layout); LocalShapedBuffer(std::unique_ptr shaped_buffer); const std::unique_ptr& shaped_buffer() const; std::unique_ptr ToLiteral() const; @@ -76,8 +79,15 @@ class LocalShapedBuffer { class CompiledLocalComputation { public: CompiledLocalComputation(std::unique_ptr executable); + + // Execute the computation with the given argument literals, and + // with optionally-specified argument layouts. The literals will be + // re-laid out according to the corresponding elements of + // shapes_with_layout. StatusOr > Execute( - const std::vector& arguments); + const std::vector& arguments, + const std::vector >& shapes_with_layout); + LocalShapedBuffer* ExecuteWithShapedBuffers( tensorflow::gtl::ArraySlice argument_handles); @@ -93,7 +103,8 @@ class LocalComputation { public: LocalComputation(Computation computation); StatusOr Compile( - const std::vector& argument_shapes); + const std::vector& argument_shapes, + const ExecutableBuildOptions* build_options); const Computation& computation() const; private: @@ -172,10 +183,6 @@ class LocalComputationBuilder { const ComputationDataHandle& source, const ComputationDataHandle& init_value, const LocalComputation& scatter); - ComputationDataHandle Select(const ComputationDataHandle& pred, - const ComputationDataHandle& on_true, - const ComputationDataHandle& on_false); - ComputationDataHandle Tuple( tensorflow::gtl::ArraySlice elements); @@ -185,6 +192,10 @@ class LocalComputationBuilder { ComputationDataHandle Dot(const ComputationDataHandle& lhs, const ComputationDataHandle& rhs); + ComputationDataHandle DotGeneral( + const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + const DotDimensionNumbers& dimension_numbers); + ComputationDataHandle ConvGeneralDilated( const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, tensorflow::gtl::ArraySlice window_strides, @@ -252,6 +263,14 @@ class LocalComputationBuilder { (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ tensorflow::gtl::ArraySlice broadcast_dimensions)) +#define _FORWARD_TRIOP(method_name) \ + _FORWARD( \ + method_name, ComputationDataHandle, \ + (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ + const ComputationDataHandle& ehs)) + + _FORWARD_TRIOP(Select) + _FORWARD_TRIOP(Clamp) _FORWARD_BINOP(Eq) _FORWARD_BINOP(Ne) _FORWARD_BINOP(Ge) @@ -272,6 +291,7 @@ class LocalComputationBuilder { _FORWARD_UNOP(Exp) _FORWARD_UNOP(Floor) _FORWARD_UNOP(Ceil) + _FORWARD_UNOP(Round) _FORWARD_UNOP(Log) _FORWARD_UNOP(Sign) _FORWARD_UNOP(Cos) @@ -288,6 +308,7 @@ class LocalComputationBuilder { #undef _FORWARD #undef _FORWARD_UNOP #undef _FORWARD_BINOP +#undef _FORWARD_TRIOP private: ComputationBuilder builder_; diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index 31789259609..5ea75550c96 100644 --- 
a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -27,12 +27,14 @@ limitations under the License. // ArraySlice <- sequence of int // Literal <-> (nested tuple of) numpy ndarray // std::vector <- sequence of (nested tuple of) ndarray -// Shape <-> pair holding (dtype, dimensions) -// std::vector <- sequence of shape information pairs +// Shape -> pair holding (dtype, dimensions) +// <- object duck-typed as xla_client.Shape +// std::vector <- sequence of xla_client.Shape objects // PrimitiveType <- int // ArraySlice> <- sequence of int pairs // PaddingConfig proto <- corresponding Python proto // ConvolutionDimensionNumbers proto <- corresponding Python proto +// DotDimensionNumbers proto <- corresponding Python proto // // Arrows indicate whether a conversion only ever occurs in one // direction, or whether it is maintained bidirectionally. @@ -55,7 +57,7 @@ limitations under the License. // translates to a tuple-shaped XLA Literal, whose component subshapes // are a 2x3 F32-shaped literal followed by two tuple-shaped literals. // -// The Python objects corresponding to C++ Shapes have the type: +// Shapes output by C++ become Python objects with the type: // // T = (dtype, S) // S = DIMENSIONS | TUPLE_SHAPES @@ -176,6 +178,16 @@ tensorflow::ImportNumpy(); } } +%typemap(out) StatusOr< std::unique_ptr > { + if ($1.ok()) { + std::unique_ptr value = $1.ConsumeValueOrDie(); + $result = numpy::PyObjectFromXlaLiteral(*value); + } else { + PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str()); + return NULL; + } +} + %typemap(out) StatusOr { if ($1.ok()) { auto* value = $1.ValueOrDie(); @@ -343,15 +355,31 @@ tensorflow::ImportNumpy(); // Shape %typemap(in) const Shape& (Shape temp) { - Status shape_status = numpy::CheckPyShapeInfo($input); - if (!shape_status.ok()) { - PyErr_SetString(PyExc_RuntimeError, shape_status.ToString().c_str()); + StatusOr statusor = numpy::XlaShapeFromPyShape($input); + if (!statusor.ok()) { + PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str()); return NULL; } - temp = numpy::XlaShapeFromPyShapeInfo($input); + temp = std::move(statusor).ValueOrDie(); $1 = &temp; } +%typemap(in) const tensorflow::gtl::optional& ( + tensorflow::gtl::optional temp) { + if ($input == Py_None) { + temp = tensorflow::gtl::nullopt; + $1 = &temp; + } else { + StatusOr statusor = numpy::XlaShapeFromPyShape($input); + if (!statusor.ok()) { + PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str()); + return NULL; + } + temp = std::move(statusor).ValueOrDie(); + $1 = &temp; + } +} + %typemap(out) std::unique_ptr { $result = numpy::PyShapeInfoFromXlaShape(*$1); } @@ -364,14 +392,37 @@ tensorflow::ImportNumpy(); const int size = PySequence_Size($input); for (int i = 0; i < size; ++i) { PyObject* o = PySequence_GetItem($input, i); - Status shape_status = numpy::CheckPyShapeInfo(o); - if (!shape_status.ok()) { - PyErr_SetString(PyExc_RuntimeError, shape_status.ToString().c_str()); - Py_DECREF(o); + StatusOr statusor = numpy::XlaShapeFromPyShape(o); + Py_DECREF(o); + if (!statusor.ok()) { + PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str()); return NULL; } - temps.push_back(numpy::XlaShapeFromPyShapeInfo(o)); - Py_DECREF(o); + temps.push_back(statusor.ConsumeValueOrDie()); + } + $1 = &temps; +} + +%typemap(in) const std::vector >& ( + std::vector > temps) { + if (!PySequence_Check($input)) { + PyErr_SetString(PyExc_TypeError, "Argument is not a 
sequence"); + return NULL; + } + const int size = PySequence_Size($input); + for (int i = 0; i < size; ++i) { + PyObject* o = PySequence_GetItem($input, i); + if (o == Py_None) { + temps.push_back(tensorflow::gtl::nullopt); + } else { + StatusOr statusor = numpy::XlaShapeFromPyShape(o); + Py_DECREF(o); + if (!statusor.ok()) { + PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str()); + return NULL; + } + temps.push_back(statusor.ConsumeValueOrDie()); + } } $1 = &temps; } @@ -461,6 +512,135 @@ tensorflow::ImportNumpy(); $1 = temps; } +// DotDimensionNumbers + +%typemap(in) const DotDimensionNumbers& + (DotDimensionNumbers dimension_numbers) { + int length; + + /* lhs_contracting_dimensions */ + PyObject* lhs_contracting_dimensions = PyObject_GetAttrString( + $input, "lhs_contracting_dimensions"); + if (!lhs_contracting_dimensions) { + return NULL; + } + + length = PySequence_Size(lhs_contracting_dimensions); + if (length == -1) { + Py_DECREF(lhs_contracting_dimensions); + return NULL; + } + + for (int i = 0; i < length; ++i) { + PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i); + if (!item) { + Py_DECREF(lhs_contracting_dimensions); + return NULL; + } + const int64 dimension = numpy::PyIntOrPyLongToLong(item); + if (dimension == -1 && PyErr_Occurred()) { + Py_DECREF(item); + Py_DECREF(lhs_contracting_dimensions); + return NULL; + } + dimension_numbers.add_lhs_contracting_dimensions(dimension); + Py_DECREF(item); + } + Py_DECREF(lhs_contracting_dimensions); + + /* rhs_contracting_dimensions */ + PyObject* rhs_contracting_dimensions = PyObject_GetAttrString( + $input, "rhs_contracting_dimensions"); + if (!lhs_contracting_dimensions) { + return NULL; + } + + length = PySequence_Size(rhs_contracting_dimensions); + if (length == -1) { + Py_DECREF(rhs_contracting_dimensions); + return NULL; + } + + for (int i = 0; i < length; ++i) { + PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i); + if (!item) { + Py_DECREF(rhs_contracting_dimensions); + return NULL; + } + const int64 dimension = numpy::PyIntOrPyLongToLong(item); + if (dimension == -1 && PyErr_Occurred()) { + Py_DECREF(item); + Py_DECREF(rhs_contracting_dimensions); + return NULL; + } + dimension_numbers.add_rhs_contracting_dimensions(dimension); + Py_DECREF(item); + } + Py_DECREF(rhs_contracting_dimensions); + + /* lhs_batch_dimensions */ + PyObject* lhs_batch_dimensions = PyObject_GetAttrString( + $input, "lhs_batch_dimensions"); + if (!lhs_batch_dimensions) { + return NULL; + } + + length = PySequence_Size(lhs_batch_dimensions); + if (length == -1) { + Py_DECREF(lhs_batch_dimensions); + return NULL; + } + + for (int i = 0; i < length; ++i) { + PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i); + if (!item) { + Py_DECREF(lhs_batch_dimensions); + return NULL; + } + const int64 dimension = numpy::PyIntOrPyLongToLong(item); + if (dimension == -1 && PyErr_Occurred()) { + Py_DECREF(item); + Py_DECREF(lhs_batch_dimensions); + return NULL; + } + dimension_numbers.add_lhs_batch_dimensions(dimension); + Py_DECREF(item); + } + Py_DECREF(lhs_batch_dimensions); + + /* rhs_batch_dimensions */ + PyObject* rhs_batch_dimensions = PyObject_GetAttrString( + $input, "rhs_batch_dimensions"); + if (!rhs_batch_dimensions) { + return NULL; + } + + length = PySequence_Size(rhs_batch_dimensions); + if (length == -1) { + Py_DECREF(rhs_batch_dimensions); + return NULL; + } + + for (int i = 0; i < length; ++i) { + PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i); + if (!item) { + 
Py_DECREF(rhs_batch_dimensions); + return NULL; + } + const int64 dimension = numpy::PyIntOrPyLongToLong(item); + if (dimension == -1 && PyErr_Occurred()) { + Py_DECREF(item); + Py_DECREF(rhs_batch_dimensions); + return NULL; + } + dimension_numbers.add_rhs_batch_dimensions(dimension); + Py_DECREF(item); + } + Py_DECREF(rhs_batch_dimensions); + + $1 = &dimension_numbers; +} + // PaddingConfig %typemap(in) const PaddingConfig& @@ -623,6 +803,30 @@ tensorflow::ImportNumpy(); $1 = &dimension_numbers; } +// ExecutableBuildOptions + +%typemap(in) const ExecutableBuildOptions* + (ExecutableBuildOptions build_options) { + if ($input == Py_None) { + $1 = NULL; + } else { + PyObject* o = PyObject_GetAttrString($input, "generate_hlo_graph"); + if (!o) { + return NULL; + } + if (o != Py_None) { + if (!PyString_Check(o)) { + PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.generate_hlo_graph must be a string or None."); + return NULL; + } + build_options.set_generate_hlo_graph(PyString_AsString(o)); + } + Py_DECREF(o); + + $1 = &build_options; + } +} + %ignoreall %unignore xla; %unignore xla::swig; @@ -667,6 +871,7 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputationBuilder::Call; %unignore xla::swig::LocalComputationBuilder::Transpose; %unignore xla::swig::LocalComputationBuilder::Rev; +%unignore xla::swig::LocalComputationBuilder::Clamp; %unignore xla::swig::LocalComputationBuilder::Map; %unignore xla::swig::LocalComputationBuilder::Reduce; %unignore xla::swig::LocalComputationBuilder::ReduceWindowWithGeneralPadding; @@ -681,6 +886,7 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputationBuilder::Lt; %unignore xla::swig::LocalComputationBuilder::Le; %unignore xla::swig::LocalComputationBuilder::Dot; +%unignore xla::swig::LocalComputationBuilder::DotGeneral; %unignore xla::swig::LocalComputationBuilder::ConvGeneralDilated; %unignore xla::swig::LocalComputationBuilder::Add; %unignore xla::swig::LocalComputationBuilder::Sub; @@ -696,6 +902,7 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputationBuilder::Exp; %unignore xla::swig::LocalComputationBuilder::Floor; %unignore xla::swig::LocalComputationBuilder::Ceil; +%unignore xla::swig::LocalComputationBuilder::Round; %unignore xla::swig::LocalComputationBuilder::Log; %unignore xla::swig::LocalComputationBuilder::Sign; %unignore xla::swig::LocalComputationBuilder::Cos; diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index 5c722623e31..3d87480728a 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -176,85 +176,107 @@ static string PyObjectCppRepr(PyObject* o) { return ExtractStringAndDecref(r); } -Status CheckPyShapeInfo(PyObject* o) { +StatusOr XlaShapeFromPyShape(PyObject* o) { auto error = [o](const string& prefix) { return InvalidArgument("%s; got %s", prefix.c_str(), PyObjectCppRepr(o).c_str()); }; - // The object is a tuple (a pair) - if (!PyTuple_Check(o)) { - return error("Shape record must be a tuple"); - } - if (PyTuple_Size(o) != 2) { - return error("Shape record tuple must be of length 2"); - } - // It has a first element, which is a numpy dtype object - PyObject* first = PyTuple_GetItem(o, 0); - if (first == nullptr) { - return error("Tuple has no item 0 (shape dtype)"); - } - if (first->ob_type != &PyArrayDescr_Type) { - return error( - "Shape record does not have a numpy dtype as its first element"); - } - const int np_type = NumpyTypenum(first); - if 
(!NumpyTypeIsValid(np_type)) { - return error("Shape record has an invalid integer dtype"); - } - - // It has a second element, which is a tuple, either of shape - // records or of Python ints - PyObject* second = PyTuple_GetItem(o, 1); - if (!second) { - return error("Tuple has no item 0 (shape dimensions)"); - } - if (!PyTuple_Check(second)) { - return error("Shape record does not have a tuple as its second element"); - } - const int length = PyTuple_Size(second); - const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type); - for (int i = 0; i < length; i++) { - PyObject* dimension = PyTuple_GetItem(second, i); - if (element_type == TUPLE) { - VLOG(3) << "element_type is tuple, checking member: " << i; - Status result = CheckPyShapeInfo(dimension); - if (!result.ok()) { - return AddStatus( - result, tensorflow::strings::StrCat("Validating tuple member ", i, - " of ", PyObjectCppRepr(o))); - } - } else if (!CheckPyIntOrLong(dimension)) { - return error("Non-tuple shape record has a non-integer dimension"); + auto get_attr = [o, &error](const string& field) -> StatusOr { + PyObject* result = + PyObject_GetAttrString(o, const_cast(field.c_str())); + if (result == nullptr) { + return error(tensorflow::strings::StrCat( + "Failed to get attribute of Shape object:", field)); } + return result; + }; + + auto call_method = [o, &error](const string& method) -> StatusOr { + PyObject* result = + PyObject_CallMethod(o, const_cast(method.c_str()), nullptr); + if (result == nullptr) { + return error(tensorflow::strings::StrCat( + "Failed to call method of shape object:", method)); + } + return result; + }; + + PyObject* np_type; + TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype")); + if (np_type->ob_type != &PyArrayDescr_Type) { + return error("Shape attribute np_dtype is not an integer numpy dtype"); } + if (!NumpyTypeIsValid(NumpyTypenum(np_type))) { + return error("Shape attribute np_dtype is not a valid integer numpy dtype"); + } + const PrimitiveType element_type = + NumpyTypeToPrimitiveType(NumpyTypenum(np_type)); + Py_DECREF(np_type); - return Status::OK(); -} - -// Precondition: CheckPyShapeInfo(o) -Shape XlaShapeFromPyShapeInfo(PyObject* o) { - const int np_type = NumpyTypenum(PyTuple_GetItem(o, 0)); - const PrimitiveType element_type = NumpyTypeToPrimitiveType(np_type); - PyObject* py_dimensions = PyTuple_GetItem(o, 1); - const int length = PyTuple_Size(py_dimensions); if (element_type == TUPLE) { + PyObject* py_subshapes; + TF_ASSIGN_OR_RETURN(py_subshapes, call_method("tuple_shapes")); + if (!PyTuple_Check(py_subshapes)) { + return error( + "Return value of Shape method tuple_shapes() is not a tuple"); + } + const int length = PyTuple_Size(py_subshapes); std::vector subshapes; subshapes.reserve(length); for (int i = 0; i < length; i++) { - subshapes.push_back( - XlaShapeFromPyShapeInfo(PyTuple_GetItem(py_dimensions, i))); + TF_ASSIGN_OR_RETURN( + const Shape& subshape, + XlaShapeFromPyShape(PyTuple_GetItem(py_subshapes, i))); + subshapes.push_back(subshape); } + Py_DECREF(py_subshapes); return ShapeUtil::MakeTupleShape(subshapes); } else { + PyObject* py_dimensions; + PyObject* py_minor_to_major; + TF_ASSIGN_OR_RETURN(py_dimensions, call_method("dimensions")); + TF_ASSIGN_OR_RETURN(py_minor_to_major, call_method("minor_to_major")); + if (!PyTuple_Check(py_dimensions)) { + return error("Return value of Shape method dimensions() is not a tuple"); + } + if (py_minor_to_major != Py_None && !PyTuple_Check(py_minor_to_major)) { + return error( + "Return value of Shape method 
minor_to_major() is neither a tuple " + "nor None"); + } + const int length = PyTuple_Size(py_dimensions); + if (py_minor_to_major != Py_None && + length != PyTuple_Size(py_minor_to_major)) { + return error( + "Shape methods dimensions() and minor_to_major() return " + "different-length tuples"); + } std::vector dimensions(length); + std::vector minor_to_major(length); for (int i = 0; i < length; i++) { dimensions[i] = PyIntOrPyLongToLong(PyTuple_GetItem(py_dimensions, i)); - if (dimensions[i] == -1) { - CHECK(!PyErr_Occurred()); + if (dimensions[i] == -1 && PyErr_Occurred()) { + return error("Dimension is not an int"); + } + + if (py_minor_to_major != Py_None) { + minor_to_major[i] = + PyIntOrPyLongToLong(PyTuple_GetItem(py_minor_to_major, i)); + if (minor_to_major[i] == -1 && PyErr_Occurred()) { + return error("Minor-to-major value is not an int"); + } } } - return ShapeUtil::MakeShape(element_type, dimensions); + bool with_layout = py_minor_to_major != Py_None; + Py_DECREF(py_dimensions); + Py_DECREF(py_minor_to_major); + if (with_layout) { + return ShapeUtil::MakeShapeWithLayout(element_type, dimensions, + minor_to_major); + } else { + return ShapeUtil::MakeShape(element_type, dimensions); + } } } diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h index 6ff1c34cfc5..adfcc3b8588 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.h +++ b/tensorflow/compiler/xla/python/numpy_bridge.h @@ -56,15 +56,11 @@ bool NumpyTypeIsValid(int np_type); // The return value is a new reference. PyObject* PyShapeInfoFromXlaShape(const Shape& shape); -// Returns the outcome of a best-effort check that the Python object -// is a pair of the form (numpy dtype, dimensions), as produced by -// PyShapeInfoFromXlaShape. -Status CheckPyShapeInfo(PyObject* o); - -// Performs the inverse conversion to that of PyShapeInfoFromXlaShape. +// Converts a Python object with a method interface mathing that of +// xla_client.Shape into an XLA Shape object. // // The return value is a new reference. -Shape XlaShapeFromPyShapeInfo(PyObject* o); +StatusOr XlaShapeFromPyShape(PyObject* o); // Converts a PyObject that represents operation metadata into protocol buffer // form. diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 66ace613a0c..2c693f05816 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -89,6 +89,7 @@ _UNARY_OPS = [ 'Abs', 'Exp', 'Floor', + 'Round', 'Ceil', 'Log', 'Sign', @@ -155,9 +156,14 @@ class LocalBuffer(object): self._delete = c_api.DeleteLocalShapedBuffer @staticmethod - def from_py(npval): + def from_py(npval, layout_fn=None): npval = require_numpy_array_layout(npval) - return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval)) + if layout_fn: + shape = Shape.from_numpy(npval) + shape = shape.map_leaves(layout_fn) + else: + shape = None + return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape)) def to_py(self): return self.c_local_shaped_buffer.ToLiteral() @@ -182,13 +188,17 @@ class Shape(object): represents an XLA tuple. 
""" - def __init__(self, np_dtype, dimensions): + def __init__(self, np_dtype, dimensions, minor_to_major=None): + assert isinstance(dimensions, tuple) self.np_dtype = np_dtype self._dimensions = dimensions + self._minor_to_major = minor_to_major + self._check_minor_to_major() def __repr__(self): - return 'xla_client.Shape(np_dtype={!r}, dimensions={!r})'.format( - self.np_dtype, self._dimensions) + return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, ' + 'minor_to_major={!r})').format(self.np_dtype, self._dimensions, + self._minor_to_major) def element_type(self): return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)] @@ -201,11 +211,49 @@ class Shape(object): raise ValueError('Tuple shape has no dimensions') return self._dimensions + def minor_to_major(self): + return self._minor_to_major + def tuple_shapes(self): if not self.is_tuple(): raise ValueError('Shape is not a tuple shape') return self._dimensions + def rank(self): + return len(self.dimensions()) + + def map_leaves(self, f): + """Map f over each leaf-level array subshape. + + Args: + f: The function to apply. Whenever f returns None, the identity is + applied instead. + + Returns: + A new Shape with the mapped leaves. + """ + if self.is_tuple(): + children = tuple(child.map_leaves(f) for child in self.tuple_shapes()) + return Shape(np.dtype('O'), children) + else: + mapped = f(self) + return self if mapped is None else mapped + + def _check_minor_to_major(self): + mtm = self._minor_to_major + if self.is_tuple(): + assert mtm is None, self + if mtm is not None: + assert self.rank() == len(mtm), self + assert sorted(mtm) == range(len(mtm)), self + + def update_minor_to_major(self, minor_to_major): + if not isinstance(minor_to_major, tuple): + raise TypeError('minor_to_major must be a tuple') + updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major) + updated._check_minor_to_major() # pylint: disable=protected-access + return updated + @staticmethod def from_numpy(npval): @@ -222,23 +270,10 @@ def _wrap_shape(shape_info): dtype, dims = shape_info element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)] if element_type == xla_data_pb2.TUPLE: - dims = [_wrap_shape(subshape_info) for subshape_info in dims] + dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims) return Shape(dtype, dims) -def _unwrap_shape(shape): - if shape.is_tuple(): - components = tuple( - _unwrap_shape(subshape) for subshape in shape.tuple_shapes()) - else: - components = shape.dimensions() - return (shape.np_dtype, components) - - -def _unwrap_shapes(shapes): - return [_unwrap_shape(shape) for shape in shapes] - - def _wrap_data_handle(handle): cdh = xla_data_pb2.ComputationDataHandle() cdh.handle = handle @@ -260,6 +295,17 @@ def require_numpy_array_layout(value): return np.require(value, requirements=['C', 'A']) +class CompileOptions(object): + """Python object for XLA compile options. + + These options can be passed to the 'compile' step when using a local XLA + client. + """ + + def __init__(self): + self.generate_hlo_graph = None + + def transfer_to_infeed(value, replica_number=None): """Transfers the given value into the XLA infeed queue. @@ -291,8 +337,7 @@ def transfer_from_outfeed(shape, replica_number=None): Returns: The literal value that is produced from the outfeed queue. 
""" - return c_api.TransferFromOutfeedLocalReplica( - _unwrap_shape(shape), replica_number or 0) + return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0) class LocalComputation(object): @@ -313,22 +358,39 @@ class LocalComputation(object): else: self._delete = c_api.DeleteLocalComputation - def Compile(self, argument_shapes=()): + def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None): if self.is_compiled: raise ValueError('Attempt to compile a compiled local XLA computation.') + if layout_fn: + argument_shapes = [ + shape.map_leaves(layout_fn) for shape in argument_shapes + ] return LocalComputation( - self.c_local_computation.Compile(_unwrap_shapes(argument_shapes)), + self.c_local_computation.Compile(argument_shapes, compile_options), is_compiled=True) - def CompileWithExampleArguments(self, arguments=()): + def CompileWithExampleArguments(self, + arguments=(), + compile_options=None, + layout_fn=None): return self.Compile( - argument_shapes=[Shape.from_numpy(arg) for arg in arguments]) + argument_shapes=[Shape.from_numpy(arg) for arg in arguments], + compile_options=compile_options, + layout_fn=layout_fn) - def Execute(self, arguments=()): + def Execute(self, arguments=(), layout_fn=None): + """Execute with Python values as arguments and return value.""" if not self.is_compiled: raise ValueError('Cannot execute an uncompiled local XLA computation.') + argument_shapes = [Shape.from_numpy(arg) for arg in arguments] + if layout_fn: + argument_shapes = [ + shape.map_leaves(layout_fn) for shape in argument_shapes + ] + else: + argument_shapes = [None for shape in argument_shapes] arguments = tuple(map(require_numpy_array_layout, arguments)) - return self.c_local_computation.Execute(arguments) + return self.c_local_computation.Execute(arguments, argument_shapes) def ExecuteWithLocalBuffers(self, arguments=()): """Execute with LocalBuffer arguments and return value.""" @@ -384,7 +446,7 @@ class ComputationBuilder(object): Returns: A ComputationDataHandle message. """ - return _wrap_data_handle(self._client.Infeed(_unwrap_shape(shape))) + return _wrap_data_handle(self._client.Infeed(shape)) def Outfeed(self, operand): """Enqueues an outfeed op onto the computation. @@ -393,7 +455,7 @@ class ComputationBuilder(object): outfeed queue for subsequent dequeue via the client API. """ self._client.Outfeed( - _unwrap_data_handle(operand), _unwrap_shape(self.GetShape(operand)), + _unwrap_data_handle(operand), self.GetShape(operand), ''.encode('utf-8')) def Constant(self, value): @@ -484,8 +546,7 @@ class ComputationBuilder(object): parameter_num = next(self._parameter_numbering) return _wrap_data_handle( - self._client.Parameter( - parameter_num, _unwrap_shape(shape), name.encode('utf8'))) + self._client.Parameter(parameter_num, shape, name.encode('utf8'))) def ParameterFromNumpy(self, value, name=None, parameter_num=None): """Enqueues a Parameter op onto the computation. @@ -606,6 +667,13 @@ class ComputationBuilder(object): return _wrap_data_handle( self._client.Rev(_unwrap_data_handle(operand), dimensions)) + def Clamp(self, min, operand, max): # pylint: disable=redefined-builtin + """Clamp op.""" + return _wrap_data_handle( + self._client.Clamp(_unwrap_data_handle(min), + _unwrap_data_handle(operand), + _unwrap_data_handle(max))) + def SelectAndScatter(self, operand, select, window_dimensions, window_strides, padding, source, init_value, scatter): """Select and scatter op, used by the gradient of ReduceWindow. 
@@ -825,8 +893,7 @@ class ComputationBuilder(object): shape = Shape(self.GetShape(mu).np_dtype, dims) return _wrap_data_handle( self._client.RngNormal( - _unwrap_data_handle(mu), _unwrap_data_handle(sigma), - _unwrap_shape(shape))) + _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape)) def RngUniform(self, a, b, dims): """Enqueues an RngUniform operation onto the computation. @@ -846,8 +913,7 @@ class ComputationBuilder(object): shape = Shape(self.GetShape(a).np_dtype, dims) return _wrap_data_handle( self._client.RngUniform( - _unwrap_data_handle(a), _unwrap_data_handle(b), - _unwrap_shape(shape))) + _unwrap_data_handle(a), _unwrap_data_handle(b), shape)) def While(self, cond, body, init): """Enqueues a While operation onto the computation. @@ -865,10 +931,37 @@ class ComputationBuilder(object): _unwrap_data_handle(init))) def Dot(self, lhs, rhs): - """Matrix multiplication between lhs and rhs.""" + """Enqueues a dot operation onto the computation. + + Args: + lhs: ComputationDataHandle for the rank 1 or rank 2 left-hand-side array. + rhs: ComputationDataHandle for the rank 1 or rank 2 right-hand-side array. + + Returns: a ComputationDataHandle representing the Dot operation. + """ return _wrap_data_handle( self._client.Dot(_unwrap_data_handle(lhs), _unwrap_data_handle(rhs))) + def DotGeneral(self, lhs, rhs, dimension_numbers): + """Enqueues a general dot operation onto the computation. + + Args: + lhs: ComputationDataHandle for the left-hand-side array. + rhs: ComputationDataHandle for the right-hand-side array. + dimension_numbers: either an xla_data_pb2.DotDimensionNumbers or a nested + tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of + integers representing the dimensions to treat as contracting dimensions + and batch dimensions on each input operand. + + Returns: a ComputationDataHandle representing the DotGeneral operation. + """ + if not isinstance(dimension_numbers, xla_data_pb2.DotDimensionNumbers): + dimension_numbers = GetDotDimensionsFromLists(dimension_numbers) + return _wrap_data_handle( + self._client.DotGeneral( + _unwrap_data_handle(lhs), _unwrap_data_handle(rhs), + dimension_numbers)) + def Conv(self, lhs, rhs, window_strides, padding): """Enqueues a Conv operation onto the computation. @@ -979,7 +1072,7 @@ def initialize_replica_count(replica_count): Args: replica_count: number of replicas that are desired for set up during XLA - initalization. + initialization. Raises: A runtime exception if the XLA service has already been initialized. 
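The DotGeneral wrapper documented above accepts dimension_numbers either as an xla_data_pb2.DotDimensionNumbers proto or as a nested tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)); the helper GetDotDimensionsFromLists (added below) converts the tuple form to the proto. A short sketch of the batched-matmul case exercised by the new unit test, using the nested-tuple form; the builder name and the Build/Compile scaffolding are assumed from the existing test helpers.

import numpy as np
from tensorflow.compiler.xla.python import xla_client

lhs = np.random.randn(10, 3, 4).astype(np.float32)
rhs = np.random.randn(10, 4, 5).astype(np.float32)

builder = xla_client.ComputationBuilder("dot_general_example")  # hypothetical name
# Contract lhs dimension 2 against rhs dimension 1, and treat dimension 0 of
# both operands as a batch dimension; for these shapes this matches
# np.matmul(lhs, rhs) with result shape (10, 3, 5).
dimension_numbers = (([2], [1]), ([0], [0]))
builder.DotGeneral(builder.Constant(lhs), builder.Constant(rhs),
                   dimension_numbers)
compiled = builder.Build().CompileWithExampleArguments(())
result = compiled.Execute(())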
@@ -1005,3 +1098,13 @@ def GetPaddingConfigFromTriples(triples): dimension.edge_padding_high = hi dimension.interior_padding = interior return padding_config + + +def GetDotDimensionsFromLists(dimension_numbers): + (lhs_contract, rhs_contract), (lhs_batch, rhs_batch) = dimension_numbers + dot_dims_proto = xla_data_pb2.DotDimensionNumbers() + dot_dims_proto.lhs_contracting_dimensions.extend(lhs_contract) + dot_dims_proto.rhs_contracting_dimensions.extend(rhs_contract) + dot_dims_proto.lhs_batch_dimensions.extend(lhs_batch) + dot_dims_proto.rhs_batch_dimensions.extend(rhs_batch) + return dot_dims_proto diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index c0413b9bbc3..421fba40e32 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -444,6 +444,30 @@ class SingleOpTest(LocalComputationTest): c.Dot(c.Constant(lhs), c.Constant(rhs)) self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs)) + def testDotGeneral(self): + c = self._NewComputation() + rng = np.random.RandomState(0) + lhs = NumpyArrayF32(rng.randn(10, 3, 4)) + rhs = NumpyArrayF32(rng.randn(10, 4, 5)) + dimension_numbers = (([2], [1]), ([0], [0])) + c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers) + self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs)) + + def testDotGeneralWithDotDimensionNumbersProto(self): + c = self._NewComputation() + rng = np.random.RandomState(0) + lhs = NumpyArrayF32(rng.randn(10, 3, 4)) + rhs = NumpyArrayF32(rng.randn(10, 4, 5)) + + dimension_numbers = xla_client.xla_data_pb2.DotDimensionNumbers() + dimension_numbers.lhs_contracting_dimensions.append(2) + dimension_numbers.rhs_contracting_dimensions.append(1) + dimension_numbers.lhs_batch_dimensions.append(0) + dimension_numbers.rhs_batch_dimensions.append(0) + + c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers) + self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs)) + def testConvF32Same(self): c = self._NewComputation() a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") @@ -496,6 +520,12 @@ class SingleOpTest(LocalComputationTest): c.Exp(c.Constant(arr)) self._ExecuteAndCompareClose(c, expected=np.exp(arr)) + def testRound(self): + c = self._NewComputation() + arr = NumpyArrayF32([3.3, 12.1]) + c.Round(c.Constant(arr)) + self._ExecuteAndCompareClose(c, expected=np.round(arr)) + def testLog(self): c = self._NewComputation() arr = NumpyArrayF32([3.3, 12.1]) @@ -699,6 +729,23 @@ class SingleOpTest(LocalComputationTest): self._ExecuteAndCompareExact( c, expected=[[[6, 5], [8, 7]], [[2, 1], [4, 3]]]) + def testClampF32(self): + c = self._NewComputation() + c.Clamp( + c.Constant(NumpyArrayF32(-1)), + c.Constant(NumpyArrayF32([-2, -1, 0, 1, 2, 3])), + c.Constant(NumpyArrayF32(2))) + self._ExecuteAndCompareExact(c, expected=[-1, -1, 0, 1, 2, 2]) + + # TODO(b/72689392): re-enable when bug S32 resolved + def DISABLED_testClampS32(self): + c = self._NewComputation() + c.Clamp( + c.Constant(NumpyArrayS32(-1)), + c.Constant(NumpyArrayS32([-2, -1, 0, 1, 2, 3])), + c.Constant(NumpyArrayS32(2))) + self._ExecuteAndCompareExact(c, expected=[-1, 0, 1, 2, 2]) + def testSelect(self): c = self._NewComputation() c.Select( diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 469acc330cb..0f2d0a9e96e 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -509,6 +509,7 @@ cc_library( 
"//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:executable_build_options", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", ], @@ -1110,8 +1111,6 @@ cc_library( ":hlo", ":hlo_evaluator", ":hlo_pass", - ":tuple_util", - ":while_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:lib", ], @@ -1156,6 +1155,34 @@ tf_cc_test( ], ) +cc_library( + name = "implicit_broadcast_remover", + srcs = ["implicit_broadcast_remover.cc"], + hdrs = ["implicit_broadcast_remover.h"], + deps = [ + ":hlo", + ":hlo_dce", + ":hlo_pass", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +tf_cc_test( + name = "implicit_broadcast_remover_test", + srcs = ["implicit_broadcast_remover_test.cc"], + deps = [ + ":hlo_matchers", + ":implicit_broadcast_remover", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + ], +) + cc_library( name = "dot_decomposer", srcs = ["dot_decomposer.cc"], @@ -1825,7 +1852,9 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", + "//tensorflow/core:test", ], ) @@ -1856,6 +1885,7 @@ cc_library( ":hlo", ":hlo_graph_dumper", ":hlo_pass", + ":hlo_proto_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index ba82e822b21..fb857559f97 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1618,9 +1618,12 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) { reduce, HloInstruction::CreateBroadcast(reduce->shape(), init_value, {})); } + // A Transpose feeding a reduce can simply permute the reduction dimensions - // field. - if (arg->opcode() == HloOpcode::kTranspose) { + // field if the output of the reduce is a vector or scalar. Higher ranked + // result may require a transpose of the output. 
+ if (ShapeUtil::Rank(reduce->shape()) <= 1 && + arg->opcode() == HloOpcode::kTranspose) { auto transpose_dimensions = arg->dimensions(); std::vector new_reduce_dimensions; for (auto dim : dimensions) { diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index d5594dc07c8..774b11478c6 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -997,14 +997,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( auto color = single_colored_set.first; VLOG(2) << "Simulating heap for color " << color; int64 alignment = assignment->color_alignment_(color); + HeapSimulator::Options options; + options.buffers_to_assign = &single_colored_set.second; TF_ASSIGN_OR_RETURN( const HeapSimulator::Result result, HeapSimulator::Run(MakeUnique( MakeUnique(alignment)), assignment->module(), module_sequence, assignment->points_to_analysis(), - assignment->buffer_size_, - &single_colored_set.second)); + assignment->buffer_size_, options)); AssignBuffersFromHeapSimulator(result, assignment, single_colored_set.first); } @@ -1024,14 +1025,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( auto color = single_colored_set.first; VLOG(2) << "Simulating heap for color " << color; int64 alignment = assignment->color_alignment_(color); + HeapSimulator::Options options; + options.buffers_to_assign = &single_colored_set.second; TF_ASSIGN_OR_RETURN( const HeapSimulator::Result result, HeapSimulator::Run(MakeUnique( MakeUnique(alignment)), *computation, *instruction_sequence, assignment->points_to_analysis(), - assignment->buffer_size_, - &single_colored_set.second)); + assignment->buffer_size_, options)); AssignBuffersFromHeapSimulator(result, assignment, single_colored_set.first); } diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index fc67330f5cb..74fd24edf88 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -72,8 +72,18 @@ class AotCompilationOptions { // Returns the ID of the platform to which these options apply. virtual perftools::gputools::Platform::Id PlatformId() const = 0; + // Optional allocator that may be used for allocating temp space on the device + // during compilation. + DeviceMemoryAllocator* device_allocator() const { return device_allocator_; } + void set_device_allocator(DeviceMemoryAllocator* device_allocator) { + device_allocator_ = device_allocator; + } + protected: AotCompilationOptions() = default; + + private: + DeviceMemoryAllocator* device_allocator_ = nullptr; }; // Abstract compiler interface that is subclassed for compilation on a @@ -99,9 +109,16 @@ class Compiler { // Runs Hlo passes to optimize the given Hlo module, returns the optimized // module. + // + // If device_allocator is not null, the compiler may use it to allocate temp + // space on the device for use during compilation. For example, the compiler + // may allocate buffers on the device and then run variants of a given + // algorithm over those buffers, to see which variant is fastest. Any space + // allocated should be deallocated before this function returns. 
virtual StatusOr> RunHloPasses( std::unique_ptr module, - perftools::gputools::StreamExecutor* executor) = 0; + perftools::gputools::StreamExecutor* executor, + DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for execution on a device given by the executor, // and returns an executable object or an error status. No HLO passes are @@ -112,21 +129,27 @@ class Compiler { // The compiler may optionally specialize to the individual device // (not just type of device) indicated by the executor. // + // device_allocator is optional; see RunHloPasses. + // // Use the overload below to compile computations that run in parallel. virtual StatusOr> RunBackend( std::unique_ptr module, - perftools::gputools::StreamExecutor* executor) = 0; + perftools::gputools::StreamExecutor* executor, + DeviceMemoryAllocator* device_allocator) = 0; // Compiles a set of HLO modules that can run in parallel, potentially // communicating data between the modules, and returns a corresponding // sequence of executable objects. // + // device_allocator is optional; see RunHloPasses. + // // TODO(b/68666782): Remove this method after adding support for multiple // modules to RunHloPasses and RunBackends. virtual StatusOr>> Compile( std::vector> modules, std::vector> - stream_exec) = 0; + stream_exec, + DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for ahead-of-time execution. This is intended for // use in static compilation. diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 33af77e1a81..d13a97bcc9a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -437,7 +437,8 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) { StatusOr> CpuCompiler::RunHloPasses( std::unique_ptr module, - perftools::gputools::StreamExecutor* /*stream_exec*/) { + perftools::gputools::StreamExecutor* /*stream_exec*/, + DeviceMemoryAllocator* /*device_allocator*/) { VLOG(2) << "Before optimization:"; XLA_VLOG_LINES(2, module->ToString()); @@ -450,7 +451,8 @@ StatusOr> CpuCompiler::RunHloPasses( StatusOr> CpuCompiler::RunBackend( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) { + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* /*device_allocator*/) { const string timer_message = "Compiling [" + module->name() + "] for CPU using JIT"; XLA_SCOPED_LOGGING_TIMER(timer_message); @@ -517,8 +519,8 @@ StatusOr> CpuCompiler::RunBackend( // ownership is std::moved. const bool embed_ir_in_executable = module->config().debug_options().xla_embed_ir_in_executable(); - const string xla_dump_hlo_proto_to = - module->config().debug_options().xla_dump_hlo_proto_to(); + const string xla_dump_optimized_hlo_proto_to = + module->config().debug_options().xla_dump_optimized_hlo_proto_to(); if (options::CpuParallelBackendRequested(module->config())) { VLOG(1) << "Using parallel cpu backend"; @@ -538,10 +540,10 @@ StatusOr> CpuCompiler::RunBackend( // print one ourselves. 
XLA_VLOG_LINES(2, assignment->ToString()); - if (!xla_dump_hlo_proto_to.empty()) { + if (!xla_dump_optimized_hlo_proto_to.empty()) { HloProto proto = MakeHloProto(*module, *assignment); TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_hlo_proto_to, module->name())); + proto, xla_dump_optimized_hlo_proto_to, module->name())); } // If we are using the parallel CPU backend, we need to create map from @@ -647,10 +649,10 @@ StatusOr> CpuCompiler::RunBackend( // print one ourselves. XLA_VLOG_LINES(2, assignment->ToString()); - if (!xla_dump_hlo_proto_to.empty()) { + if (!xla_dump_optimized_hlo_proto_to.empty()) { HloProto proto = MakeHloProto(*module, *assignment); TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_hlo_proto_to, module->name())); + proto, xla_dump_optimized_hlo_proto_to, module->name())); } // Each computation is a single function. Emit all embedded computations @@ -826,12 +828,12 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, // print one ourselves. XLA_VLOG_LINES(2, assignment->ToString()); - const string xla_dump_hlo_proto_to = - module->config().debug_options().xla_dump_hlo_proto_to(); - if (!xla_dump_hlo_proto_to.empty()) { + const string xla_dump_optimized_hlo_proto_to = + module->config().debug_options().xla_dump_optimized_hlo_proto_to(); + if (!xla_dump_optimized_hlo_proto_to.empty()) { HloProto proto = MakeHloProto(*module, *assignment); TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_hlo_proto_to, module->name())); + proto, xla_dump_optimized_hlo_proto_to, module->name())); } IrEmitter ir_emitter(*module, *assignment, &llvm_module, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index ebed7058d8f..3498139ab95 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -118,11 +118,13 @@ class CpuCompiler : public LLVMCompiler { StatusOr> RunHloPasses( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> modules, diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 71e81331897..0b2d3d47463 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -479,7 +479,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) { Status IrEmitter::HandleSort(HloInstruction* sort) { // TODO(b/26783907): Implement sort on CPU. - return Unimplemented("Sort is not supported on CPU (b/26783907)."); + return Unimplemented("Sort is not implemented on CPU."); } Status IrEmitter::HandleTuple(HloInstruction* tuple) { @@ -522,7 +522,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { // TODO(b/31410564): Implement dilation for reduce-window. if (window_util::HasDilation(window)) { return Unimplemented( - "Dilation for reduce-window not implemented on CPU. See b/31410564."); + "Dilation for ReduceWindow is not implemented on CPU."); } // The called computation should have been emitted previously. 
@@ -625,8 +625,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { // TODO(b/31410564): Implement dilation for select-and-scatter. if (window_util::HasDilation(window)) { return Unimplemented( - "Dilation for select-and-scatter not implemented on CPU. " - "See b/31410564."); + "Dilation for SelectAndScatter is not implemented on CPU. "); } // The select and scatter computations should have been emitted previously. @@ -1196,8 +1195,7 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { } // TODO(b/33011107): Support cross replica sum on CPU. - return Unimplemented( - "Cross replica sum is not implemented on CPU. See b/33011107."); + return Unimplemented("CrossReplicaSum is not implemented on CPU."); } // Fills up the free variables in 'index_with_free_var' with values from @@ -1811,12 +1809,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) { Status IrEmitter::HandleSend(HloInstruction* send) { // TODO(b/33942983): Support Send/Recv on CPU. - return Unimplemented("Send is not implemented on CPU. See b/33942983."); + return Unimplemented("Send is not implemented on CPU."); } Status IrEmitter::HandleSendDone(HloInstruction* send_done) { // TODO(b/33942983): Support Send/Recv on CPU. - return Unimplemented("Send-done is not implemented on CPU. See b/33942983."); + return Unimplemented("Send-done is not implemented on CPU."); } Status IrEmitter::HandleSlice(HloInstruction* slice) { @@ -1981,12 +1979,12 @@ Status IrEmitter::HandleDynamicUpdateSlice( Status IrEmitter::HandleRecv(HloInstruction* recv) { // TODO(b/33942983): Support Send/Recv on CPU. - return Unimplemented("Recv is not implemented on CPU. See b/33942983."); + return Unimplemented("Recv is not implemented on CPU."); } Status IrEmitter::HandleRecvDone(HloInstruction* recv_done) { // TODO(b/33942983): Support Send/Recv on CPU. - return Unimplemented("Recv-done is not implemented on CPU. See b/33942983."); + return Unimplemented("Recv-done is not implemented on CPU."); } Status IrEmitter::HandlePad(HloInstruction* pad) { @@ -1995,10 +1993,10 @@ Status IrEmitter::HandlePad(HloInstruction* pad) { for (auto& padding_dimension : pad->padding_config().dimensions()) { if (padding_dimension.edge_padding_low() < 0 || padding_dimension.edge_padding_high() < 0) { - return Unimplemented( - "Negative padding not supported in the CPU backend (b/34628603); " - "this should have been eliminated at the HLO level: %s", - pad->padding_config().ShortDebugString().c_str()); + return InternalErrorStrCat( + "Encountered negative padding in IrEmitter on CPU. " + "This should have been eliminated at the HLO level. ", + pad->ToString()); } } diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc index 2e4b0a52305..78e7aa48acc 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc @@ -24,7 +24,7 @@ limitations under the License. 
namespace xla { StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator( - perftools::gputools::Platform* platform, + const perftools::gputools::Platform* platform, tensorflow::gtl::ArraySlice stream_executors) : DeviceMemoryAllocator(platform), diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h index 00caefab667..39dfad84c1c 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/compiler/xla/service/device_memory_allocator.h @@ -33,7 +33,7 @@ class DeviceMemoryAllocator { public: // Parameter platform indicates which platform the allocator allocates memory // on. Must be non-null. - explicit DeviceMemoryAllocator(perftools::gputools::Platform* platform) + explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform) : platform_(platform) {} virtual ~DeviceMemoryAllocator() {} @@ -49,14 +49,14 @@ class DeviceMemoryAllocator { int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0; // Return the platform that the allocator allocates memory on. - perftools::gputools::Platform* platform() const { return platform_; } + const perftools::gputools::Platform* platform() const { return platform_; } // Can we call Deallocate() as soon as a computation has been scheduled on // a stream, or do we have to wait for the computation to complete first? virtual bool AllowsAsynchronousDeallocation() const = 0; protected: - perftools::gputools::Platform* platform_; + const perftools::gputools::Platform* platform_; }; // Default memory allocator for a platform which uses @@ -64,7 +64,7 @@ class DeviceMemoryAllocator { class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { public: StreamExecutorMemoryAllocator( - perftools::gputools::Platform* platform, + const perftools::gputools::Platform* platform, tensorflow::gtl::ArraySlice stream_executors); diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 9780bac16ec..4468adbadbf 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -428,7 +428,7 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( llvm::Intrinsic::round, {operand_value}, {operand_value->getType()}, ir_builder_); case HloOpcode::kSign: { - // TODO(b/32151903): Ensure consistent sign behavior for -0.0 + // TODO(b/32151903): Ensure consistent sign behavior for -0.0. auto type = operand_value->getType(); auto zero = llvm::ConstantFP::get(type, 0.0); auto oeq = ir_builder_->CreateFCmpOEQ(operand_value, zero); @@ -870,7 +870,10 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value, StatusOr ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type, llvm::Value* x) const { if (prim_type != F32) { - return Unimplemented("inverse erf only implemented for F32 (b/34339814)"); + // TODO(b/34339814): Implement inverse erf for F64. + return Unimplemented( + "Inverse erf is only implemented for element " + "type F32."); } auto getFloat = [&](const float f) { return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f); @@ -1040,17 +1043,9 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE, lhs_value, rhs_value, ir_builder_); case HloOpcode::kMinimum: - return ir_builder_->CreateSelect( - ir_builder_->CreateICmp( - is_signed ? 
llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE, - lhs_value, rhs_value), - lhs_value, rhs_value); + return EmitIntegralMin(lhs_value, rhs_value, is_signed); case HloOpcode::kMaximum: - return ir_builder_->CreateSelect( - ir_builder_->CreateICmp( - is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE, - lhs_value, rhs_value), - lhs_value, rhs_value); + return EmitIntegralMax(lhs_value, rhs_value, is_signed); case HloOpcode::kAnd: return ir_builder_->CreateAnd(lhs_value, rhs_value); case HloOpcode::kOr: @@ -1067,6 +1062,26 @@ StatusOr ElementalIrEmitter::EmitIntegerBinaryOp( } } +llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value, + llvm::Value* rhs_value, + bool is_signed) const { + return ir_builder_->CreateSelect( + ir_builder_->CreateICmp( + is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE, + lhs_value, rhs_value), + lhs_value, rhs_value); +} + +llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value, + llvm::Value* rhs_value, + bool is_signed) const { + return ir_builder_->CreateSelect( + ir_builder_->CreateICmp( + is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE, + lhs_value, rhs_value), + lhs_value, rhs_value); +} + llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex( const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo, int64 operand_no) const { @@ -1363,7 +1378,18 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( TF_ASSIGN_OR_RETURN(llvm::Value * max_value, operand_to_generator.at(hlo->operand(2))( ElementwiseSourceIndex(index, *hlo, 2))); - return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value)); + PrimitiveType prim_type = hlo->shape().element_type(); + if (primitive_util::IsFloatingPointType(prim_type)) { + return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value)); + } else if (primitive_util::IsIntegralType(prim_type)) { + bool is_signed = primitive_util::IsSignedIntegralType(prim_type); + return EmitIntegralMin( + max_value, EmitIntegralMax(min_value, arg_value, is_signed), + is_signed); + } else { + return Unimplemented("Clamp unimplemented for %s", + PrimitiveType_Name(prim_type).c_str()); + } }; case HloOpcode::kReducePrecision: return [this, hlo, &operand_to_generator]( diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index 1a48eb5fcb9..c516a826d9e 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -86,6 +86,12 @@ class ElementalIrEmitter { virtual llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value) const; + llvm::Value* EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value, + bool is_signed) const; + + llvm::Value* EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value, + bool is_signed) const; + virtual StatusOr EmitErfInv(PrimitiveType prim_type, llvm::Value* value) const; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 3c3328b9cd2..7df01f7edd1 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -131,6 +131,7 @@ cc_library( "ir_emitter_context.h", ], deps = [ + ":cudnn_convolution_runner", ":elemental_ir_emitter", ":gpu_constants", ":gpu_executable", @@ -262,6 +263,7 @@ cc_library( ], deps = [ ":buffer_allocations", + ":cudnn_convolution_runner", ":infeed_manager", ":ir_emission_utils", ":partition_assignment", @@ 
-309,9 +311,41 @@ cc_library( ) cc_library( - name = "convolution_folding", - srcs = ["convolution_folding.cc"], - hdrs = ["convolution_folding.h"], + name = "cudnn_convolution_algorithm_picker", + srcs = ["cudnn_convolution_algorithm_picker.cc"], + hdrs = ["cudnn_convolution_algorithm_picker.h"], + deps = [ + ":cudnn_convolution_runner", + ":gpu_executable", + ":ir_emission_utils", + "//tensorflow/compiler/xla/service:device_memory_allocator", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + ], +) + +cc_library( + name = "cudnn_convolution_runner", + srcs = ["cudnn_convolution_runner.cc"], + hdrs = ["cudnn_convolution_runner.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:stream_executor_no_cuda", + ], +) + +cc_library( + name = "cudnn_convolution_rewriter", + srcs = ["cudnn_convolution_rewriter.cc"], + hdrs = ["cudnn_convolution_rewriter.h"], deps = [ ":ir_emission_utils", "//tensorflow/compiler/xla:literal_util", @@ -325,15 +359,18 @@ cc_library( ) tf_cc_test( - name = "convolution_folding_test", - srcs = ["convolution_folding_test.cc"], + name = "cudnn_convolution_rewriter_test", + srcs = ["cudnn_convolution_rewriter_test.cc"], deps = [ - ":convolution_folding", + ":cudnn_convolution_rewriter", + ":ir_emission_utils", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/service:shape_inference", "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:test", ], ) @@ -446,7 +483,8 @@ cc_library( srcs = ["gpu_compiler.cc"], hdrs = ["gpu_compiler.h"], deps = [ - ":convolution_folding", + ":cudnn_convolution_algorithm_picker", + ":cudnn_convolution_rewriter", ":fusion_merger", ":gpu_constants", ":gpu_copy_insertion", @@ -514,7 +552,6 @@ cc_library( "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", - "@llvm//:core", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 899cc5c83b9..f76f15929d1 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -36,366 +37,69 @@ using se::dnn::DataLayout; using se::dnn::FilterDescriptor; using se::dnn::FilterLayout; -ConvolveScratchAllocator::ConvolveScratchAllocator( - int device_ordinal, DeviceMemoryAllocator* memory_allocator) - : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} - -ConvolveScratchAllocator::~ConvolveScratchAllocator() { - for (auto& allocated_buffer : allocated_buffers_) { - if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer) - .ok()) { - // The program can still continue with failed deallocation. - LOG(ERROR) << "Failed to deallocate the allocated buffer: " - << allocated_buffer.opaque(); - } - } -} - -int64 ConvolveScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) { - constexpr int64 kConvolveScratchSize = 1LL << 32; // 4GB by default. - return kConvolveScratchSize; -} - -se::port::StatusOr> -ConvolveScratchAllocator::AllocateBytes(se::Stream* stream, int64 byte_size) { - CHECK_GE(byte_size, 0) << "byte_size must be positive."; - if (byte_size > GetMemoryLimitInBytes(stream)) { - return se::port::Status( - se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Allocating %lld bytes exceeds the memory limit of %lld bytes.", - byte_size, GetMemoryLimitInBytes(stream))); - } - - auto status_or_memory = - memory_allocator_->Allocate(device_ordinal_, byte_size, - /*retry_on_failure=*/false); - if (!status_or_memory.ok()) { - return se::port::Status(se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Failed to allocate %lld bytes on device %d.", - byte_size, device_ordinal_)); - } - se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie(); - allocated_buffers_.push_back(allocated_buffer); - total_allocated_bytes_ += byte_size; - return se::DeviceMemory(allocated_buffer); -} - -string ConvolutionKindToString( - ConvolutionThunk::ConvolutionKind convolution_kind) { - switch (convolution_kind) { - case ConvolutionThunk::ConvolutionKind::kForward: - return "forward"; - case ConvolutionThunk::ConvolutionKind::kBackwardFilter: - return "backward_filter"; - case ConvolutionThunk::ConvolutionKind::kBackwardInput: - return "backward_input"; - } - return "unknown convolution kind"; -} - ConvolutionThunk::ConvolutionThunk( - ConvolutionKind convolution_kind, - const BufferAllocation::Slice& input_buffer, + CudnnConvKind convolution_kind, const BufferAllocation::Slice& input_buffer, const BufferAllocation::Slice& filter_buffer, - const BufferAllocation::Slice& output_buffer, const Shape& input_shape, + const BufferAllocation::Slice& output_buffer, + const BufferAllocation::Slice& tuple_result_buffer, + const BufferAllocation::Slice& scratch_buffer, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, - const ConvolutionDimensionNumbers& dim_nums, const HloInstruction* hlo) + const ConvolutionDimensionNumbers& dim_nums, int64 algorithm, + const HloInstruction* hlo) : Thunk(Kind::kConvolution, hlo), convolution_kind_(convolution_kind), input_buffer_(input_buffer), filter_buffer_(filter_buffer), output_buffer_(output_buffer), + tuple_result_buffer_(tuple_result_buffer), + scratch_buffer_(scratch_buffer), input_shape_(input_shape), filter_shape_(filter_shape), output_shape_(output_shape), window_(window), - dim_nums_(dim_nums) {} + 
dim_nums_(dim_nums), + algorithm_(algorithm) {} -tensorflow::Status ConvolutionThunk::ExecuteOnStream( +Status ConvolutionThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { - VLOG(3) << "Convolution kind: " << ConvolutionKindToString(convolution_kind_); - VLOG(3) << "input shape: { " << input_shape_.ShortDebugString() << " }"; - VLOG(3) << "filter shape: { " << filter_shape_.ShortDebugString() << " }"; - VLOG(3) << "Output shape: { " << output_shape_.ShortDebugString() << " }"; - VLOG(3) << "Dim nums: { " << dim_nums_.ShortDebugString() << " }"; - VLOG(3) << "Window: { " << window_.ShortDebugString() << " }"; - - const int num_dimensions = window_.dimensions_size(); - CHECK_LE(num_dimensions, 3); - // cuDNN does not support 1D convolutions. We therefore express 1D - // convolutions as 2D convolutions where the first spatial dimension is 1. - // This matches the behavior of TF (see definition of conv1d in - // tensorflow/python/ops/nn_ops.py). - const int effective_num_dimensions = std::max(2, num_dimensions); - - CHECK_EQ(F32, output_shape_.element_type()); - CHECK_EQ(num_dimensions, dim_nums_.input_spatial_dimensions_size()); - CHECK_EQ(num_dimensions, dim_nums_.kernel_spatial_dimensions_size()); - CHECK_EQ(num_dimensions, dim_nums_.output_spatial_dimensions_size()); - for (const WindowDimension& dim : window_.dimensions()) { - CHECK_EQ(dim.padding_low(), dim.padding_high()); - } - - // cuDNN's convolution APIs support the BDYX layout for activations/output and - // the OIYX layout for weights. - BatchDescriptor input_descriptor(effective_num_dimensions); - input_descriptor.set_layout(DataLayout::kBatchDepthYX) - .set_feature_map_count( - input_shape_.dimensions(dim_nums_.input_feature_dimension())) - .set_count(input_shape_.dimensions(dim_nums_.input_batch_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - // Note that the dimensions are reversed. The same holds below. 
- input_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - input_shape_.dimensions(dim_nums_.input_spatial_dimensions(dim))); - } - - FilterDescriptor filter_descriptor(effective_num_dimensions); - filter_descriptor.set_layout(FilterLayout::kOutputInputYX) - .set_input_feature_map_count( - filter_shape_.dimensions(dim_nums_.kernel_input_feature_dimension())) - .set_output_feature_map_count(filter_shape_.dimensions( - dim_nums_.kernel_output_feature_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - filter_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(dim))); - } - - ConvolutionDescriptor convolution_descriptor(effective_num_dimensions); - for (int dim = 0; dim < num_dimensions; ++dim) { - convolution_descriptor - .set_zero_padding( - static_cast(effective_num_dimensions - dim - 1), - window_.dimensions(dim).padding_low()) - .set_filter_stride( - static_cast(effective_num_dimensions - dim - 1), - window_.dimensions(dim).stride()); - } - - BatchDescriptor output_descriptor(effective_num_dimensions); - output_descriptor.set_layout(DataLayout::kBatchDepthYX) - .set_feature_map_count( - output_shape_.dimensions(dim_nums_.output_feature_dimension())) - .set_count(output_shape_.dimensions(dim_nums_.output_batch_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - output_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - output_shape_.dimensions(dim_nums_.output_spatial_dimensions(dim))); - } - - // Add a singleton dimension in the 1D convolution case. - if (num_dimensions == 1) { - input_descriptor.set_spatial_dim(static_cast(0), 1); - output_descriptor.set_spatial_dim(static_cast(0), 1); - filter_descriptor.set_spatial_dim(static_cast(0), 1); - convolution_descriptor - .set_zero_padding(static_cast(0), 0) - .set_filter_stride(static_cast(0), 1); - } - se::DeviceMemory input_data( buffer_allocations.GetDeviceAddress(input_buffer_)); se::DeviceMemory filter_data( buffer_allocations.GetDeviceAddress(filter_buffer_)); se::DeviceMemory output_data( buffer_allocations.GetDeviceAddress(output_buffer_)); - return ConvolveWithTune(input_descriptor, input_data, filter_descriptor, - filter_data, output_descriptor, output_data, - convolution_descriptor, buffer_allocations, stream); -} + se::DeviceMemoryBase scratch = + buffer_allocations.GetDeviceAddress(scratch_buffer_); -tensorflow::Status ConvolutionThunk::Convolve( - const BatchDescriptor& input_descriptor, se::DeviceMemory input_data, - const FilterDescriptor& filter_descriptor, - se::DeviceMemory filter_data, - const BatchDescriptor& output_descriptor, - se::DeviceMemory output_data, - const ConvolutionDescriptor& convolution_descriptor, - const se::dnn::AlgorithmConfig& algorithm_config, se::Stream* stream, - ConvolveScratchAllocator* scratch_allocator, - se::dnn::ProfileResult* profile_result) { - bool launch_ok; - switch (convolution_kind_) { - case ConvolutionKind::kBackwardFilter: - launch_ok = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_descriptor, input_data, output_descriptor, output_data, - convolution_descriptor, filter_descriptor, &filter_data, - scratch_allocator, algorithm_config, profile_result) - .ok(); - break; - case ConvolutionKind::kBackwardInput: - launch_ok = stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_descriptor, filter_data, output_descriptor, - output_data, convolution_descriptor, input_descriptor, - 
&input_data, scratch_allocator, algorithm_config, - profile_result) - .ok(); - break; - case ConvolutionKind::kForward: - launch_ok = - stream - ->ThenConvolveWithAlgorithm( - input_descriptor, input_data, filter_descriptor, filter_data, - convolution_descriptor, output_descriptor, &output_data, - scratch_allocator, algorithm_config, profile_result) - .ok(); - break; - } - if (launch_ok) { - return tensorflow::Status::OK(); - } - return InternalError( - "Unable to launch convolution for thunk %p with type %s and algorithm " - "(%lld, %lld)", - this, ConvolutionKindToString(convolution_kind_).c_str(), - algorithm_config.algorithm().algo_id(), - algorithm_config.algorithm_no_scratch().algo_id()); -} + se::dnn::AlgorithmConfig algorithm_config( + se::dnn::AlgorithmDesc(algorithm_, /*use_tensor_ops=*/false)); -std::vector ConvolutionThunk::GetAlgorithms( - bool with_winograd_nonfused, se::StreamExecutor* stream_exec) const { - std::vector algorithms; - switch (convolution_kind_) { - case ConvolutionKind::kBackwardFilter: - CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms( - with_winograd_nonfused, &algorithms)); - break; - case ConvolutionKind::kBackwardInput: - CHECK(stream_exec->GetConvolveBackwardDataAlgorithms( - with_winograd_nonfused, &algorithms)); - break; - case ConvolutionKind::kForward: - CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused, - &algorithms)); - break; - } - return algorithms; -} + TF_RETURN_IF_ERROR(RunCudnnConvolution( + convolution_kind_, input_shape_, filter_shape_, output_shape_, input_data, + filter_data, output_data, scratch, window_, dim_nums_, algorithm_config, + stream)); -static string AlgorithmToString(const se::dnn::AlgorithmDesc& algo) { - if (algo.tensor_ops_enabled()) { - return tensorflow::strings::StrCat(algo.algo_id(), "+TC"); - } - return tensorflow::strings::StrCat(algo.algo_id()); -} - -// Determines whether we can safely perform a winograd non-fused convolution for -// the given input and output descriptors. This works around b/68264959, an -// integer overflow in cuDNNv5 and cuDNNv6. -static bool ShouldIncludeWinogradNonfusedAlgo( - const BatchDescriptor& input_descriptor, - const BatchDescriptor& output_descriptor) { - int64 batch = input_descriptor.count(); - int64 in_depths = input_descriptor.feature_map_count(); - int64 in_rows = input_descriptor.height(); - int64 in_cols = input_descriptor.width(); - int64 out_depths = output_descriptor.feature_map_count(); - - int64 total_size = 16 * std::ceil(batch / 16.0) * - std::max(in_depths, out_depths) * in_cols * in_rows * - sizeof(float); - int64 threshold = 1L << 31; - - return total_size < threshold; -} - -tensorflow::Status ConvolutionThunk::ConvolveWithTune( - const BatchDescriptor& input_descriptor, se::DeviceMemory input_data, - const FilterDescriptor& filter_descriptor, - se::DeviceMemory filter_data, - const BatchDescriptor& output_descriptor, - se::DeviceMemory output_data, - const ConvolutionDescriptor& convolution_descriptor, - const BufferAllocations& buffer_allocations, se::Stream* stream) { - // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out. - if (!best_algorithm_.has_value()) { - best_algorithm_.emplace(); - - // Auto-tuning either is disabled or only happens in the first run of this - // function. 
- VLOG(2) << "Profiling for best convolution algorithm used for " - "ConvolutionThunk: " - << this; - - bool with_winograd_nonfused = - ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor); - - se::dnn::ProfileResult best_result; - se::dnn::ProfileResult best_result_without_scratch; - std::vector algorithms = - GetAlgorithms(with_winograd_nonfused, stream->parent()); - for (auto algorithm : algorithms) { - ConvolveScratchAllocator scratch_allocator( - buffer_allocations.device_ordinal(), - buffer_allocations.memory_allocator()); - se::dnn::ProfileResult profile_result; - VLOG(3) << "Trying algorithm " << AlgorithmToString(algorithm) - << " for ConvolutionThunk: " << this; - bool launch_ok = - Convolve(input_descriptor, input_data, filter_descriptor, filter_data, - output_descriptor, output_data, convolution_descriptor, - se::dnn::AlgorithmConfig(algorithm, algorithm), stream, - &scratch_allocator, &profile_result) - .ok(); - if (launch_ok && profile_result.is_valid()) { - VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm) - << " for ConvolutionThunk " << this << " succeeded, taking " - << profile_result.elapsed_time_in_ms() - << "ms. (Best result: " << best_result.elapsed_time_in_ms() - << "ms)"; - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalAllocatedBytes() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_without_scratch.elapsed_time_in_ms()) { - best_result_without_scratch = profile_result; - } - } else { - VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm) - << " for ConvolutionThunk " << this << " failed."; - } + // Figure out which of output/input/filter is the result produced by this op, + // and write the result tuple. + void* result_ptr = [&] { + switch (convolution_kind_) { + case CudnnConvKind::kForward: + return output_data.opaque(); + case CudnnConvKind::kBackwardInput: + return input_data.opaque(); + case CudnnConvKind::kBackwardFilter: + return filter_data.opaque(); } + }(); + void* ptrs[] = {result_ptr, scratch.opaque()}; + se::DeviceMemory tuple_addr( + buffer_allocations.GetDeviceAddress(tuple_result_buffer_)); + stream->ThenMemcpyH2D(ptrs, &tuple_addr); - if (best_result.is_valid()) { - best_algorithm_->set_algorithm(best_result.algorithm()); - } else { - LOG(ERROR) << "No convolution algorithm works with profiling. Fall back " - "to the default algorithm."; - best_algorithm_->set_algorithm(AlgorithmDesc()); - } - - if (best_result_without_scratch.is_valid()) { - best_algorithm_->set_algorithm_no_scratch( - best_result_without_scratch.algorithm()); - } else { - LOG(ERROR) << "No convolution algorithm without scratch works with " - "profiling. 
Fall back " - "to the default algorithm."; - best_algorithm_->set_algorithm_no_scratch(AlgorithmDesc()); - } - } - - { - VLOG(2) << "Using convolution algorithm (" - << AlgorithmToString(best_algorithm_->algorithm()) << ", " - << AlgorithmToString(best_algorithm_->algorithm_no_scratch()) - << ") for ConvolutionThunk: " << this; - ConvolveScratchAllocator scratch_allocator( - buffer_allocations.device_ordinal(), - buffer_allocations.memory_allocator()); - return Convolve(input_descriptor, input_data, filter_descriptor, - filter_data, output_descriptor, output_data, - convolution_descriptor, *best_algorithm_, stream, - &scratch_allocator, nullptr); + if (!stream->ok()) { + return InternalError("ConvolutionThunk::ExecuteOnStream failed."); } + return Status::OK(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index 46c94d0bf1e..ca9ef5277b3 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -30,106 +31,47 @@ limitations under the License. namespace xla { namespace gpu { -// A one-time scratch allocator for forward and backward convolution. The -// scratch buffers allocated are released on destruction. -// -// Not thread-safe. -class ConvolveScratchAllocator : public perftools::gputools::ScratchAllocator { - public: - ConvolveScratchAllocator(int device_ordinal, - DeviceMemoryAllocator* memory_allocator); - - ~ConvolveScratchAllocator() override; - - int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override; - - int64 TotalAllocatedBytes() { return total_allocated_bytes_; } - - perftools::gputools::port::StatusOr> - AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override; - - private: - const int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; - int64 total_allocated_bytes_ = 0; -}; - // This class stores everything that StreamExecutor needs to launch a BNN // convolution. It is generated by IrEmitter. // // This is thread-compatible. class ConvolutionThunk : public Thunk { public: - // ConvolutionThunk performs one of the following types of convolution. - enum class ConvolutionKind { - kBackwardFilter, // Backward convolution for filter. - kBackwardInput, // Backward convolution for input. - kForward, // Forward convolution. - }; - - // Constructs a thunk for launching a DNN convolution. + // Constructs a thunk for launching a DNN convolution. When run, it will + // write a tuple (result, scratch_memory) into `tuple_result_buffer`. + // + // `algorithm` is a cudnn algorithm number. `algorithm == -1` indicates that + // we should use the default (i.e. baseline) cudnn algorithm. + // + // Note that "output" here doesn't refer to the output from running this + // thunk, but rather to the "output" of a hypothetical forward convolution + // that corresponds to this input+filter+output triple. 
That is, the result + // generated by this thunk is "output" for forward convs, "input" for + // backward-input convs, and "filter" for backward-filter convs. + // // Semantics of null hlo_instruction argument are as in Thunk. - ConvolutionThunk(ConvolutionKind convolution_kind, + ConvolutionThunk(CudnnConvKind convolution_kind, const BufferAllocation::Slice& input_buffer, const BufferAllocation::Slice& filter_buffer, const BufferAllocation::Slice& output_buffer, + const BufferAllocation::Slice& tuple_result_buffer, + const BufferAllocation::Slice& scratch_buffer, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, - const ConvolutionDimensionNumbers& dnums, + const ConvolutionDimensionNumbers& dim_nums, int64 algorithm, const HloInstruction* hlo); ConvolutionThunk(const ConvolutionThunk&) = delete; ConvolutionThunk& operator=(const ConvolutionThunk&) = delete; - // Does the convolution for the thunk on "stream". Auto-tuning happens on the - // first run of this function. - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; - - // Returns true if the next run of ExecuteOnStream will do autotuning. If so, - // we want the GPU to be quiescent during autotuning, so as not to introduce - // noise in our results. - bool ShouldHaltAllActivityBeforeRunning( - perftools::gputools::Stream*) override { - return !best_algorithm_.has_value(); - } - - // Return true if scratch memory is needed to execute the thunk, that is - // either the best algorithm hasn't been chosen or the best algorithm is not - // the same as the no-scratch algorithm. This is because that the execution - // of the thunk is asynchronous, and the scratch allocator goes out of - // scope before the thunk finishes execution. Returning true tells the stream - // executor to make future thunks wait for this thunk to avoid reusing the - // deallocated scratch memory until this thunk is done with it. - bool ShouldBlockFutureThunks() { - if (!best_algorithm_.has_value()) { - return true; - } - - const perftools::gputools::dnn::AlgorithmDesc& best_alg = - best_algorithm_->algorithm(); - const perftools::gputools::dnn::AlgorithmDesc& no_scratch_best_alg = - best_algorithm_->algorithm_no_scratch(); - return (!best_alg.is_default() || !no_scratch_best_alg.is_default() || - !(best_alg == no_scratch_best_alg)); - } + // Does the convolution for the thunk on "stream". 
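A small sketch (names are illustrative, not from the patch) of the convention spelled out in the comment above: each convolution kind reads two of the (input, filter, output) buffers of the corresponding forward convolution and produces the third, and that produced buffer is what the thunk writes into the result tuple alongside the scratch memory.

# Illustration only.
RESULT_BUFFER = {
    "kForward":        "output",  # reads input and filter
    "kBackwardInput":  "input",   # reads filter and output
    "kBackwardFilter": "filter",  # reads input and output
}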
+ Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + perftools::gputools::Stream* stream) override; private: - tensorflow::Status ConvolveWithTune( - const perftools::gputools::dnn::BatchDescriptor& input_descriptor, - perftools::gputools::DeviceMemory input_data, - const perftools::gputools::dnn::FilterDescriptor& filter_descriptor, - perftools::gputools::DeviceMemory filter_data, - const perftools::gputools::dnn::BatchDescriptor& output_descriptor, - perftools::gputools::DeviceMemory output_data, - const perftools::gputools::dnn::ConvolutionDescriptor& - convolution_descriptor, - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream); + class ScratchAllocator; - tensorflow::Status Convolve( + Status Convolve( const perftools::gputools::dnn::BatchDescriptor& input_descriptor, perftools::gputools::DeviceMemory input_data, const perftools::gputools::dnn::FilterDescriptor& filter_descriptor, @@ -139,40 +81,26 @@ class ConvolutionThunk : public Thunk { const perftools::gputools::dnn::ConvolutionDescriptor& convolution_descriptor, const perftools::gputools::dnn::AlgorithmConfig& algorithm_config, - perftools::gputools::Stream* stream, - ConvolveScratchAllocator* scratch_allocator, + perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator, perftools::gputools::dnn::ProfileResult* profile_result); - // Returns the convolve algorithms that can be used for this ConvolutionThunk. - std::vector GetAlgorithms( - bool with_winograd_nonfused, - perftools::gputools::StreamExecutor* stream_exec) const; - - // Fastest cuDNN convolution algorithm for this thunk learned from - // auto-tuning. If auto-tuning is disabled or failed, best_algorithm_ is set - // to the default value, indicating cuDNN's convolution will choose the best - // algorithm from some heuristics based on its parameters. - tensorflow::gtl::optional - best_algorithm_; - - const ConvolutionKind convolution_kind_; + const CudnnConvKind convolution_kind_; const BufferAllocation::Slice input_buffer_; const BufferAllocation::Slice filter_buffer_; const BufferAllocation::Slice output_buffer_; + const BufferAllocation::Slice tuple_result_buffer_; + const BufferAllocation::Slice scratch_buffer_; const Shape input_shape_; const Shape filter_shape_; const Shape output_shape_; const Window window_; - const ConvolutionDimensionNumbers dim_nums_; + int64 algorithm_; }; -string ConvolutionKindToString( - ConvolutionThunk::ConvolutionKind convolution_kind); - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc new file mode 100644 index 00000000000..621b2d510fa --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc @@ -0,0 +1,370 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" +#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace xla { +namespace gpu { +namespace { + +namespace se = perftools::gputools; + +using se::DeviceMemoryBase; +using se::dnn::AlgorithmConfig; +using se::dnn::AlgorithmDesc; +using tensorflow::gtl::nullopt; +using tensorflow::gtl::optional; + +class ScratchAllocator : public se::ScratchAllocator { + public: + ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator) + : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} + + ~ScratchAllocator() override; + + int64 GetMemoryLimitInBytes(se::Stream* stream) override { + return 1LL << 32; // 4GB. TODO(jlebar): Tune this? + } + int64 TotalAllocatedBytes() { return total_allocated_bytes_; } + + se::port::StatusOr> AllocateBytes( + se::Stream* stream, int64 byte_size) override; + + private: + const int device_ordinal_; + DeviceMemoryAllocator* memory_allocator_; + std::vector allocated_buffers_; + int64 total_allocated_bytes_ = 0; +}; + +ScratchAllocator::~ScratchAllocator() { + for (auto& allocated_buffer : allocated_buffers_) { + if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer) + .ok()) { + // The program can still continue with failed deallocation. + LOG(ERROR) << "Failed to deallocate the allocated buffer: " + << allocated_buffer.opaque(); + } + } +} + +se::port::StatusOr> ScratchAllocator::AllocateBytes( + se::Stream* stream, int64 byte_size) { + CHECK_GE(byte_size, 0) << "byte_size must be positive."; + if (byte_size > GetMemoryLimitInBytes(stream)) { + return se::port::Status( + se::port::error::RESOURCE_EXHAUSTED, + tensorflow::strings::Printf( + "Allocating %lld bytes exceeds the memory limit of %lld bytes.", + byte_size, GetMemoryLimitInBytes(stream))); + } + + auto status_or_memory = + memory_allocator_->Allocate(device_ordinal_, byte_size, + /*retry_on_failure=*/false); + if (!status_or_memory.ok()) { + return se::port::Status(se::port::error::RESOURCE_EXHAUSTED, + tensorflow::strings::Printf( + "Failed to allocate %lld bytes on device %d.", + byte_size, device_ordinal_)); + } + se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie(); + allocated_buffers_.push_back(allocated_buffer); + total_allocated_bytes_ += byte_size; + return se::DeviceMemory(allocated_buffer); +} + +// Determines whether we can safely perform a winograd non-fused convolution for +// the given input and output shapes. This works around b/68264959, an integer +// overflow in cuDNNv5 and cuDNNv6. +// +// TODO(jlebar): We shouldn't need this check for cuDNNv7. +bool ShouldIncludeWinogradNonfusedAlgo( + const Shape& input_shape, const Shape& output_shape, + const ConvolutionDimensionNumbers& dnums) { + int64 batch = input_shape.dimensions(dnums.input_batch_dimension()); + int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension()); + int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0)); + int64 in_cols = + dnums.input_spatial_dimensions_size() == 1 + ? 
1 + : input_shape.dimensions(dnums.input_spatial_dimensions(1)); + int64 out_depths = output_shape.dimensions(dnums.output_feature_dimension()); + + int64 total_size = CeilOfRatio(batch, int64{16}) * + std::max(in_depths, out_depths) * in_cols * in_rows * + sizeof(float); + + const int64 threshold = 1L << 31; + return total_size < threshold; +} + +std::vector GetAlgorithms(CudnnConvKind kind, + bool with_winograd_nonfused, + se::StreamExecutor* stream_exec_) { + std::vector algorithms; + switch (kind) { + case CudnnConvKind::kBackwardFilter: + CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms( + with_winograd_nonfused, &algorithms)); + break; + case CudnnConvKind::kBackwardInput: + CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms( + with_winograd_nonfused, &algorithms)); + break; + case CudnnConvKind::kForward: + CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused, + &algorithms)); + break; + } + + // Remove any algorithms with tensor math enabled. These have lower precision + // than regular algorithms, and we don't yet have a way to turn this on/off in + // XLA. + algorithms.erase(std::remove_if(algorithms.begin(), algorithms.end(), + [&](const AlgorithmDesc& a) { + return a.tensor_ops_enabled(); + }), + algorithms.end()); + + return algorithms; +} + +string AlgorithmToString(const AlgorithmDesc& algo) { + if (algo.tensor_ops_enabled()) { + return tensorflow::strings::StrCat(algo.algo_id(), "+TC"); + } + return tensorflow::strings::StrCat(algo.algo_id()); +} + +string NumBytesToString(int64 bytes) { + return tensorflow::strings::StrCat( + tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)"); +} + +} // anonymous namespace + +// We could have caching here so that we don't redo this work for two identical +// convolutions. Unfortunately our cache key would have to be a tuple +// containing the protos passed to this function, and we have no utility for +// hashing protos. We could write our own hash functions, but they'd silently +// break if we ever added a field to one of the protos. Perhaps we could hack +// using the binary-encoded proto as the hash key, on the assumption that two +// protos being binary-equal is a sufficient, if not necessary, condition for +// proper equality. But that would still leave us open to having unnecessary +// cache misses and doing extra work. Overall, caching doesn't seem worth the +// trouble, but we may want to revisit this if we ever find a model where +// caching would speed up compilation a lot. +optional> +CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, const Window& window, + const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) { + // Create a stream for us to do our work on. + se::Stream stream{stream_exec_}; + stream.Init(); + const auto device_ordinal = stream_exec_->device_ordinal(); + + // allocator either points to this->allocator_ or, if that's null, to a + // StreamExecutorMemoryAllocator for stream_exec_. + DeviceMemoryAllocator* allocator; + optional se_allocator; + if (allocator_ != nullptr) { + allocator = allocator_; + } else { + se_allocator.emplace( + stream_exec_->platform(), + tensorflow::gtl::ArraySlice({stream_exec_})); + allocator = &*se_allocator; + } + + // Allocate space for the input, filter, and output of the convolution. We + // use a ScratchAllocator for this instead of calling allocator_ directly so + // that our allocations don't leak. 
+ // + // We don't put any data in these buffers, because (in theory, anyway) the + // speed of a conv isn't affected by the data being convolved. + ScratchAllocator input_output_allocator(device_ordinal, allocator); + se::port::StatusOr input_buf = + input_output_allocator.AllocateBytes(&stream, + ShapeUtil::ByteSizeOf(input_shape)); + se::port::StatusOr filter_buf = + input_output_allocator.AllocateBytes(&stream, + ShapeUtil::ByteSizeOf(filter_shape)); + se::port::StatusOr output_buf = + input_output_allocator.AllocateBytes(&stream, + ShapeUtil::ByteSizeOf(output_shape)); + if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) { + LOG(WARNING) + << "Couldn't allocate space for input/filter/output of convolution " + << instr->ToString() << ". Falling back to default algorithm."; + return nullopt; + } + + const bool use_winograd_nonfused = + ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums); + se::dnn::ProfileResult best_result; + int64 best_result_bytes_used = 0; + for (const AlgorithmDesc& alg : + GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) { + ScratchAllocator scratch_allocator(device_ordinal, allocator); + se::dnn::ProfileResult profile_result; + VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for " + << instr->ToString(); + + bool launch_ok = + RunCudnnConvolution(kind, input_shape, filter_shape, output_shape, + se::DeviceMemory(input_buf.ValueOrDie()), + se::DeviceMemory(filter_buf.ValueOrDie()), + se::DeviceMemory(output_buf.ValueOrDie()), + &scratch_allocator, window, dnums, + AlgorithmConfig(alg), &stream, &profile_result) + .ok(); + + if (launch_ok && profile_result.is_valid()) { + int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes(); + VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) + << " succeeded, taking " << profile_result.elapsed_time_in_ms() + << "ms and using " << NumBytesToString(scratch_bytes_used) + << " of scratch (Best result: " + << best_result.elapsed_time_in_ms() << "ms, " + << NumBytesToString(best_result_bytes_used) << " of scratch)"; + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + best_result_bytes_used = scratch_bytes_used; + } + } else { + VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed."; + } + } + if (best_result.is_valid()) { + VLOG(2) << "Best algorithm for " << instr->ToString() << ": " + << AlgorithmToString(best_result.algorithm()) << ", takes " + << best_result.elapsed_time_in_ms() << "ms, and uses " + << best_result_bytes_used << "B of scratch memory."; + return std::make_pair(best_result.algorithm().algo_id(), + best_result_bytes_used); + } + + LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString() + << " failed. 
Falling back to default algorithm."; + return nullopt; +} + +StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( + HloInstruction* instr) { + CHECK(IsCustomCallToDnnConvolution(*instr)); + + const auto& call_target = instr->custom_call_target(); + const auto& lhs_shape = instr->operand(0)->shape(); + const auto& rhs_shape = instr->operand(1)->shape(); + const auto& conv_result_shape = instr->shape().tuple_shapes(0); + optional> alg_and_scratch_bytes; + if (call_target == kCudnnConvForwardCallTarget) { + alg_and_scratch_bytes = PickBestAlgorithm( + CudnnConvKind::kForward, /*input_shape=*/lhs_shape, + /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape, + instr->window(), instr->convolution_dimension_numbers(), instr); + } else if (call_target == kCudnnConvBackwardInputCallTarget) { + alg_and_scratch_bytes = PickBestAlgorithm( + CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape, + /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape, instr->window(), + instr->convolution_dimension_numbers(), instr); + } else if (call_target == kCudnnConvBackwardFilterCallTarget) { + alg_and_scratch_bytes = PickBestAlgorithm( + CudnnConvKind::kBackwardFilter, /*input_shape=*/lhs_shape, + /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape, + instr->window(), instr->convolution_dimension_numbers(), instr); + } else { + LOG(FATAL) << "Unknown custom call target for cudnn conv: " + << instr->ToString(); + } + + if (!alg_and_scratch_bytes.has_value()) { + return false; + } + + int64 algorithm; + int64 scratch_bytes; + std::tie(algorithm, scratch_bytes) = *alg_and_scratch_bytes; + + VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and " + << NumBytesToString(scratch_bytes) + << " of scratch memory: " << instr->ToString(); + + // Replace instr with a new CustomCall which has the correct algorithm, and + // whose output shape has the appropriate amount of scratch memory. + HloComputation* computation = instr->parent(); + Shape new_call_shape = + ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0), + ShapeUtil::MakeShape(U8, {scratch_bytes})}); + HloInstruction* algorithm_hlo = computation->AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(algorithm))); + HloInstruction* new_call = + computation->AddInstruction(HloInstruction::CreateCustomCall( + new_call_shape, + {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo}, + instr->custom_call_target())); + new_call->set_window(instr->window()); + new_call->set_convolution_dimension_numbers( + instr->convolution_dimension_numbers()); + + // Repackage new_call so it has the same shape as the original call, namely + // (conv_result, u8[0]). 
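+  // For example (shapes illustrative): if the conv result is f32[16,16,32,64]
+  // and the picker chose 1024 bytes of scratch, then
+  //   new_call:  (f32[16,16,32,64], u8[1024])
+  //   new_tuple: tuple(gte(new_call, 0), u8[0] constant)
+  // so consumers that expected the original (conv_result, u8[0]) shape are
+  // unaffected, and the unused u8[0] element can later be simplified away.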
+ HloInstruction* new_tuple = + computation->AddInstruction(HloInstruction::CreateTuple( + {computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_call_shape.tuple_shapes(0), new_call, 0)), + computation->AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR1({})))})); + + TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple)); + return true; +} + +StatusOr CudnnConvolutionAlgorithmPicker::RunOnComputation( + HloComputation* computation) { + std::vector convs; + for (auto* instr : computation->instructions()) { + if (IsCustomCallToDnnConvolution(*instr)) { + convs.push_back(instr); + } + } + + bool changed = false; + for (auto* instr : convs) { + TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr)); + changed |= result; + } + return changed; +} + +StatusOr CudnnConvolutionAlgorithmPicker::Run(HloModule* module) { + bool changed = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation)); + changed |= result; + } + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h new file mode 100644 index 00000000000..10e49daee5d --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h @@ -0,0 +1,62 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_ + +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +// Modifies CustomCalls to cudnn convolutions, choosing the best algorithm for +// each and adding explicit scratch space to the CustomCalls. +class CudnnConvolutionAlgorithmPicker : public HloPassInterface { + public: + // If the `allocator` parameter is not null, we will use it to allocate temp + // memory while timing the various convolution algorithms. If it's null, + // we'll use the default allocator on the StreamExecutor. 
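+  //
+  // Example usage (a sketch; this mirrors how GpuCompiler wires the pass into
+  // its "conv_canonicalization" pipeline in this change -- variable names are
+  // illustrative):
+  //
+  //   HloPassPipeline pipeline("conv_canonicalization");
+  //   pipeline.AddPass<CudnnConvolutionRewriter>();
+  //   pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(stream_exec,
+  //                                                     /*allocator=*/nullptr);
+  //   TF_RETURN_IF_ERROR(pipeline.Run(module).status());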
+ CudnnConvolutionAlgorithmPicker( + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* allocator) + : stream_exec_(stream_exec), allocator_(allocator) {} + + tensorflow::StringPiece name() const override { + return "cudnn-convolution-algorithm-picker"; + } + + StatusOr Run(HloModule* module) override; + + private: + StatusOr RunOnComputation(HloComputation* computation); + StatusOr RunOnInstruction(HloInstruction* instr); + tensorflow::gtl::optional> PickBestAlgorithm( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, const Window& window, + const ConvolutionDimensionNumbers& dnums, HloInstruction* instr); + + perftools::gputools::StreamExecutor* stream_exec_; // never null + DeviceMemoryAllocator* allocator_; // may be null +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc similarity index 83% rename from tensorflow/compiler/xla/service/gpu/convolution_folding.cc rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc index b0626ca3bc9..e0c73aa73ac 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h" #include #include @@ -33,14 +33,32 @@ namespace xla { namespace gpu { namespace { + +bool CanImplementAsCudnnForwardConv(HloInstruction* conv) { + const ConvolutionDimensionNumbers& dnums = + conv->convolution_dimension_numbers(); + if (dnums.input_spatial_dimensions_size() > 3) { + return false; + } + + // CuDNN does not accept zero-element arguments + if (ShapeUtil::HasZeroElements(conv->operand(0)->shape()) || + ShapeUtil::HasZeroElements(conv->operand(1)->shape())) { + return false; + } + + if (window_util::HasWindowReversal(conv->window())) { + return false; + } + return true; +} + // Try to match a backward filter pattern that contains "conv". // Precondition: "conv" is a kConvolution. -std::tuple, Window, - ConvolutionDimensionNumbers> -MatchBackwardFilter(HloInstruction* conv) { +std::tuple MatchBackwardFilter( + HloInstruction* conv) { const auto no_match_result = - std::make_tuple(false, std::vector(), Window(), - ConvolutionDimensionNumbers()); + std::make_tuple(false, Window(), ConvolutionDimensionNumbers()); // Step 1: match the instruction pattern without considering the paddings and // dimension numbers just yet. 
We may need some generic pattern matcher // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h @@ -190,18 +208,15 @@ MatchBackwardFilter(HloInstruction* conv) { backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]); } - return std::make_tuple(true, std::vector({conv}), - backward_conv_window, backward_conv_dnums); + return std::make_tuple(true, backward_conv_window, backward_conv_dnums); } // Try to match a backward input pattern that contains "conv". // Precondition: "conv" is a kConvolution. -std::tuple, Window, - ConvolutionDimensionNumbers> -MatchBackwardInput(HloInstruction* conv) { +std::tuple MatchBackwardInput( + HloInstruction* conv) { const auto no_match_result = - std::make_tuple(false, std::vector(), Window(), - ConvolutionDimensionNumbers()); + std::make_tuple(false, Window(), ConvolutionDimensionNumbers()); // Match instruction pattern. CHECK_EQ(HloOpcode::kConvolution, conv->opcode()); @@ -374,16 +389,63 @@ MatchBackwardInput(HloInstruction* conv) { dnums.set_kernel_output_feature_dimension( conv->convolution_dimension_numbers().kernel_input_feature_dimension()); - return std::make_tuple(true, - std::vector({conv, reverse_filter}), - new_window, dnums); + return std::make_tuple(true, new_window, dnums); } -} // namespace -StatusOr ConvolutionFolding::Run(HloModule* module) { - HloComputation* entry_computation = module->entry_computation(); +// Tries to rewrite a single convolution into a call to cudnn. +StatusOr RunOnInstruction(HloInstruction* conv) { + CHECK_EQ(conv->opcode(), HloOpcode::kConvolution); + + HloInstruction* custom_call = [&]() -> HloInstruction* { + bool match; + Window window; + ConvolutionDimensionNumbers dnums; + + std::tie(match, window, dnums) = MatchBackwardFilter(conv); + if (match) { + return CreateCudnnConvBackwardFilter( + conv->shape(), conv->mutable_operand(0), conv->mutable_operand(1), + window, dnums); + } + + std::tie(match, window, dnums) = MatchBackwardInput(conv); + if (match) { + // Backward input conv subsumes the conv plus the reverse in operand 1. + HloInstruction* reverse = conv->mutable_operand(1); + CHECK_EQ(reverse->opcode(), HloOpcode::kReverse); + HloInstruction* rhs = reverse->mutable_operand(0); + + return CreateCudnnConvBackwardInput( + conv->shape(), conv->mutable_operand(0), rhs, window, dnums); + } + + // If all else fails, try a forward convolution. + if (CanImplementAsCudnnForwardConv(conv)) { + return CreateCudnnConvForward(conv->shape(), conv->mutable_operand(0), + conv->mutable_operand(1), conv->window(), + conv->convolution_dimension_numbers()); + } + + return nullptr; + }(); + + if (custom_call == nullptr) { + return false; + } + + // The CustomCall returns a tuple (conv_result, scratch_memory). Extract out + // the conv result and replace `conv` with it. + TF_RETURN_IF_ERROR(conv->parent()->ReplaceWithNewInstruction( + conv, + HloInstruction::CreateGetTupleElement(conv->shape(), custom_call, 0))); + return true; +} + +// Rewrites the convolutions in the given computation into calls to cudnn. +// Returns true if it made any changes. 
+StatusOr RunOnComputation(HloComputation* computation) { std::vector convs; - for (auto* hlo : entry_computation->instructions()) { + for (auto* hlo : computation->instructions()) { if (hlo->opcode() == HloOpcode::kConvolution) { convs.push_back(hlo); } @@ -391,41 +453,18 @@ StatusOr ConvolutionFolding::Run(HloModule* module) { bool changed = false; for (HloInstruction* conv : convs) { - bool match; - std::vector hlos_to_fuse; - Window window; - ConvolutionDimensionNumbers dnums; - std::tie(match, hlos_to_fuse, window, dnums) = MatchBackwardFilter(conv); - if (match) { - VLOG(2) << "Fuse instructions"; - for (HloInstruction* hlo_to_fuse : hlos_to_fuse) { - VLOG(2) << " " << hlo_to_fuse->ToString(); - } - HloInstruction* backward_convolution = - entry_computation->CreateFusionInstructionForBackwardConvolution( - hlos_to_fuse, HloInstruction::FusionKind::kConvBackwardFilter, - window, dnums); - VLOG(2) << "to backward filter convolution"; - VLOG(2) << " " << backward_convolution->ToString(); - changed = true; - continue; - } + TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(conv)); + changed |= result; + } + return changed; +} +} // namespace - std::tie(match, hlos_to_fuse, window, dnums) = MatchBackwardInput(conv); - if (match) { - VLOG(2) << "Fuse instructions"; - for (HloInstruction* hlo_to_fuse : hlos_to_fuse) { - VLOG(2) << " " << hlo_to_fuse->ToString(); - } - HloInstruction* backward_convolution = - entry_computation->CreateFusionInstructionForBackwardConvolution( - hlos_to_fuse, HloInstruction::FusionKind::kConvBackwardInput, - window, dnums); - VLOG(2) << "to backward input convolution"; - VLOG(2) << " " << backward_convolution->ToString(); - changed = true; - continue; - } +StatusOr CudnnConvolutionRewriter::Run(HloModule* module) { + bool changed = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation)); + changed |= result; } return changed; } diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h similarity index 63% rename from tensorflow/compiler/xla/service/gpu/convolution_folding.h rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h index f9c898721f8..0c0578d8884 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_folding.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_ #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" @@ -22,10 +22,12 @@ limitations under the License. 
namespace xla { namespace gpu { -class ConvolutionFolding : public HloPassInterface { +// Rewrites plain convolutions, backwards-filter convolutions, and +// backwards-input convolutions into CustomCall HLOs that call into cuDNN. +class CudnnConvolutionRewriter : public HloPassInterface { public: tensorflow::StringPiece name() const override { - return "convolution-folding"; + return "cudnn-convolution-rewriter"; } StatusOr Run(HloModule* module) override; @@ -34,4 +36,4 @@ class ConvolutionFolding : public HloPassInterface { } // namespace gpu } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc similarity index 82% rename from tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc index 34e6bdb117d..65588b6aaf2 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,23 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/platform/test.h" namespace xla { namespace gpu { +namespace { -class ConvolutionFoldingTest : public HloTestBase { +namespace op = xla::testing::opcode_matchers; + +class CudnnConvolutionRewriterTest : public HloTestBase { public: - ConvolutionFoldingTest() { + CudnnConvolutionRewriterTest() { for (int i = 0; i < 2; ++i) { WindowDimension* window_dim = default_conv_window_.add_dimensions(); window_dim->set_size(1); @@ -44,7 +50,8 @@ class ConvolutionFoldingTest : public HloTestBase { // the batch and feature dimension in the activations, and treat the batch // dimension in gradients as the input feature dimension in the filter. // - // TODO(jingyue): Add more tests on NCHW input order which TF also supports. + // TODO(jingyue): Add more tests on NCHW input order, which TF also + // supports. 
tf_default_dnums_for_backward_filter_.set_input_batch_dimension(3); tf_default_dnums_for_backward_filter_.set_input_feature_dimension(0); tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(1); @@ -74,9 +81,8 @@ class ConvolutionFoldingTest : public HloTestBase { } protected: - bool FoldConvolution(HloModule* module) { - ConvolutionFolding convolution_folding; - return convolution_folding.Run(module).ValueOrDie(); + bool RunPass(HloModule* module) { + return CudnnConvolutionRewriter().Run(module).ValueOrDie(); } // A convolution window with stride 1 and zero padding. The size fields are @@ -86,7 +92,7 @@ class ConvolutionFoldingTest : public HloTestBase { ConvolutionDimensionNumbers tf_default_dnums_for_backward_input_; }; -TEST_F(ConvolutionFoldingTest, BackwardFilterConvolve) { +TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolve) { HloComputation::Builder builder(TestName()); HloInstruction* activations = builder.AddInstruction(HloInstruction::CreateParameter( @@ -108,14 +114,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolve) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } -TEST_F(ConvolutionFoldingTest, +TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolveEquivalentToForwardConvolution) { HloComputation::Builder builder(TestName()); HloInstruction* activations = @@ -135,12 +140,17 @@ TEST_F(ConvolutionFoldingTest, tf_default_dnums_for_backward_filter_)); auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } // Extracted from block35 training. -TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) { +TEST_F(CudnnConvolutionRewriterTest, + BackwardFilterConvolveWithPaddedActivations) { auto builder = HloComputation::Builder(TestName()); HloInstruction* activations = builder.AddInstruction(HloInstruction::CreateParameter( @@ -162,15 +172,15 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } // Extracted from inception v3 training. 
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) { +TEST_F(CudnnConvolutionRewriterTest, + BackwardFilterConvolveWithPaddedGradients) { auto builder = HloComputation::Builder(TestName()); HloInstruction* activations = builder.AddInstruction(HloInstruction::CreateParameter( @@ -192,14 +202,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } -TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) { +TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolveWithUnevenPadding) { auto builder = HloComputation::Builder(TestName()); HloInstruction* activations = builder.AddInstruction(HloInstruction::CreateParameter( @@ -221,14 +230,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); } -TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) { +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveEvenPadding) { auto builder = HloComputation::Builder(TestName()); HloInstruction* output = builder.AddInstruction(HloInstruction::CreateParameter( @@ -272,14 +280,15 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + + ASSERT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); + const HloInstruction* custom_call = + entry_computation->root_instruction()->operand(0); for (int i = 0; i < 2; ++i) { - const WindowDimension& window_dim = - entry_computation->root_instruction()->window().dimensions(i); + const WindowDimension& window_dim = custom_call->window().dimensions(i); // Low padding of the backward input convolution // = kernel_size - 1 - low padding on gradients. 
EXPECT_EQ(3, window_dim.padding_low()); @@ -291,7 +300,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) { // Convolve([abc], [x], base_dilation=2) // = Convolve([abc], Reverse([x]), base_dilation=2) // = BackwardInputConvolve([abc], [x], stride=2) -TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) { +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolve1x1Filter) { auto builder = HloComputation::Builder(TestName()); // NHWC dimension order. HloInstruction* output = @@ -316,17 +325,16 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); } // BackwardInputConvolve([abc], [x], stride=1) is equivalent to // ForwardConvolve([abc], [x], stride=1). No need to fold it into backward input // convolution. -TEST_F(ConvolutionFoldingTest, +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolve1x1FilterEquivalentToForwardConvolve) { auto builder = HloComputation::Builder(TestName()); // NHWC dimension order. @@ -347,8 +355,12 @@ TEST_F(ConvolutionFoldingTest, tf_default_dnums_for_backward_input_)); auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(module.get())); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT( + entry_computation->root_instruction(), + op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); } // Extracted from Inception V3 training. @@ -365,7 +377,8 @@ TEST_F(ConvolutionFoldingTest, // 20x10x10x192 // // Gradients are padded unevenly. 
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) { +TEST_F(CudnnConvolutionRewriterTest, + BackwardInputConvolveUnevenPaddingOnGradients) { auto builder = HloComputation::Builder(TestName()); HloInstruction* output = builder.AddInstruction(HloInstruction::CreateParameter( @@ -397,14 +410,14 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) { auto module = CreateNewModule(); HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - EXPECT_EQ(HloOpcode::kFusion, - entry_computation->root_instruction()->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == - entry_computation->root_instruction()->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + ASSERT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); + const HloInstruction* custom_call = + entry_computation->root_instruction()->operand(0); for (int i = 0; i < 2; ++i) { - const WindowDimension& window_dim = - entry_computation->root_instruction()->window().dimensions(i); + const WindowDimension& window_dim = custom_call->window().dimensions(i); EXPECT_EQ(0, window_dim.padding_low()); EXPECT_EQ(0, window_dim.padding_high()); EXPECT_EQ(2, window_dim.stride()); @@ -413,7 +426,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) { // Similar to BackwardInputConvolveUnevenPadding, but the low padding of the // gradients exceeds kernel_size - 1. Therefore, this pattern cannot be fused. -TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) { +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveLowPaddingTooLarge) { auto builder = HloComputation::Builder(TestName()); HloInstruction* output = builder.AddInstruction(HloInstruction::CreateParameter( @@ -442,8 +455,12 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) { .ValueOrDie())); auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(module.get())); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT( + entry_computation->root_instruction(), + op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); } // Extracted from //learning/brain/google/xla/benchmarks/resnet.py @@ -460,7 +477,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) { // // We should fuse BC even though padding on activations is uneven, because // PadInsertion will canonicalize the fusion HLO. -TEST_F(ConvolutionFoldingTest, +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveUnevenPaddingOnActivations) { auto builder = HloComputation::Builder(TestName()); // The gradients are in NCHW layout. 
@@ -493,13 +510,12 @@ TEST_F(ConvolutionFoldingTest, auto module = CreateNewModule(); const HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - EXPECT_TRUE(FoldConvolution(module.get())); - const HloInstruction* backward_conv = entry_computation->root_instruction(); - EXPECT_EQ(HloOpcode::kFusion, backward_conv->opcode()); - EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput == - backward_conv->fusion_kind()); + EXPECT_TRUE(RunPass(module.get())); + ASSERT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget), 0)); const WindowDimension& backward_conv_col_dim = - backward_conv->window().dimensions(1); + entry_computation->root_instruction()->operand(0)->window().dimensions(1); EXPECT_EQ(0, backward_conv_col_dim.padding_low()); EXPECT_EQ(1, backward_conv_col_dim.padding_high()); } @@ -515,7 +531,7 @@ TEST_F(ConvolutionFoldingTest, // // We currently don't fuse BC because PadInsertion doesn't support negative // padding on the gradients of backward convolution (b/32744257). -TEST_F(ConvolutionFoldingTest, +TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveNegativePaddingHighOnActivations) { auto builder = HloComputation::Builder(TestName()); // The gradients are in NCHW layout. @@ -544,9 +560,14 @@ TEST_F(ConvolutionFoldingTest, .ValueOrDie())); auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(FoldConvolution(module.get())); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + EXPECT_THAT( + entry_computation->root_instruction(), + op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0)); } +} // anonymous namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc new file mode 100644 index 00000000000..f5f52cf62bf --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc @@ -0,0 +1,221 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { +namespace gpu { +namespace { + +namespace se = ::perftools::gputools; + +using se::DeviceMemory; +using se::DeviceMemoryBase; +using se::Stream; +using se::dnn::AlgorithmConfig; +using se::dnn::BatchDescriptor; +using se::dnn::ConvolutionDescriptor; +using se::dnn::DataLayout; +using se::dnn::DimIndex; +using se::dnn::FilterDescriptor; +using se::dnn::FilterLayout; +using se::dnn::ProfileResult; + +// A StreamExecutor ScratchAllocator that wraps a single XLA allocation, +// returning it (in its entirety) the first time Allocate() is called. +class ScratchBufAllocator : public se::ScratchAllocator { + public: + explicit ScratchBufAllocator(se::DeviceMemoryBase scratch) + : scratch_(scratch) {} + + ~ScratchBufAllocator() override = default; + + int64 GetMemoryLimitInBytes(se::Stream* /*stream*/) override { + return scratch_.size(); + } + + se::port::StatusOr> AllocateBytes( + se::Stream* stream, int64 byte_size) override { + if (allocated_) { + return se::port::InternalError( + "Can't allocate twice from a ScratchBufAllocator."); + } + if (byte_size > scratch_.size()) { + return se::port::InternalError(tensorflow::strings::StrCat( + "Can't allocate ", byte_size, + " bytes from a ScratchBufAllocator of size ", scratch_.size())); + } + + allocated_ = true; + return se::DeviceMemory(scratch_); + } + + private: + se::DeviceMemoryBase scratch_; + bool allocated_ = false; +}; + +} // anonymous namespace + +string CudnnConvKindToString(CudnnConvKind kind) { + switch (kind) { + case CudnnConvKind::kForward: + return "forward"; + case CudnnConvKind::kBackwardFilter: + return "backward_filter"; + case CudnnConvKind::kBackwardInput: + return "backward_input"; + } +} + +Status RunCudnnConvolution(CudnnConvKind kind, const Shape& input_shape, + const Shape& filter_shape, const Shape& output_shape, + DeviceMemory input_buf, + DeviceMemory filter_buf, + DeviceMemory output_buf, + DeviceMemoryBase scratch_buf, const Window& window, + const ConvolutionDimensionNumbers& dnums, + AlgorithmConfig algorithm, Stream* stream, + ProfileResult* profile_result /*= nullptr*/) { + ScratchBufAllocator scratch_allocator(scratch_buf); + return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape, + input_buf, filter_buf, output_buf, + &scratch_allocator, window, dnums, algorithm, + stream, profile_result); +} + +Status RunCudnnConvolution( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, DeviceMemory input_buf, + DeviceMemory filter_buf, DeviceMemory output_buf, + se::ScratchAllocator* scratch_allocator, const Window& window, + const ConvolutionDimensionNumbers& dnums, AlgorithmConfig algorithm, + Stream* stream, ProfileResult* profile_result /*= nullptr*/) { + VLOG(3) << "Convolution kind: " << CudnnConvKindToString(kind); + VLOG(3) << "input shape: { " << ShapeUtil::HumanString(input_shape) << " }"; + VLOG(3) << "filter shape: { " << ShapeUtil::HumanString(filter_shape) << " }"; + VLOG(3) << "Output shape: { " << ShapeUtil::HumanString(output_shape) << " }"; + VLOG(3) << "Window: { " << window.ShortDebugString() << " }"; + VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }"; + + const int num_dimensions = 
window.dimensions_size(); + CHECK_LE(num_dimensions, 3); + // cuDNN does not support 1D convolutions. We therefore express 1D + // convolutions as 2D convolutions where the first spatial dimension is 1. + // This matches the behavior of TF (see definition of conv1d in + // tensorflow/python/ops/nn_ops.py). + const int effective_num_dimensions = std::max(2, num_dimensions); + + CHECK_EQ(F32, output_shape.element_type()) + << ShapeUtil::HumanString(output_shape); + CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size()); + CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size()); + CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size()); + for (const WindowDimension& dim : window.dimensions()) { + CHECK_EQ(dim.padding_low(), dim.padding_high()); + } + + // cuDNN's convolution APIs support the BDYX layout for activations/output and + // the OIYX layout for weights. + BatchDescriptor input_descriptor(effective_num_dimensions); + input_descriptor.set_layout(DataLayout::kBatchDepthYX) + .set_feature_map_count( + input_shape.dimensions(dnums.input_feature_dimension())) + .set_count(input_shape.dimensions(dnums.input_batch_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + // Note that the dimensions are reversed. The same holds below. + input_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + input_shape.dimensions(dnums.input_spatial_dimensions(dim))); + } + + FilterDescriptor filter_descriptor(effective_num_dimensions); + filter_descriptor.set_layout(FilterLayout::kOutputInputYX) + .set_input_feature_map_count( + filter_shape.dimensions(dnums.kernel_input_feature_dimension())) + .set_output_feature_map_count( + filter_shape.dimensions(dnums.kernel_output_feature_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + filter_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + filter_shape.dimensions(dnums.kernel_spatial_dimensions(dim))); + } + + ConvolutionDescriptor convolution_descriptor(effective_num_dimensions); + for (int dim = 0; dim < num_dimensions; ++dim) { + convolution_descriptor + .set_zero_padding( + static_cast(effective_num_dimensions - dim - 1), + window.dimensions(dim).padding_low()) + .set_filter_stride( + static_cast(effective_num_dimensions - dim - 1), + window.dimensions(dim).stride()); + } + + BatchDescriptor output_descriptor(effective_num_dimensions); + output_descriptor.set_layout(DataLayout::kBatchDepthYX) + .set_feature_map_count( + output_shape.dimensions(dnums.output_feature_dimension())) + .set_count(output_shape.dimensions(dnums.output_batch_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + output_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + output_shape.dimensions(dnums.output_spatial_dimensions(dim))); + } + + // Add a singleton dimension in the 1D convolution case. 
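+  // For example (sizes illustrative), a 1D convolution over a width-100 input
+  // with a width-3 filter is issued to cudnn as a 2D convolution over a 1x100
+  // input with a 1x3 filter, using stride 1 and zero padding in the singleton
+  // dimension.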
+ if (num_dimensions == 1) { + input_descriptor.set_spatial_dim(static_cast(0), 1); + output_descriptor.set_spatial_dim(static_cast(0), 1); + filter_descriptor.set_spatial_dim(static_cast(0), 1); + convolution_descriptor.set_zero_padding(static_cast(0), 0) + .set_filter_stride(static_cast(0), 1); + } + + switch (kind) { + case CudnnConvKind::kForward: + stream->ThenConvolveWithAlgorithm( + input_descriptor, input_buf, filter_descriptor, filter_buf, + convolution_descriptor, output_descriptor, &output_buf, + scratch_allocator, algorithm, profile_result); + break; + case CudnnConvKind::kBackwardInput: + stream->ThenConvolveBackwardDataWithAlgorithm( + filter_descriptor, filter_buf, output_descriptor, output_buf, + convolution_descriptor, input_descriptor, &input_buf, + scratch_allocator, algorithm, profile_result); + break; + case CudnnConvKind::kBackwardFilter: + stream->ThenConvolveBackwardFilterWithAlgorithm( + input_descriptor, input_buf, output_descriptor, output_buf, + convolution_descriptor, filter_descriptor, &filter_buf, + scratch_allocator, algorithm, profile_result); + break; + } + + if (!stream->ok()) { + return InternalError( + "Unable to launch convolution with type %s and algorithm (%lld, %lld)", + CudnnConvKindToString(kind).c_str(), algorithm.algorithm().algo_id(), + algorithm.algorithm_no_scratch().algo_id()); + } + return Status::OK(); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h new file mode 100644 index 00000000000..b101f76510c --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h @@ -0,0 +1,97 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_ + +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +// This file contains low-level routines for running cudnn convolutions. + +// Different types of convolutions supported by cudnn. +// +// A way to think about these is that a convolution is defined by three arrays +// -- the "input", the "filter", and the "output" -- and given any two of these, +// we can compute the third. For example, a backward-input convolution takes as +// input a filter and an "output" and produces an "input" such that if one were +// to do a forward convolution of "input" using filter, the result would be +// something with the same shape as "output". +// +// This way of thinking is not correct if you look at the values produced. 
For +// example, a backward-input convolution is not actually the mathematical +// inverse of a forward convolution. But it's right as far as the shapes and +// "connectivity" (i.e. which elements of the input affect which elements of +// the output) are concerned. +enum class CudnnConvKind { + kForward, // input + filter => output + kBackwardInput, // filter + output => input + kBackwardFilter, // input + output => filter +}; + +// Converts a CudnnConvKind value to a string. +string CudnnConvKindToString(CudnnConvKind kind); + +// Calls into cudnn to run the specified convolution. +// +// Note that depending on the value of CudnnConvKind, the result of this call +// may be written into input_buf, filter_buf, or output_buf! +// +// At the moment we only support cudnn convolutions over floats. +// +// We provide one overload which takes a scratch buffer, and another which takes +// an allocator which is responsible for allocating the scratch space. In +// theory the second one shouldn't be necessary -- users of this function could +// just ask cudnn how much scratch space it needs for a particular convolution. +// But in practice, StreamExecutor does not expose such an API, and in the name +// of parsimony, perhaps it's better not to add it. Instead, the first time you +// call a convolution, you should call the version that takes a scratch +// allocator and take note of how much memory is used. The next time you call +// the same conv, you can provide an explicitly preallocated scratch buffer of +// that size, if you like. +Status RunCudnnConvolution( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, + perftools::gputools::DeviceMemory input_buf, + perftools::gputools::DeviceMemory filter_buf, + perftools::gputools::DeviceMemory output_buf, + perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window, + const ConvolutionDimensionNumbers& dnums, + perftools::gputools::dnn::AlgorithmConfig algorithm, + perftools::gputools::Stream* stream, + perftools::gputools::dnn::ProfileResult* profile_result = nullptr); + +Status RunCudnnConvolution( + CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, + const Shape& output_shape, + perftools::gputools::DeviceMemory input_buf, + perftools::gputools::DeviceMemory filter_buf, + perftools::gputools::DeviceMemory output_buf, + perftools::gputools::ScratchAllocator* scratch_allocator, + const Window& window, const ConvolutionDimensionNumbers& dnums, + perftools::gputools::dnn::AlgorithmConfig algorithm, + perftools::gputools::Stream* stream, + perftools::gputools::dnn::ProfileResult* profile_result = nullptr); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 0cca3ca0926..12ec266ff38 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -35,8 +35,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" -#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" #include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h" @@ -127,7 +128,9 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) { } // Runs optimization passes on the given HLO module. -tensorflow::Status OptimizeHloModule(HloModule* hlo_module) { +tensorflow::Status OptimizeHloModule(HloModule* hlo_module, + se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { { HloPassPipeline pipeline("optimization"); pipeline.AddInvariantChecker(); @@ -143,6 +146,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) { // most ops. pipeline.AddPass(BF16, F32); pipeline.AddPass(); + { auto& pass = pipeline.AddPass>("simplification"); @@ -173,7 +177,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) { pass.AddPass(); pass.AddPass(); } - pipeline.AddPass(); + pipeline.AddPass( [](const HloInstruction& dot, const TransposeFolding::OperandIndices& candidate_operands) { @@ -185,6 +189,58 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module) { pipeline.AddPass(); TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); } + + { + // Convert convolutions into CustomCalls to cudnn, then canonicalize them + // (PadInsertion). + HloPassPipeline pipeline("conv_canonicalization"); + pipeline.AddInvariantChecker(); + pipeline.AddPass(); + pipeline.AddPass(); + + // Choose the fastest algorithm for each conv. + // + // In theory doing this here is way too early: It needs to happen after + // layout assignment, because the layout of the inputs/outputs affects the + // speed of the conv. But currently we only allow only one input/output + // layout when calling cudnn, so there's no ambiguity. + // + // We pick the algorithm at this early stage so we can generate better HLO. + // After CudnnConvolutionRewriter, our convolutions are CustomCalls which + // return a tuple (conv_result, scratch_memory), and the each conv uses 0 + // bytes of scratch: + // + // customcall = (f32[...], f32[0]) + // return gte(customcall, 0) + // + // The algorithm picker then chooses the best algorithm, and potentially + // increases the scratch space. It replaces customcall with new_tuple, + // giving us the following: + // + // new_customcall = (f32[...], f32[N]) + // new_tuple = tuple(gte(new_customcall, 0), constant f32[0]) + // return gte(new_tuple, 0) + // + // The new tuple and gte instructions then be simplified away, because + // nobody is expected to use the scratch value. + // + // However, if we were to run CudnnConvolutionAlgorithmPicker after layout + // assignment, fusion would already have run, and the gte(customcall, 0) + // would probably already be into a fusion node. We can't simplify across + // HloComputation boundaries, so in this case we wouldn't be able to + // simplify away the new_tuple bits. + // + // We'll need to revisit this if we ever allow multiple layouts for the + // inputs/outputs of a cudnn convolution. 
+ pipeline.AddPass(stream_exec, + device_allocator); + // Clean up new_tuple described above. + pipeline.AddPass(); + pipeline.AddPass(); + + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + { HloPassFix fusion("fusion"); fusion.AddInvariantChecker(); @@ -220,9 +276,10 @@ tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) { // the parameter. HloPassPipeline pipeline("GPU-ir-emit-prepare"); pipeline.AddInvariantChecker(); - pipeline.AddPass(); + pipeline.AddPass( hlo_module->mutable_entry_computation_layout()); + // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. pipeline.AddPass>( @@ -410,16 +467,19 @@ GpuCompiler::GpuCompiler() .getPointerSize(0 /* default address space */)) {} StatusOr> GpuCompiler::RunHloPasses( - std::unique_ptr module, se::StreamExecutor* /*stream_exec*/) { + std::unique_ptr module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses"); Tracing::TraceMe annotation("HLO Transforms", module->name(), /*is_expensive=*/true); - TF_RETURN_IF_ERROR(OptimizeHloModule(module.get())); + TF_RETURN_IF_ERROR( + OptimizeHloModule(module.get(), stream_exec, device_allocator)); return std::move(module); } StatusOr> GpuCompiler::RunBackend( - std::unique_ptr module, se::StreamExecutor* stream_exec) { + std::unique_ptr module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend"); TF_RET_CHECK(stream_exec != nullptr); @@ -459,16 +519,17 @@ StatusOr> GpuCompiler::RunBackend( /*color_alignment=*/[](LogicalBuffer::Color) { return kCudaMallocAlignBytes; })); - // BufferAssignment::ToString() includes a header, so no need for us to - // print one ourselves. + // BufferAssignment::Stats::ToString() and BufferAssignment::ToString() + // include headers, so no need for us to print them ourselves. 
+ XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString()); XLA_VLOG_LINES(2, buffer_assignment->ToString()); XLA_VLOG_LINES(2, module->ToString()); - const string xla_dump_hlo_proto_to = - module->config().debug_options().xla_dump_hlo_proto_to(); - if (!xla_dump_hlo_proto_to.empty()) { + const string xla_dump_optimized_hlo_proto_to = + module->config().debug_options().xla_dump_optimized_hlo_proto_to(); + if (!xla_dump_optimized_hlo_proto_to.empty()) { HloProto proto = MakeHloProto(*module, *buffer_assignment); TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_hlo_proto_to, module->name())); + proto, xla_dump_optimized_hlo_proto_to, module->name())); } IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(), diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index 18e34340205..c352d4d8462 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -51,11 +51,13 @@ class GpuCompiler : public LLVMCompiler { StatusOr> RunHloPasses( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> module, diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc index e3b493c6630..88bf5a74fa0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc @@ -78,6 +78,12 @@ StatusOr GpuCopyInsertion::Run(HloModule* module) { for (int64 i = 0; i < hlo->operand_count() - 2; ++i) { TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); } + } else if (IsCustomCallToDnnConvolution(*hlo)) { + // The last argument to a CUDNN convolution is its algorithm, which must + // be an HLO constant -- it shouldn't be copied. + for (int64 i = 0; i < hlo->operand_count() - 1; ++i) { + TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); + } } else if (ImplementedAsLibraryCall(*hlo)) { // For all other library calls, materialize all the operands into memory. for (int64 i = 0; i < hlo->operand_count(); ++i) { diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc index 58915f1f62f..89f1e625884 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc @@ -28,122 +28,114 @@ limitations under the License. namespace xla { namespace gpu { +// cuDNN convolutions are called with specific layouts on the input, output, +// and filter: +// +// input: DataLayout::kBatchDepthYX +// output: DataLayout::kBatchDepthYX +// filter: FilterLayout::kOutputInputYX +// +// The order dimensions in the constant name is major-to-minor (eg, the +// most-major dimension of the input is batch, most-minor is X). The +// specific dimension numbers these named dimensions correspond to is +// determined by the ConvolutionDimensionNumbers argument. Y is spatial +// dimension 0, and X is spatial dimension 1. +// +// TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls. 
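+//
+// For example (dimension numbers illustrative): with NHWC dimension numbers
+// -- batch = 0, spatial = {1, 2}, feature = 3 -- the constraint-building code
+// below produces the minor-to-major order {2, 1, 3, 0} for the input, i.e. the
+// data is laid out physically as NCHW, which is cuDNN's kBatchDepthYX.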
+static Status AddBackendConstraintsToDnnConvCustomCall( + HloInstruction* instr, LayoutConstraints* constraints) { + CHECK(IsCustomCallToDnnConvolution(*instr)) << instr->ToString(); + Shape input_shape; + Shape filter_shape; + Shape output_shape; + const auto& target = instr->custom_call_target(); + if (target == kCudnnConvForwardCallTarget) { + input_shape = instr->operand(0)->shape(); + filter_shape = instr->operand(1)->shape(); + output_shape = instr->shape().tuple_shapes(0); + } else if (target == kCudnnConvBackwardInputCallTarget) { + input_shape = instr->shape().tuple_shapes(0); + filter_shape = instr->operand(1)->shape(); + output_shape = instr->operand(0)->shape(); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + input_shape = instr->operand(0)->shape(); + filter_shape = instr->shape().tuple_shapes(0); + output_shape = instr->operand(1)->shape(); + } else { + LOG(FATAL) << "Unexpected custom call target: " + << instr->custom_call_target(); + } + + // Construct minor-to-major dimension orders for operands and result. + // cuDNN's convolution APIs support the BDYX layout for activations/output + // and the OIYX layout for weights. + // TODO(b/29399649): Be more flexible about handling layouts of cuDNN + // calls after we switch to cuDNN v5. + const ConvolutionDimensionNumbers& dimension_numbers = + instr->convolution_dimension_numbers(); + std::vector input_layout; + for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; i >= 0; + --i) { + input_layout.push_back(dimension_numbers.input_spatial_dimensions(i)); + } + input_layout.push_back(dimension_numbers.input_feature_dimension()); + input_layout.push_back(dimension_numbers.input_batch_dimension()); + *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout); + + std::vector filter_layout; + for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; i >= 0; + --i) { + filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i)); + } + filter_layout.push_back(dimension_numbers.kernel_input_feature_dimension()); + filter_layout.push_back(dimension_numbers.kernel_output_feature_dimension()); + *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout); + + std::vector output_layout; + for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; i >= 0; + --i) { + output_layout.push_back(dimension_numbers.output_spatial_dimensions(i)); + } + output_layout.push_back(dimension_numbers.output_feature_dimension()); + output_layout.push_back(dimension_numbers.output_batch_dimension()); + *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout); + + // The custom call returns a tuple of (actual_result, scratch_buffer); + // call_result_buf is the logical buffer for actual_result, the thing that + // contains the result of the conv call. + TF_ASSIGN_OR_RETURN(const LogicalBuffer* call_result_buf, + constraints->points_to_analysis().GetBufferDefinedAt( + instr, /*index=*/{0})); + + // Set layouts of the instructions' shapes. 
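+  // Which of {input, filter, output} is constrained as the call's result
+  // buffer (rather than as an operand) depends on the convolution kind,
+  // mirroring the shape assignments at the top of this function.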
+ if (target == kCudnnConvForwardCallTarget) { + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0)); + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1)); + TF_RETURN_IF_ERROR( + constraints->SetBufferLayout(output_shape.layout(), *call_result_buf)); + } else if (target == kCudnnConvBackwardInputCallTarget) { + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 0)); + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1)); + TF_RETURN_IF_ERROR( + constraints->SetBufferLayout(input_shape.layout(), *call_result_buf)); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0)); + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 1)); + TF_RETURN_IF_ERROR( + constraints->SetBufferLayout(filter_shape.layout(), *call_result_buf)); + } else { + LOG(FATAL) << "Unexpected custom call target: " + << instr->custom_call_target(); + } + return Status::OK(); +} + Status GpuLayoutAssignment::AddBackendConstraints( LayoutConstraints* constraints) { for (auto* instruction : constraints->computation()->instructions()) { - // cuDNN is called with specific layouts on the input, output, and filter: - // - // input: DataLayout::kBatchDepthYX - // output: DataLayout::kBatchDepthYX - // filter: FilterLayout::kOutputInputYX - // - // The order dimensions in the constant name is major-to-minor (eg, the - // most-major dimension of the input is batch, most-minor is X). The - // specific dimension numbers these named dimensions correspond to is - // determined by the ConvolutionDimensionNumbers argument. Y is spatial - // dimension 0, and X is spatial dimension 1. - // - // TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls. - if (ImplementedAsDnnConvolution(*instruction)) { - HloInstruction* input = nullptr; - HloInstruction* filter = nullptr; - HloInstruction* output = nullptr; - if (instruction->opcode() == HloOpcode::kConvolution) { - input = instruction->mutable_operand(0); - filter = instruction->mutable_operand(1); - output = instruction; - } else { - CHECK_EQ(HloOpcode::kFusion, instruction->opcode()); - switch (instruction->fusion_kind()) { - case HloInstruction::FusionKind::kConvBackwardFilter: - // filter = BackwardFilterConvolve(input, output) - input = instruction->mutable_operand(0); - filter = instruction; - output = instruction->mutable_operand(1); - break; - case HloInstruction::FusionKind::kConvBackwardInput: - // input = BackwardInputConvolve(output, filter) - input = instruction; - filter = instruction->mutable_operand(1); - output = instruction->mutable_operand(0); - break; - default: - LOG(FATAL) << "Not a convolution-fusion"; - } - } - - // Construct minor-to-major dimension orders for operands and result. - // cuDNN's convolution APIs support the BDYX layout for activations/output - // and the OIYX layout for weights. - // TODO(b/29399649): Be more flexible about handling layouts of cuDNN - // calls after we switch to cuDNN v5. 
- const ConvolutionDimensionNumbers& dimension_numbers = - instruction->convolution_dimension_numbers(); - std::vector input_layout; - for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; - i >= 0; --i) { - input_layout.push_back(dimension_numbers.input_spatial_dimensions(i)); - } - input_layout.push_back(dimension_numbers.input_feature_dimension()); - input_layout.push_back(dimension_numbers.input_batch_dimension()); - Shape input_shape(input->shape()); - *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout); - - std::vector filter_layout; - for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; - i >= 0; --i) { - filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i)); - } - filter_layout.push_back( - dimension_numbers.kernel_input_feature_dimension()); - filter_layout.push_back( - dimension_numbers.kernel_output_feature_dimension()); - Shape filter_shape(filter->shape()); - *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout); - - std::vector output_layout; - for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; - i >= 0; --i) { - output_layout.push_back(dimension_numbers.output_spatial_dimensions(i)); - } - output_layout.push_back(dimension_numbers.output_feature_dimension()); - output_layout.push_back(dimension_numbers.output_batch_dimension()); - Shape output_shape(output->shape()); - *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout); - - // Set layouts of the instructions' shapes. - if (instruction->opcode() == HloOpcode::kConvolution) { - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(input_shape, output, 0)); - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(filter_shape, output, 1)); - TF_RETURN_IF_ERROR( - constraints->SetInstructionLayout(output_shape, output)); - } else { - CHECK_EQ(HloOpcode::kFusion, instruction->opcode()); - switch (instruction->fusion_kind()) { - case HloInstruction::FusionKind::kConvBackwardFilter: - // filter = BackwardFilterConvolve(input, output) - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(input_shape, filter, 0)); - TF_RETURN_IF_ERROR( - constraints->SetInstructionLayout(filter_shape, filter)); - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(output_shape, filter, 1)); - break; - case HloInstruction::FusionKind::kConvBackwardInput: - // input = BackwardInputConvolve(output, filter) - TF_RETURN_IF_ERROR( - constraints->SetInstructionLayout(input_shape, input)); - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(output_shape, input, 0)); - TF_RETURN_IF_ERROR( - constraints->SetOperandLayout(filter_shape, input, 1)); - break; - default: - LOG(FATAL) << "Not a convolution-fusion"; - } - } + if (IsCustomCallToDnnConvolution(*instruction)) { + TF_RETURN_IF_ERROR( + AddBackendConstraintsToDnnConvCustomCall(instruction, constraints)); } } return Status::OK(); @@ -151,9 +143,12 @@ Status GpuLayoutAssignment::AddBackendConstraints( bool GpuLayoutAssignment::CustomCallRequiresMajorFirstLayout( const HloInstruction* instruction) { - // Inputs to cudnn batchnorm custom calls don't need the major-first layout - // (i.e. {n, n-1, ...0}) -- we can handle any layout. - return !IsCustomCallToDnnBatchNorm(*instruction); + // - Inputs to cudnn batchnorm custom calls don't need the major-first layout + // (i.e. {n, n-1, ...0}) -- we can handle any layout. + // - Inputs to cudnn convolution require custom layouts handled in + // AddBackendConstraints. 
+ return !IsCustomCallToDnnBatchNorm(*instruction) && + !IsCustomCallToDnnConvolution(*instruction); } Status GpuLayoutAssignment::PropagateOperandConstraint( diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index c2115c49993..dd4426ca7b9 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -191,7 +191,11 @@ static bool BuffersInvariantWithinConsumer( llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo, const HloInstruction& consumer, const ShapeIndex& shape_index) { - llvm_ir::IrArray ir_array(GetBasePointer(hlo, shape_index), + llvm::Value* base_ptr = GetBasePointer(hlo, shape_index); + CHECK_NE(base_ptr, nullptr) + << "Buffer not assigned for shape_index " << shape_index.ToString() + << " of " << hlo.ToString(); + llvm_ir::IrArray ir_array(base_ptr, ShapeUtil::GetSubshape(hlo.shape(), shape_index)); alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array); diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index 1d47ffde433..2d6dad27a59 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -137,49 +137,6 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) { .ValueOrDie()); } -TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfConvolutionUnfused) { - HloComputation::Builder builder(TestName()); - auto input = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {1, 1, 1, 3}), "input")); - auto filter = builder.AddInstruction(HloInstruction::CreateParameter( - 1, ShapeUtil::MakeShape(F32, {1, 1, 1, 2}), "filter")); - - Window conv_window; - WindowDimension* conv_window_row = conv_window.add_dimensions(); - conv_window_row->set_size(1); - WindowDimension* conv_window_col = conv_window.add_dimensions(); - conv_window_col->set_size(2); - conv_window_col->set_padding_high(1); - - ConvolutionDimensionNumbers conv_dnums; - conv_dnums.set_input_batch_dimension(0); - conv_dnums.set_output_batch_dimension(0); - conv_dnums.set_input_feature_dimension(1); - conv_dnums.set_output_feature_dimension(1); - conv_dnums.add_input_spatial_dimensions(2); - conv_dnums.add_output_spatial_dimensions(2); - conv_dnums.add_input_spatial_dimensions(3); - conv_dnums.add_output_spatial_dimensions(3); - conv_dnums.set_kernel_output_feature_dimension(0); - conv_dnums.set_kernel_input_feature_dimension(1); - conv_dnums.add_kernel_spatial_dimensions(2); - conv_dnums.add_kernel_spatial_dimensions(3); - - auto conv = builder.AddInstruction( - HloInstruction::CreateConvolve(ShapeUtil::MakeShape(F32, {1, 1, 1, 3}), - input, filter, conv_window, conv_dnums)); - auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::MakeShape(F32, {3, 1, 1, 1}), conv, {3, 2, 1, 0})); - builder.AddInstruction( - HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), transpose)); - - auto module = CreateNewModule(); - module->AddEntryComputation(builder.Build()); - EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) - .Run(module.get()) - .ValueOrDie()); -} - TEST_F(InstructionFusionTest, GetTupleElementFused) { HloComputation::Builder builder(TestName()); Shape data_shape = ShapeUtil::MakeShape(F32, {8}); diff --git 
a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 76566a9e3db..2f65edffea8 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -90,43 +90,6 @@ bool ImplementedAsGemm(const HloInstruction& hlo) { return false; } -bool ImplementedAsDnnConvolution(const HloInstruction& hlo) { - // We can only do this if the HLO is unnested. - if (hlo.parent() != hlo.GetModule()->entry_computation()) { - return false; - } - - // Forward convolution. - if (hlo.opcode() == HloOpcode::kConvolution) { - const ConvolutionDimensionNumbers& dnums = - hlo.convolution_dimension_numbers(); - if (dnums.input_spatial_dimensions_size() > 3) { - return false; - } - - // CuDNN does not accept zero-element arguments - if (ShapeUtil::HasZeroElements(hlo.operand(0)->shape()) || - ShapeUtil::HasZeroElements(hlo.operand(1)->shape())) { - return false; - } - - if (window_util::HasWindowReversal(hlo.window())) { - return false; - } - - return true; - } - - // Backward convolution. - if (hlo.opcode() == HloOpcode::kFusion && - (hlo.fusion_kind() == HloInstruction::FusionKind::kConvBackwardFilter || - hlo.fusion_kind() == HloInstruction::FusionKind::kConvBackwardInput)) { - return true; - } - - return false; -} - const char* const kCudnnBatchNormForwardInferenceCallTarget = "__cudnn$batchNormalizationForwardInference"; const char* const kCudnnBatchNormForwardTrainingCallTarget = @@ -144,9 +107,76 @@ bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo) { target == kCudnnBatchNormBackwardCallTarget; } +const char* const kCudnnConvForwardCallTarget = "__cudnn$convForward"; +const char* const kCudnnConvBackwardInputCallTarget = + "__cudnn$convBackwardInput"; +const char* const kCudnnConvBackwardFilterCallTarget = + "__cudnn$convBackwardFilter"; + +bool IsCustomCallToDnnConvolution(const HloInstruction& hlo) { + if (hlo.opcode() != HloOpcode::kCustomCall) { + return false; + } + const auto& target = hlo.custom_call_target(); + return target == kCudnnConvForwardCallTarget || + target == kCudnnConvBackwardInputCallTarget || + target == kCudnnConvBackwardFilterCallTarget; +} + bool ImplementedAsLibraryCall(const HloInstruction& hlo) { - return ImplementedAsGemm(hlo) || ImplementedAsDnnConvolution(hlo) || - IsCustomCallToDnnBatchNorm(hlo); + return ImplementedAsGemm(hlo) || IsCustomCallToDnnBatchNorm(hlo) || + IsCustomCallToDnnConvolution(hlo); +} + +static HloInstruction* CreateCudnnConv( + const char* call_target, const Shape& shape, HloInstruction* lhs, + HloInstruction* rhs, const Window& window, + const ConvolutionDimensionNumbers& dnums) { + HloComputation* computation = lhs->parent(); + + // This call returns a tuple of (conv_result, scratch_memory), where + // conv_result is the actual result of the convolution, and scratch_memory is + // temporary memory used by cudnn. + // + // At the moment, we don't know how much scratch memory this conv is going to + // use, so we put u8[0] in this place. Later on another pass will choose + // which conv algorithm to use, and at that point we'll modify the shape of + // this second tuple element. + Shape call_shape = + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})}); + + // Our CustomCall takes three arguments: The conv lhs and rhs, and the cudnn + // algorithm to use. It's up to a later pass to choose the algorithm, so to + // indicate that we haven't yet made a choice, we specify -1 for that arg.
+ HloInstruction* negative_one = computation->AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(-1))); + HloInstruction* custom_call = + computation->AddInstruction(HloInstruction::CreateCustomCall( + call_shape, {lhs, rhs, negative_one}, call_target)); + custom_call->set_window(window); + custom_call->set_convolution_dimension_numbers(dnums); + return custom_call; +} + +HloInstruction* CreateCudnnConvForward( + const Shape& shape, HloInstruction* input, HloInstruction* kernel, + const Window& window, const ConvolutionDimensionNumbers& dnums) { + return CreateCudnnConv(kCudnnConvForwardCallTarget, shape, input, kernel, + window, dnums); +} + +HloInstruction* CreateCudnnConvBackwardInput( + const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter, + const Window& window, const ConvolutionDimensionNumbers& dnums) { + return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, shape, output, + reverse_filter, window, dnums); +} + +HloInstruction* CreateCudnnConvBackwardFilter( + const Shape& shape, HloInstruction* input, HloInstruction* output, + const Window& window, const ConvolutionDimensionNumbers& dnums) { + return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, shape, input, + output, window, dnums); } bool IsReductionToVector(const HloInstruction& reduce) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h index d24ed9879d0..7ad9680bfb4 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h @@ -22,6 +22,9 @@ limitations under the License. #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +// TODO(jlebar): Move functions related to cublas/cudnn to a separate file; they +// don't belong in "ir_emission_utils". + namespace xla { namespace gpu { @@ -30,9 +33,6 @@ constexpr int64 kWarpSize = 32; // Returns true if `hlo` will be implemented as a call to BLAS gemm. bool ImplementedAsGemm(const HloInstruction& hlo); -// Returns true if `hlo` will be implemented as a call to cuDNN convolution. -bool ImplementedAsDnnConvolution(const HloInstruction& hlo); - // A call to cuDNN for batch normalization is represented as CustomCall HLO with // a call target equal to one of these strings. // @@ -58,6 +58,60 @@ extern const char* const kCudnnBatchNormBackwardCallTarget; // sequence of generic HLOs or to a cuDNN CustomCall. bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo); +// A call to cuDNN for convolution (forward, backward filter, or backward input) +// is represented as a CustomCall HLO with a call target equal to one of these +// strings. +// +// These CustomCalls have window() and convolution_dimension_numbers() set like +// regular convolution ops. They have the same LHS and RHS operands, plus one +// additional int64 operand, representing which cudnn algorithm to run. This +// operand must be an HLO constant. A value of -1 means that the implementation +// is free to choose the best algorithm it can. +// +// These calls output a tuple (conv_result, scratch_memory), where conv_result +// is the actual result of the convolution, and scratch_memory is temporary +// memory used by cudnn. Callers shouldn't inspect scratch_memory, as its value +// is not well-defined. +// +// CudnnConvolutionRewriter lowers kConvolution HLOs to these custom calls. +// When it does so, it chooses algorithm -1 and 0 bytes of scratch space. 
Later +// on in the pipeline, CudnnConvolutionAlgorithmChooser chooses an explicit +// algorithm for each conv and sets the amount of scratch space needed. +// +// (Representing the scratch memory as an output may seem strange at first, but +// it's quite sensible, from a certain point of view. The scratch buffer is a +// location in memory that the conv can write into, but which it can't legally +// read from, at least until it's written something first. But that's exactly +// the definition of an output buffer.) +extern const char* const kCudnnConvForwardCallTarget; +extern const char* const kCudnnConvBackwardInputCallTarget; +extern const char* const kCudnnConvBackwardFilterCallTarget; + +// Returns true if `hlo` will be implemented as a call to a cuDNN convolution +// routine. +// +// This returns true if `hlo` is a CustomCall HLO with a call target equal to +// one of the kCudnnConvFoo constants above, but returns *false* for HLOs with a +// kConvolution opcode. +bool IsCustomCallToDnnConvolution(const HloInstruction& hlo); + +// Creates a CustomCall for a cudnn forward/backward-input/backward-filter conv. +// Note that these CustomCalls return a tuple (conv_result, scratch_memory). If +// you want just the conv result, you'll need to get-tuple-element the value +// returned by this function. +// +// The created cudnn call will use the default cudnn algorithm and no scratch +// space. +HloInstruction* CreateCudnnConvForward( + const Shape& shape, HloInstruction* input, HloInstruction* kernel, + const Window& window, const ConvolutionDimensionNumbers& dnums); +HloInstruction* CreateCudnnConvBackwardInput( + const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter, + const Window& window, const ConvolutionDimensionNumbers& dnums); +HloInstruction* CreateCudnnConvBackwardFilter( + const Shape& shape, HloInstruction* input, HloInstruction* output, + const Window& window, const ConvolutionDimensionNumbers& dnums); + // Returns true if `hlo` will be implemented as a library call, e.g. cuBLAS gemm // or cuDNN convolution. bool ImplementedAsLibraryCall(const HloInstruction& hlo); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 23b72c3f71d..affd2ffa8ed 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -615,8 +615,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { // TODO(b/33011107): Support cross replica sum on GPU. - return Unimplemented( - "Cross replica sum not implemented on GPU. See b/33011107."); + return Unimplemented("CrossReplicaSum is not implemented on GPU."); } Status IrEmitter::HandleParameter(HloInstruction* parameter) { @@ -710,11 +709,13 @@ Status IrEmitter::HandleCustomCall(HloInstruction*) { } Status IrEmitter::HandleInfeed(HloInstruction*) { - return Unimplemented("Infeed is not supported on GPU (b/30467474)."); + // TODO(b/30467474): Implement infeed on GPU. + return Unimplemented("Infeed is not supported on GPU."); } Status IrEmitter::HandleOutfeed(HloInstruction*) { - return Unimplemented("Outfeed is not supported on GPU (b/34359662)."); + // TODO(b/34359662): Implement outfeed on GPU. 
+ return Unimplemented("Outfeed is not supported on GPU."); } Status IrEmitter::HandleRng(HloInstruction* random) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index 3aa178410f0..9031a838f92 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -336,9 +336,6 @@ class IrEmitterUnnested : public IrEmitter { // Thunk object. std::unique_ptr BuildKernelThunk(const HloInstruction* inst); - // Returns a ConvolutionThunk that calls DNN to implement `inst`. - std::unique_ptr BuildConvolutionThunk(const HloInstruction* inst); - // Returns a FftThunk that calls cuFFT to implement `inst`. std::unique_ptr BuildFftThunk(const HloInstruction* inst); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index fc8783e753d..a4847f6ca99 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" #include "tensorflow/compiler/xla/service/gpu/fft_thunk.h" #include "tensorflow/compiler/xla/service/gpu/for_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" @@ -278,10 +279,6 @@ Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) { } Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) { - if (ImplementedAsDnnConvolution(*convolution)) { - thunk_sequence_->emplace_back(BuildConvolutionThunk(convolution)); - return Status::OK(); - } thunk_sequence_->emplace_back(BuildKernelThunk(convolution)); return IrEmitter::HandleConvolution(convolution); } @@ -380,6 +377,71 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { return Status::OK(); } + if (IsCustomCallToDnnConvolution(*custom_call)) { + const auto& assn = ir_emitter_context_->buffer_assignment(); + const auto& lhs_shape = custom_call->operand(0)->shape(); + const auto& rhs_shape = custom_call->operand(1)->shape(); + const auto& conv_result_shape = custom_call->shape().tuple_shapes(0); + auto lhs_slice = GetAllocationSlice(*custom_call->operand(0)); + auto rhs_slice = GetAllocationSlice(*custom_call->operand(1)); + auto tuple_result_slice = GetAllocationSlice(*custom_call); + auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie(); + auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); + + const HloInstruction* algorithm_inst = custom_call->operand(2); + CHECK(algorithm_inst->IsConstant()) << algorithm_inst->ToString(); + int64 algorithm = algorithm_inst->literal().Get({}); + + const auto& target = custom_call->custom_call_target(); + std::unique_ptr thunk; + if (target == kCudnnConvForwardCallTarget) { + thunk = MakeUnique( + CudnnConvKind::kForward, + /*input_buffer=*/lhs_slice, + /*filter_buffer=*/rhs_slice, + /*output_buffer=*/conv_result_slice, + /*tuple_result_buffer=*/tuple_result_slice, + /*scratch_buffer=*/scratch_slice, + /*input_shape=*/lhs_shape, + /*filter_shape=*/rhs_shape, + /*output_shape=*/conv_result_shape, // + custom_call->window(), custom_call->convolution_dimension_numbers(), + algorithm, custom_call); + } else if 
(target == kCudnnConvBackwardInputCallTarget) { + thunk = MakeUnique( + CudnnConvKind::kBackwardInput, + /*input_buffer=*/conv_result_slice, + /*filter_buffer=*/rhs_slice, + /*output_buffer=*/lhs_slice, + /*tuple_result_buffer=*/tuple_result_slice, + /*scratch_buffer=*/scratch_slice, + /*input_shape=*/conv_result_shape, + /*filter_shape=*/rhs_shape, + /*output_shape=*/lhs_shape, // + custom_call->window(), custom_call->convolution_dimension_numbers(), + algorithm, custom_call); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + thunk = MakeUnique( + CudnnConvKind::kBackwardFilter, + /*input_buffer=*/lhs_slice, + /*filter_buffer=*/conv_result_slice, + /*output_buffer=*/rhs_slice, + /*tuple_result_buffer=*/tuple_result_slice, + /*scratch_buffer=*/scratch_slice, + /*input_shape=*/lhs_shape, + /*filter_shape=*/conv_result_shape, + /*output_shape=*/rhs_shape, // + custom_call->window(), custom_call->convolution_dimension_numbers(), + algorithm, custom_call); + } else { + LOG(FATAL) << "Unexpected custom call target: " + << custom_call->custom_call_target(); + } + + thunk_sequence_->emplace_back(std::move(thunk)); + return Status::OK(); + } + return IrEmitter::HandleCustomCall(custom_call); } @@ -500,10 +562,6 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { thunk_sequence_->emplace_back(BuildGemmThunk(fusion)); return Status::OK(); } - if (ImplementedAsDnnConvolution(*fusion)) { - thunk_sequence_->emplace_back(BuildConvolutionThunk(fusion)); - return Status::OK(); - } thunk_sequence_->emplace_back(BuildKernelThunk(fusion)); return IrEmitter::HandleFusion(fusion); } @@ -1599,24 +1657,24 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) { } Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) { - tensorflow::gtl::ArraySlice operands(tuple->operands()); - bool all_tuple_elements_have_buffer = std::all_of( - operands.begin(), operands.end(), [this](HloInstruction* tuple_element) { + bool all_tuple_elements_have_buffer = + c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) { return ir_emitter_context_->buffer_assignment().HasTopLevelAllocation( tuple_element); }); - // Tuples (especially output tuples) can take too many tuple elements, - // causing the kernel emitted exceeds the parameter space limit - // (b/31336476). As an optimization, if all tuple elements have a buffer, we - // collect their buffer addresses in a host array, and then copy that array - // to the tuple's buffer. + // Tuples (especially tuples that are the final result of a computation) can + // be so huge that if we were to emit a kernel that took each tuple element as + // a parameter, we would exceed the max allowable number of parameters to a + // GPU kernel, b/31336476. As an optimization, if all tuple elements have a + // buffer, we collect their buffer addresses in a host array, and then copy + // that array to the tuple's buffer. // // Some tuple elements (e.g. const or bitcast of const) might not have a - // buffer -- their contents are stored in code. In that case, we fall back - // to emitting kernels which have access to their buffer addresses in code. + // buffer -- their contents are stored in code. In that case, we fall back to + // emitting kernels which have access to their buffer addresses in code. 
if (all_tuple_elements_have_buffer) { std::vector tuple_element_buffers; - for (const HloInstruction* tuple_element : operands) { + for (const HloInstruction* tuple_element : tuple->operands()) { tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element)); } thunk_sequence_->emplace_back(MakeUnique( @@ -1658,8 +1716,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter( // TODO(b/31410564): Implement dilation rate for select-and-scatter. if (window_util::HasDilation(window)) { return Unimplemented( - "Dilation for select-and-scatter not implemented on GPU. " - "See b/31410564."); + "Dilation for SelectAndScatter not implemented on GPU."); } // kSelectAndScatter is implemented as two kernel launches: the first launch @@ -2012,52 +2069,6 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( LOG(FATAL) << "Cannot build a GemmThunk for " << inst->ToString(); } -std::unique_ptr IrEmitterUnnested::BuildConvolutionThunk( - const HloInstruction* inst) { - const HloInstruction* lhs = inst->operand(0); - const HloInstruction* rhs = inst->operand(1); - if (inst->opcode() == HloOpcode::kConvolution) { - // Forward covolution. - return MakeUnique( - ConvolutionThunk::ConvolutionKind::kForward, - /*input_buffer=*/GetAllocationSlice(*lhs), - /*filter_buffer=*/GetAllocationSlice(*rhs), - /*output_buffer=*/GetAllocationSlice(*inst), - /*input_shape=*/lhs->shape(), - /*filter_shape=*/rhs->shape(), - /*output_shape=*/inst->shape(), inst->window(), - inst->convolution_dimension_numbers(), inst); - } - - // Backward filter convolution, which takes the input (activations) and the - // gradients, and computes the filter. - CHECK_EQ(HloOpcode::kFusion, inst->opcode()); - switch (inst->fusion_kind()) { - case HloInstruction::FusionKind::kConvBackwardFilter: - return MakeUnique( - ConvolutionThunk::ConvolutionKind::kBackwardFilter, - /*input_buffer=*/GetAllocationSlice(*lhs), - /*filter_buffer=*/GetAllocationSlice(*inst), - /*output_buffer=*/GetAllocationSlice(*rhs), - /*input_shape=*/lhs->shape(), - /*filter_shape=*/inst->shape(), - /*output_shape=*/rhs->shape(), inst->window(), - inst->convolution_dimension_numbers(), inst); - case HloInstruction::FusionKind::kConvBackwardInput: - return MakeUnique( - ConvolutionThunk::ConvolutionKind::kBackwardInput, - /*input_buffer=*/GetAllocationSlice(*inst), - /*filter_buffer=*/GetAllocationSlice(*rhs), - /*output_buffer=*/GetAllocationSlice(*lhs), - /*input_shape=*/inst->shape(), - /*filter_shape=*/rhs->shape(), - /*output_shape=*/lhs->shape(), inst->window(), - inst->convolution_dimension_numbers(), inst); - default: - LOG(FATAL) << "Not a convolution-fusion"; - } -} - std::unique_ptr IrEmitterUnnested::BuildFftThunk( const HloInstruction* inst) { const HloInstruction* operand = inst->operand(0); diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc index 2923a79af0a..25846dc6cd4 100644 --- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc @@ -27,7 +27,7 @@ namespace gpu { namespace { bool IsForwardConvolutionCanonical(const HloInstruction& conv) { - CHECK_EQ(HloOpcode::kConvolution, conv.opcode()); + CHECK_EQ(conv.custom_call_target(), kCudnnConvForwardCallTarget); return window_util::HasSymmetricPadding(conv.window()) && !window_util::HasNegativePadding(conv.window()) && !window_util::HasDilation(conv.window()); @@ -47,6 +47,12 @@ HloInstruction* MaybePaddedAndSlicedInput( window_util::HasBaseDilation(conv_window)) { // If padding is 
uneven or has dilation, we insert a kPad instruction that // applies positive padding and dilation. + // + // TODO(phawkins): If conv_window has asymmetric padding, perhaps instead of + // moving all the padding into an explicit pad op, we should keep as much + // padding inside of cudnn as possible, on the assumption that padding + // within cudnn is basically free, whereas a kPad's cost increases as the + // amount of padding increases. PaddingConfig padding_config = MakeNoPaddingConfig(input->shape().dimensions_size()); for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) { @@ -167,14 +173,17 @@ bool PadInsertion::CanonicalizeForwardConvolution(HloInstruction* conv) { dim->set_window_dilation(1); } + // The conv CustomCall returns a tuple (conv_result, scratch_buffer). Extract + // out the shape of conv_result. + Shape old_conv_shape = conv->shape().tuple_shapes(0); + VLOG(1) << "Canonicalizing forward conv"; - auto new_conv = HloInstruction::CreateConvolve( - conv->shape(), new_input, new_kernel, new_conv_window, - conv->convolution_dimension_numbers()); + auto new_conv = CreateCudnnConvForward(old_conv_shape, new_input, new_kernel, + new_conv_window, + conv->convolution_dimension_numbers()); VLOG(1) << "Replacing:\n " << conv->ToString() << "\nwith:\n " << new_conv->ToString(); - TF_CHECK_OK( - conv->parent()->ReplaceWithNewInstruction(conv, std::move(new_conv))); + TF_CHECK_OK(conv->parent()->ReplaceInstruction(conv, new_conv)); return true; } @@ -190,6 +199,8 @@ void IncreasePaddingHighBy(int64 delta, WindowDimension* window_dim) { bool PadInsertion::CanonicalizeBackwardFilterConvolution( HloInstruction* backward_conv) { + CHECK_EQ(backward_conv->custom_call_target(), + kCudnnConvBackwardFilterCallTarget); if (window_util::HasSymmetricPadding(backward_conv->window())) { return false; } @@ -202,15 +213,11 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( // ABCD0 = Pad(ABCD, padding_high=1) // BackwardFilterConv(ABCD0, xyz, padding_low=pading_high=1) // We choose the lesser of padding_low and padding_high as the new padding. - HloInstruction* forward_conv = backward_conv->fused_expression_root(); HloInstruction* input = backward_conv->mutable_operand(0); - Window new_forward_conv_window = forward_conv->window(); Window new_backward_conv_window = backward_conv->window(); // input_padding_config is the config of the kPad to be inserted. PaddingConfig input_padding_config = MakeNoPaddingConfig(ShapeUtil::Rank(input->shape())); - ConvolutionDimensionNumbers forward_conv_dnums = - forward_conv->convolution_dimension_numbers(); ConvolutionDimensionNumbers backward_conv_dnums = backward_conv->convolution_dimension_numbers(); for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) { @@ -222,11 +229,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( // cuDNN convolution (which doesn't support negative padding) to fail. return false; } - // If the backward convolution has uneven padding on the activations, we - // move some padding on the larger end to "internal" padding, so that the - // backward convolution produces larger weight gradients which get sliced - // later. Therefore, the amount of new padding (low or high) is the minimum - // of the amount of old padding low and old padding high. + // Compute the new, even padding for the backward conv operation. 
int64 new_conv_padding = std::min(padding_low, padding_high); int64 dim = backward_conv_dnums.input_spatial_dimensions(i); input_padding_config.mutable_dimensions(dim)->set_edge_padding_low( @@ -237,14 +240,9 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( // Since we move some padding from the backward convolution to the kPad, we // need to accordingly reduce the padding amount of the backward convolution // and its inner forward convolution. - IncreasePaddingLowBy(-(padding_low - new_conv_padding), - new_backward_conv_window.mutable_dimensions(i)); - IncreasePaddingHighBy(-(padding_high - new_conv_padding), - new_backward_conv_window.mutable_dimensions(i)); - IncreasePaddingLowBy(-(padding_low - new_conv_padding), - new_forward_conv_window.mutable_dimensions(i)); - IncreasePaddingHighBy(-(padding_high - new_conv_padding), - new_forward_conv_window.mutable_dimensions(i)); + auto* new_dim = new_backward_conv_window.mutable_dimensions(i); + new_dim->set_padding_low(new_conv_padding); + new_dim->set_padding_high(new_conv_padding); } // Create a new backward convolution replacing the old one. @@ -260,19 +258,12 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution( .ConsumeValueOrDie(), input, padding, input_padding_config)); - HloInstruction* new_forward_conv = - computation->AddInstruction(HloInstruction::CreateConvolve( - ShapeInference::InferConvolveShape( - padded_input->shape(), output->shape(), new_forward_conv_window, - forward_conv_dnums) - .ConsumeValueOrDie(), - padded_input, output, new_forward_conv_window, forward_conv_dnums)); - - // Fuse the new forward convolution to the new backward convolution. - HloInstruction* new_backward_conv = - computation->CreateFusionInstructionForBackwardConvolution( - {new_forward_conv}, HloInstruction::FusionKind::kConvBackwardFilter, - new_backward_conv_window, backward_conv_dnums); + // The shape of the backward_conv CustomCall is a tuple (conv_result, + // scratch_buffer). Extract out the shape of conv_result. + Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0); + HloInstruction* new_backward_conv = CreateCudnnConvBackwardFilter( + backward_conv_shape, padded_input, output, new_backward_conv_window, + backward_conv_dnums); VLOG(1) << "Canonicalizing backward filter conv"; VLOG(1) << "Replacing:\n " << backward_conv->ToString() << "\nwith:\n " @@ -289,14 +280,15 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( return false; } - HloInstruction* forward_conv = backward_conv->fused_expression_root(); - HloInstruction* reverse_filter = forward_conv->mutable_operand(1); - Window new_forward_conv_window = forward_conv->window(); Window new_backward_conv_window = backward_conv->window(); - ConvolutionDimensionNumbers forward_conv_dnums = - forward_conv->convolution_dimension_numbers(); ConvolutionDimensionNumbers backward_conv_dnums = backward_conv->convolution_dimension_numbers(); + + // The backward_conv CustomCall returns a tuple (conv_result, scratch_memory). + // Get the shape of conv_result. 
+ Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0); + + Shape new_backward_conv_shape = backward_conv_shape; for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) { int64 padding_low = backward_conv->window().dimensions(i).padding_low(); int64 padding_high = backward_conv->window().dimensions(i).padding_high(); @@ -315,41 +307,38 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( // where the amount of padding low is larger, we can canonicalize it to // [B A] = BackwardInputConvolve([a b], [x y z], padding=(low=1,high=1)) // [A] = Slice([B A]) - // For consistency, we need to increase the low padding of the inner - // convolution by 1 as well because the input is larger now. if (padding_low > padding_high) { IncreasePaddingLowBy(padding_high - padding_low, new_backward_conv_window.mutable_dimensions(i)); - IncreasePaddingLowBy(padding_low - padding_high, - new_forward_conv_window.mutable_dimensions(i)); } else if (padding_low < padding_high) { IncreasePaddingHighBy(padding_low - padding_high, new_backward_conv_window.mutable_dimensions(i)); - IncreasePaddingHighBy(padding_high - padding_low, - new_forward_conv_window.mutable_dimensions(i)); } + // Decreasing the padding by X *increases* the size of our output by X. + int64 dim = backward_conv_dnums.output_spatial_dimensions(i); + new_backward_conv_shape.set_dimensions( + dim, new_backward_conv_shape.dimensions(dim) + + std::abs(padding_low - padding_high)); } // Create a new backward convolution replacing the old one. HloComputation* computation = backward_conv->parent(); HloInstruction* output = backward_conv->mutable_operand(0); HloInstruction* filter = backward_conv->mutable_operand(1); - HloInstruction* new_reverse_filter = - computation->AddInstruction(HloInstruction::CreateReverse( - filter->shape(), filter, reverse_filter->dimensions())); - HloInstruction* new_forward_conv = - computation->AddInstruction(HloInstruction::CreateConvolve( - ShapeInference::InferConvolveShape( - output->shape(), new_reverse_filter->shape(), - new_forward_conv_window, forward_conv_dnums) - .ConsumeValueOrDie(), - output, new_reverse_filter, new_forward_conv_window, - forward_conv_dnums)); + + HloInstruction* new_backward_conv_call = CreateCudnnConvBackwardInput( + new_backward_conv_shape, output, filter, new_backward_conv_window, + backward_conv_dnums); + + // The CustomCall created above returns a tuple (conv_result, scratch_memory). + // Extract out the two elements. HloInstruction* new_backward_conv = - computation->CreateFusionInstructionForBackwardConvolution( - {new_forward_conv, new_reverse_filter}, - HloInstruction::FusionKind::kConvBackwardInput, - new_backward_conv_window, backward_conv_dnums); + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_backward_conv_shape, new_backward_conv_call, 0)); + HloInstruction* new_backward_conv_scratch = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_backward_conv_call->shape().tuple_shapes(1), + new_backward_conv_call, 1)); // Slice the new backward convolution. // @@ -377,22 +366,25 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( } // Replace the old backward convolution with the slice. 
- CHECK(ShapeUtil::Compatible( + Shape slice_shape = ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices, limit_indices, strides) - .ConsumeValueOrDie(), - backward_conv->shape())); + .ConsumeValueOrDie(); + CHECK(ShapeUtil::Compatible(slice_shape, backward_conv_shape)) + << ShapeUtil::HumanString(slice_shape) << " vs " + << ShapeUtil::HumanString(backward_conv_shape); - auto slice = - HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv, - start_indices, limit_indices, strides); + HloInstruction* slice = computation->AddInstruction( + HloInstruction::CreateSlice(backward_conv_shape, new_backward_conv, + start_indices, limit_indices, strides)); + HloInstruction* new_tuple = computation->AddInstruction( + HloInstruction::CreateTuple({slice, new_backward_conv_scratch})); VLOG(1) << "Canonicalizing backward input conv"; VLOG(1) << "Replacing:\n " << backward_conv->ToString() << "\nwith:\n " - << slice->ToString(); + << new_tuple->ToString(); - TF_CHECK_OK( - computation->ReplaceWithNewInstruction(backward_conv, std::move(slice))); + TF_CHECK_OK(computation->ReplaceInstruction(backward_conv, new_tuple)); return true; } @@ -400,18 +392,17 @@ StatusOr PadInsertion::Run(HloModule* module) { bool changed = false; for (HloInstruction* instruction : module->entry_computation()->MakeInstructionPostOrder()) { - if (instruction->opcode() == HloOpcode::kConvolution) { - changed |= CanonicalizeForwardConvolution(instruction); - } else if (instruction->opcode() == HloOpcode::kFusion) { - switch (instruction->fusion_kind()) { - case HloInstruction::FusionKind::kConvBackwardFilter: - changed |= CanonicalizeBackwardFilterConvolution(instruction); - break; - case HloInstruction::FusionKind::kConvBackwardInput: - changed |= CanonicalizeBackwardInputConvolution(instruction); - break; - default: - break; + if (IsCustomCallToDnnConvolution(*instruction)) { + const auto& target = instruction->custom_call_target(); + if (target == kCudnnConvForwardCallTarget) { + changed |= CanonicalizeForwardConvolution(instruction); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + changed |= CanonicalizeBackwardFilterConvolution(instruction); + } else if (target == kCudnnConvBackwardInputCallTarget) { + changed |= CanonicalizeBackwardInputConvolution(instruction); + } else { + LOG(FATAL) << "Unknown custom call target for cudnn conv: " + << instruction->ToString(); } } } diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h index 934e7e1919f..8ed63a854a7 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h @@ -42,6 +42,11 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter { const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder); + // Constructs a loop emitter for a loop that generates one element of each of N + // arrays on each iteration. + // + // This is used in multi-output fusion. target_element_generator should + // produce a struct with N elements, one for each of target_arrays.
ParallelLoopEmitter( const llvm_ir::ElementGenerator& target_element_generator, tensorflow::gtl::ArraySlice target_arrays, diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 34e2f7ee206..cde5877e29f 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -64,10 +64,8 @@ StatusOr HeapSimulator::Run( std::unique_ptr algorithm, const HloModule& module, const SequentialHloOrdering::HloModuleSequence& module_sequence, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_fn, - const FlatSet* buffers_to_assign) { - HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign, - &module_sequence); + const LogicalBuffer::SizeFunction& size_fn, const Options& options) { + HeapSimulator heap(std::move(algorithm), size_fn, options, &module_sequence); const HloComputation* entry_computation = module.entry_computation(); const std::vector& instruction_sequence = FindOrDie(module_sequence, entry_computation); @@ -81,9 +79,8 @@ StatusOr HeapSimulator::Run( std::unique_ptr algorithm, const HloComputation& computation, const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_fn, - const FlatSet* buffers_to_assign) { - HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign, + const LogicalBuffer::SizeFunction& size_fn, const Options& options) { + HeapSimulator heap(std::move(algorithm), size_fn, options, /*module_sequence=*/nullptr); TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, points_to_analysis)); @@ -199,15 +196,17 @@ Status HeapSimulator::RunComputation( // We can only share with the operand buffer if it is about to be freed; // we must be the last user of the buffer. 
bool shared = false; - for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) { - if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) && - buffer->instruction()->opcode() != HloOpcode::kCopy && - CanShareOperandBufferWithUser( - operand_buffer->instruction(), operand_buffer->index(), - buffer->instruction(), buffer->index(), points_to_analysis)) { - ShareBuffer(buffer, operand_buffer, instruction); - shared = true; - break; + if (options_.may_reuse_operand_buffers) { + for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) { + if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) && + buffer->instruction()->opcode() != HloOpcode::kCopy && + CanShareOperandBufferWithUser( + operand_buffer->instruction(), operand_buffer->index(), + buffer->instruction(), buffer->index(), points_to_analysis)) { + ShareBuffer(buffer, operand_buffer, instruction); + shared = true; + break; + } } } @@ -266,13 +265,12 @@ Status HeapSimulator::RunComputation( HeapSimulator::HeapSimulator( std::unique_ptr algorithm, - const LogicalBuffer::SizeFunction& size_fn, - const FlatSet* buffers_to_assign, + const LogicalBuffer::SizeFunction& size_fn, const Options& options, const SequentialHloOrdering::HloModuleSequence* module_sequence) : no_fragmentation_stats_(MakeUnique()), algorithm_(std::move(algorithm)), size_fn_(size_fn), - buffers_to_assign_(buffers_to_assign), + options_(options), module_sequence_(module_sequence) { debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr); } @@ -280,13 +278,16 @@ HeapSimulator::HeapSimulator( HeapSimulator::~HeapSimulator() {} bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const { - // Buffers for constants are ignored, as with BufferAssigner. Also ignore - // buffers that we're not meant to assign. + // Buffers for constants are ignored unless the alloc_constants option is + // set. Also ignore buffers that we're not meant to assign. // // TODO(b/32248867): For consistency, constants should get allocations. - return buffer->instruction()->opcode() == HloOpcode::kConstant || - (buffers_to_assign_ != nullptr && - buffers_to_assign_->count(buffer) == 0); + if (!options_.alloc_constants && + buffer->instruction()->opcode() == HloOpcode::kConstant) { + return true; + } + return options_.buffers_to_assign != nullptr && + options_.buffers_to_assign->count(buffer) == 0; } // Alloc always calls the underlying heap algorithm. @@ -400,8 +401,8 @@ HeapSimulator::Result HeapSimulator::Finish() { } // If we were told to assign specific buffers, make sure we've assigned // exactly that many buffers. - if (buffers_to_assign_ != nullptr) { - CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size()); + if (options_.buffers_to_assign != nullptr) { + CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size()); } } diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index 88a8698d161..636f19dd39f 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -67,6 +67,23 @@ class HeapSimulator { HeapSimulatorTrace debug_trace; }; + // The different options to be passed to the Run() APIs. + struct Options { + Options() + : may_reuse_operand_buffers(true), + alloc_constants(false), + buffers_to_assign(nullptr) {} + + // Whether a buffer about to be Free()-ed can be recycled for a newborn + // one, hence collapsing Free()+Alloc() calls (default true).
+ bool may_reuse_operand_buffers; + // Whether to issue Alloc() and Free() calls for constants (default false). + bool alloc_constants; + // If 'buffers_to_assign' is provided, only those buffers are assigned + // offsets, otherwise all buffers defined by the instructions are assigned. + const tensorflow::gtl::FlatSet* buffers_to_assign; + }; + // Run the heap simulation with the given algorithm, assuming the given // module_sequence, which must contain a topologically-consistent total // ordering of all instructions within each computation. The result is invalid @@ -76,15 +93,12 @@ class HeapSimulator { // to running on a per-computation basis, since we can re-use buffer space for // called sub-computations. // - // If 'buffers_to_assign' is provided, only those buffers are assigned - // offsets, otherwise all buffers defined by the instructions are assigned. static StatusOr Run( std::unique_ptr algorithm, const HloModule& module, const SequentialHloOrdering::HloModuleSequence& module_sequence, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_fn, - const tensorflow::gtl::FlatSet* buffers_to_assign = - nullptr); + const Options& options = Options()); // Same as above, but runs on a single computation. The 'instruction_sequence' // must contain a topologically-consistent total ordering of all instructions @@ -96,8 +110,7 @@ class HeapSimulator { const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_fn, - const tensorflow::gtl::FlatSet* buffers_to_assign = - nullptr); + const Options& options = Options()); private: // If 'module_sequence' is non-null, it is used to find kCall and kWhile @@ -105,8 +118,7 @@ class HeapSimulator { // be run recursively. I.e. the simulation is run over the whole module. 
HeapSimulator( std::unique_ptr algorithm, - const LogicalBuffer::SizeFunction& size_fn, - const tensorflow::gtl::FlatSet* buffers_to_assign, + const LogicalBuffer::SizeFunction& size_fn, const Options& options, const SequentialHloOrdering::HloModuleSequence* module_sequence); ~HeapSimulator(); @@ -130,7 +142,7 @@ class HeapSimulator { const std::unique_ptr no_fragmentation_stats_; const std::unique_ptr algorithm_; const LogicalBuffer::SizeFunction size_fn_; - const tensorflow::gtl::FlatSet* buffers_to_assign_; + const Options options_; const SequentialHloOrdering::HloModuleSequence* module_sequence_; // In addition to Alloc and Free, the heap simulator exposes a concept of diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index a63affa06ca..5432419e4a2 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -461,20 +461,6 @@ HloInstruction* HloComputation::CreateFusionInstruction( return fusion_instruction; } -HloInstruction* HloComputation::CreateFusionInstructionForBackwardConvolution( - tensorflow::gtl::ArraySlice instructions_to_fuse, - HloInstruction::FusionKind fusion_kind, const Window& window, - const ConvolutionDimensionNumbers& conv_dnums) { - CHECK(HloInstruction::FusionKind::kConvBackwardFilter == fusion_kind || - HloInstruction::FusionKind::kConvBackwardInput == fusion_kind); - HloInstruction* root = instructions_to_fuse.front(); - HloInstruction* fusion_instruction = - AddInstruction(HloInstruction::CreateFusionForBackwardConvolution( - root->shape(), fusion_kind, window, conv_dnums, root)); - FuseInstructionsInto(instructions_to_fuse, fusion_instruction); - return fusion_instruction; -} - StatusOr HloComputation::DeepCopyHelper( HloInstruction* instruction, const ShapeTree* indices_to_copy, ShapeTree* copies_added, ShapeIndex* index) { @@ -577,8 +563,11 @@ Status HloComputation::ReplaceWithNewInstruction( Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction, HloInstruction* new_instruction) { - TF_RET_CHECK(ShapeUtil::Compatible(old_instruction->shape(), - new_instruction->shape())); + TF_RET_CHECK( + ShapeUtil::Compatible(old_instruction->shape(), new_instruction->shape())) + << ShapeUtil::HumanString(old_instruction->shape()) << " vs " + << ShapeUtil::HumanString(new_instruction->shape()); + VLOG(10) << "transformed " << old_instruction->ToString() << " to " << new_instruction->ToString(); // Try to add metadata for HLO instructions that are created to replace diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 6436815f910..061c59abe5e 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -224,15 +224,6 @@ class HloComputation { tensorflow::gtl::ArraySlice instructions_to_fuse, HloInstruction::FusionKind fusion_kind); - // Creates a fusion instruction that represents a backward convolution. This - // is similar to CreateFusionInstruction but takes window and conv_dnums which - // indicate the window and convolution dimension numbers of the backward - // convolution. 
- HloInstruction* CreateFusionInstructionForBackwardConvolution( - tensorflow::gtl::ArraySlice instructions_to_fuse, - HloInstruction::FusionKind fusion_kind, const Window& window, - const ConvolutionDimensionNumbers& conv_dnums); - // Create a deep copy of the given instruction and return the instruction // producing the copied result. All instructions performing the copy are added // to the computation. For array-shaped values, this method trivially returns diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index cd54eb74d18..9cd5a1e2b71 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -469,7 +469,13 @@ Status HloCostAnalysis::HandleCall(const HloInstruction* call) { } Status HloCostAnalysis::HandleCustomCall(const HloInstruction*) { - return Unimplemented("Custom-call is not implemented for HLO cost analysis."); + // We can't do anything sane with CustomCalls, since we don't know what they + // do, and returning an error status will stop iteration over this + // computation, which is probably also not what we want. So just punt and + // return OK. This will cause all of the properties to be reported as 0, + // which is fine. + current_should_compute_bottleneck_time_ = false; + return Status::OK(); } Status HloCostAnalysis::HandleSort(const HloInstruction* sort) { diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index e3f5c17e35f..ab604064d5e 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/bitmap.h" +#include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" @@ -1706,6 +1707,115 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault { return HandleCos(cos); } + template ::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[reduce_precision], + ElementWiseUnaryOp(reduce_precision, [reduce_precision]( + ElementwiseT elem) { + uint32_t value_as_int = tensorflow::bit_cast(elem); + const uint32_t mantissa_bits = reduce_precision->mantissa_bits(); + const uint32_t exponent_bits = reduce_precision->exponent_bits(); + + // Code is based on the CPU/GPU implementation in LLVM-emitting code. + // + // Bits in float type: + // mantissa : bits [0:22] + // exponent : bits [23:30] + // sign : bits [31] + if (mantissa_bits < 23) { + const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits); + + // Compute rounding bias for round-to-nearest with ties to even. + // This is equal to a base value of 0111... plus one bit if the last + // remaining mantissa bit is 1. + const uint32_t base_rounding_bias = + (last_mantissa_bit_mask >> 1) - 1; + const uint32_t x_last_mantissa_bit = + (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits); + const uint32_t x_rounding_bias = + x_last_mantissa_bit + base_rounding_bias; + + // Add rounding bias, and mask out truncated bits. 
Note that the + // case where adding the rounding bias overflows into the exponent + // bits is correct; the non-masked mantissa bits will all be zero, + // and the exponent will be incremented by one. + const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1); + value_as_int = value_as_int + x_rounding_bias; + value_as_int = value_as_int & truncation_mask; + } + if (exponent_bits < 8) { + // Masks for f32 values. + const uint32_t f32_sign_bit_mask = 1u << 31; + const uint32_t f32_exp_bits_mask = 0xffu << 23; + + // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the + // most- significant bit -- is equal to 1.0f for all exponent sizes. + // Adding 2^(n-1)-1 to this gives us the highest non-infinite + // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from + // this gives us the lowest' exponent (corresponding to 0.0f). + // + // Thus, the f32 exponent corresponding to the highest non-infinite + // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32 + // exponent corresponding to the lowest exponent for a bit size of n + // is (2^7-1) - 2^(n-1)-1. + // + // Note that we have already checked that exponents_bits >= 1. + const uint32_t f32_exponent_bias = (1 << 7) - 1; + const uint32_t reduced_exponent_bias = + (1 << (exponent_bits - 1)) - 1; + const uint32_t reduced_max_exponent = + f32_exponent_bias + reduced_exponent_bias; + const uint32_t reduced_min_exponent = + f32_exponent_bias - reduced_exponent_bias; + + // Do we overflow or underflow? + const uint32_t x_exponent = value_as_int & f32_exp_bits_mask; + const bool x_overflows = x_exponent > (reduced_max_exponent << 23); + const bool x_underflows = + x_exponent <= (reduced_min_exponent << 23); + + // Compute appropriately-signed values of zero and infinity. + const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask; + const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask; + + // Force to zero or infinity if overflow or underflow. (Note that + // this truncates all denormal values to zero, rather than rounding + // them.) + value_as_int = x_overflows ? x_signed_inf : value_as_int; + value_as_int = x_underflows ? x_signed_zero : value_as_int; + } + + float reduced_result = tensorflow::bit_cast(value_as_int); + if (std::isnan(elem)) { + reduced_result = mantissa_bits > 0 + ? elem + : std::numeric_limits::infinity(); + } + return reduced_result; + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + return InvalidArgument("Double not supported for reduce precision"); + } + + template < + typename NativeT, + typename std::enable_if::value || + is_complex_t::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + return InvalidArgument("Unsupported type for reduce precision"); + } + + Status HandleReducePrecision(HloInstruction* reduce_precision) override { + return HandleReducePrecision(reduce_precision); + } + private: template StatusOr> DynamicSlice( diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index f7c6435002d..44fcd36370d 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1063,14 +1063,19 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { // node -- there the shape and layout is present in the output node. 
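// ---------------------------------------------------------------------------
// Aside (illustrative only): the rounding-bias trick used in
// HandleReducePrecision above can be written as a tiny standalone function.
// This is a minimal sketch of the mantissa-rounding step only -- the exponent
// reduction and NaN handling shown above are omitted, and the helper name is
// ours, not TensorFlow's.
#include <cstdint>
#include <cstring>

float RoundMantissa(float f, int mantissa_bits) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));  // bit-cast float -> uint32
  if (mantissa_bits < 23) {
    const uint32_t last_kept_bit = 1u << (23 - mantissa_bits);
    // Round to nearest, ties to even: bias of 0111...1 plus the last kept bit.
    const uint32_t base_bias = (last_kept_bit >> 1) - 1;
    const uint32_t last_kept = (bits & last_kept_bit) ? 1u : 0u;
    bits += base_bias + last_kept;       // a carry into the exponent is correct
    bits &= ~(last_kept_bit - 1);        // truncate the discarded mantissa bits
  }
  float result;
  std::memcpy(&result, &bits, sizeof(result));
  return result;
}
// ---------------------------------------------------------------------------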
if (instr->opcode() != HloOpcode::kFusion || !ShouldShowFusionSubcomputation(instr)) { - string instr_shape = ShapeUtil::HumanString(instr->shape()); - - // Show layout of non-tuple shapes with more than one dimension. - if (LayoutUtil::HasLayout(instr->shape()) && - instr->shape().dimensions_size() > 1 && - !ShapeUtil::IsTuple(instr->shape())) { - StrAppend(&instr_shape, "{", - Join(LayoutUtil::MinorToMajor(instr->shape()), ","), "}"); + // Show layout of instructions with more than one dimension. Don't show + // layout on tuples or tensors with just one dimension (which only have one + // possible layout) to avoid visual noise. + bool shape_is_multidim = false; + ShapeUtil::ForEachSubshape(instr->shape(), + [&](const Shape& s, const ShapeIndex&) { + shape_is_multidim |= s.dimensions_size() > 1; + }); + string instr_shape; + if (instr->opcode() != HloOpcode::kTuple && shape_is_multidim) { + instr_shape = ShapeUtil::HumanStringWithLayout(instr->shape()); + } else { + instr_shape = ShapeUtil::HumanString(instr->shape()); } // Some instructions have giant tuples as their shapes, so truncate the @@ -1421,9 +1426,11 @@ void DumpText(const HloModule& module, const string& label, string MaybeDumpHloModule(const HloModule& module, const string& label, const HloExecutionProfile* profile) { - VLOG(2) << "MaybeDumpHloModule called on module " << module.name(); - string graph_url; const DebugOptions& debug_options = module.config().debug_options(); + VLOG(2) << "MaybeDumpHloModule called on module " << module.name() + << " with generate_hlo_graph regex \"" + << debug_options.xla_generate_hlo_graph() << "\""; + string graph_url; if (!debug_options.xla_generate_hlo_graph().empty() && RE2::PartialMatch(module.name(), debug_options.xla_generate_hlo_graph())) { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index a889c35aeb2..fac6b434054 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -763,16 +763,13 @@ HloInstruction::CreateBroadcastSequence( return instruction; } -// We put the fusion kind into the instruction's name for transpose-dot and -// backward-conv fusions, since those fusions are really just describing a type -// of dot/conv rather than generating a novel computation. +// We put the fusion kind into the instruction's name for transpose-dot fusions, +// since those fusions are really just describing a type of dot rather than +// generating a novel computation. 
static string FusionNodeName(HloInstruction::FusionKind fusion_kind) { switch (fusion_kind) { case HloInstruction::FusionKind::kTransposeDot: return "dot_fusion"; - case HloInstruction::FusionKind::kConvBackwardInput: - case HloInstruction::FusionKind::kConvBackwardFilter: - return "conv_fusion"; default: return "fusion"; } @@ -804,18 +801,6 @@ static string FusionNodeName(HloInstruction::FusionKind fusion_kind) { return instruction; } -/* static */ std::unique_ptr -HloInstruction::CreateFusionForBackwardConvolution( - const Shape& shape, FusionKind fusion_kind, const Window& window, - const ConvolutionDimensionNumbers& conv_dnums, HloInstruction* fused_root) { - std::unique_ptr fusion = - CreateFusion(shape, fusion_kind, fused_root); - fusion->window_ = MakeUnique(window); - fusion->convolution_dimension_numbers_ = - MakeUnique(conv_dnums); - return fusion; -} - void HloInstruction::MergeFusionInstruction( HloInstruction* instruction_to_merge) { CHECK_EQ(opcode_, HloOpcode::kFusion); @@ -2318,7 +2303,7 @@ string HloInstruction::ToCategory() const { return "data formatting"; } - auto conv_category = [&] { + if (opcode() == HloOpcode::kConvolution) { string category = "convolution"; if (window_util::HasBaseDilation(window())) { category += " base-dilated"; @@ -2327,10 +2312,6 @@ string HloInstruction::ToCategory() const { category += " window-dilated"; } return category; - }; - - if (opcode() == HloOpcode::kConvolution) { - return conv_category(); } // Give transpose-dot and backwards-conv fusions the categories "dot" and @@ -2348,9 +2329,6 @@ string HloInstruction::ToCategory() const { return "output fusion"; case FusionKind::kTransposeDot: return "dot"; - case FusionKind::kConvBackwardFilter: - case FusionKind::kConvBackwardInput: - return conv_category(); case FusionKind::kCustom: return "custom fusion"; } @@ -3125,10 +3103,6 @@ string ToString(HloInstruction::FusionKind kind) { return "kOutput"; case HloInstruction::FusionKind::kTransposeDot: return "kTransposeDot"; - case HloInstruction::FusionKind::kConvBackwardFilter: - return "kConvBackwardFilter"; - case HloInstruction::FusionKind::kConvBackwardInput: - return "kConvBackwardInput"; case HloInstruction::FusionKind::kCustom: return "kCustom"; } @@ -3148,12 +3122,6 @@ StatusOr StringToFusionKind( if (kind_name == "kTransposeDot") { return HloInstruction::FusionKind::kTransposeDot; } - if (kind_name == "kConvBackwardFilter") { - return HloInstruction::FusionKind::kConvBackwardFilter; - } - if (kind_name == "kConvBackwardInput") { - return HloInstruction::FusionKind::kConvBackwardInput; - } if (kind_name == "kCustom") { return HloInstruction::FusionKind::kCustom; } @@ -3261,7 +3229,13 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const { result += "_"; append_dims(rhs_dims, operand(1)->shape()); result += "->"; - append_dims(output_dims, shape()); + + // A convolution can be represented as a kConvolution HLO or as a CustomCall + // that returns a tuple, the first element of which is the result of the + // convolution. + Shape this_shape = + ShapeUtil::IsTuple(shape()) ? 
shape().tuple_shapes(0) : shape(); + append_dims(output_dims, this_shape); return result; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 5e89dc79bea..bce9ebdda84 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -162,17 +162,14 @@ class HloPrintOptions { class HloInstruction { public: enum class FusionKind { - kLoop, // Fused into a loop. - kInput, // Op's input is fused into the op itself. - kOutput, // Op's output is fused into the op itself. - // REQUIRES: At least one operand buffer must be able - // to alias the output buffer. - kTransposeDot, // Fused into a dot with transposed operands. - kConvBackwardFilter, // Fused into a backward filter convolution. - kConvBackwardInput, // Fused into a backward input convolution. - - kCustom, // Custom category for backend-specific fusions that - // do not match any of the more specific ones. + kLoop, // Fused into a loop. + kInput, // Op's input is fused into the op itself. + kOutput, // Op's output is fused into the op itself. + // REQUIRES: At least one operand buffer must be able + // to alias the output buffer. + kTransposeDot, // Fused into a dot with transposed operands. + kCustom, // Custom category for backend-specific fusions that + // do not match any of the more specific ones. }; ~HloInstruction(); @@ -466,14 +463,6 @@ class HloInstruction { tensorflow::gtl::ArraySlice operands, HloComputation* fusion_computation); - // Creates a fusion instruction that represents backward convolution. This is - // similar to CreateFusion, but with extra arguments indicating the window and - // dimemsion mapping of the backward convolution. - static std::unique_ptr CreateFusionForBackwardConvolution( - const Shape& shape, FusionKind fusion_kind, const Window& window, - const ConvolutionDimensionNumbers& conv_dnums, - HloInstruction* fused_root); - // Creates a call instruction that applies the given computation on the given // operands. "shape" is the resultant shape. static std::unique_ptr CreateCall( @@ -885,8 +874,8 @@ class HloInstruction { // Returns true if this instruction is a fusion instruction that generates // multiple outputs. const bool IsMultiOutputFusion() const { - return (opcode() == HloOpcode::kFusion && - fused_expression_root()->opcode() == HloOpcode::kTuple); + return opcode() == HloOpcode::kFusion && + fused_expression_root()->opcode() == HloOpcode::kTuple; } FusionKind fusion_kind() const { @@ -1052,13 +1041,23 @@ class HloInstruction { return *padding_config_; } - // Returns data on the dimension numbers used for a convolution - // operation. + // Returns data on the dimension numbers used for a convolution operation, + // which may be a kConvolution instruction or a kCustomCall that implements a + // convolution. const ConvolutionDimensionNumbers& convolution_dimension_numbers() const { CHECK(convolution_dimension_numbers_ != nullptr); return *convolution_dimension_numbers_; } + // Sets the convolution dimension numbers on this instruction. In general you + // shouldn't need to call this; instead, specify the convolution dimension + // numbers when you create the instruction. 
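// Hypothetical usage sketch (ours): a backend that represents a convolution as
// a CustomCall can attach the dimension numbers right after creating the
// instruction. The call target string and the variable names below are made up
// for illustration; CreateCustomCall and convolution_dimension_numbers() are
// the APIs shown elsewhere in this change.
//
//   HloInstruction* conv_call =
//       computation->AddInstruction(HloInstruction::CreateCustomCall(
//           conv_result_shape, {input, filter}, "__custom_conv_impl"));
//   conv_call->set_convolution_dimension_numbers(
//       original_conv->convolution_dimension_numbers());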
+ void set_convolution_dimension_numbers( + const ConvolutionDimensionNumbers& dnums) { + convolution_dimension_numbers_ = + MakeUnique(dnums); + } + FftType fft_type() const { CHECK_EQ(HloOpcode::kFft, opcode_); return fft_type_; diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc index 4255d608662..bc74c4bc10c 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers.cc @@ -102,6 +102,36 @@ bool HloGetTupleElementMatcher::MatchAndExplain( return true; } +void HloCustomCallMatcher::DescribeTo(std::ostream* os) const { + HloMatcher::DescribeTo(os); + *os << " with call target that "; + call_target_matcher_.DescribeTo(os); +} + +bool HloCustomCallMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (!HloMatcher::MatchAndExplain(instruction, listener)) { + return false; + } + ::testing::StringMatchResultListener sub_listener; + bool result = ExplainMatchResult( + call_target_matcher_, instruction->custom_call_target(), &sub_listener); + if (sub_listener.str().empty()) { + sub_listener << " that "; + + std::stringstream desc_stream; + if (result) { + call_target_matcher_.DescribeTo(&desc_stream); + } else { + call_target_matcher_.DescribeNegationTo(&desc_stream); + } + sub_listener << desc_stream.str(); + } + *listener << "custom-call with call target" << sub_listener.str(); + return result; +} + } // namespace testing void PrintTo(const HloInstruction* inst, ::std::ostream* os) { diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index 9206cdac05f..103f04a2cb7 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -56,8 +56,8 @@ class HloParameterMatcher : public HloMatcher { // index to match. class HloGetTupleElementMatcher : public HloMatcher { public: - explicit HloGetTupleElementMatcher( - ::testing::Matcher operand, int64 tuple_index) + HloGetTupleElementMatcher(::testing::Matcher operand, + int64 tuple_index) : HloMatcher(HloOpcode::kGetTupleElement, /*operands=*/{operand}), tuple_index_(tuple_index) {} @@ -68,6 +68,24 @@ class HloGetTupleElementMatcher : public HloMatcher { int64 tuple_index_; }; +// Custom matcher for custom-call instructions, which accepts a matcher for its +// call target. +class HloCustomCallMatcher : public HloMatcher { + public: + HloCustomCallMatcher( + ::testing::Matcher call_target_matcher, + std::vector<::testing::Matcher> operands) + : HloMatcher(HloOpcode::kCustomCall, operands), + call_target_matcher_(call_target_matcher) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + ::testing::Matcher call_target_matcher_; +}; + // HloInstruction* matchers for opcode and operands. Example: // namespace op = xla::opcode_matchers; // EXPECT_THAT(instruction, @@ -94,7 +112,6 @@ HLO_MATCHER(Convert); HLO_MATCHER(Convolution); HLO_MATCHER(Copy); HLO_MATCHER(CrossReplicaSum); -HLO_MATCHER(CustomCall); HLO_MATCHER(Divide); HLO_MATCHER(Dot); HLO_MATCHER(DynamicSlice); @@ -184,6 +201,36 @@ inline ::testing::Matcher GetTupleElement() { new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {})); } +// - CustomCall(T, operand1, ..., operandN) matches a CustomCall with call +// target T and the given operands. 
+// +// - CustomCall(operand1, ..., operandN) matches any CustomCall HLO with the +// given operands. +// +// - CustomCall() matches any CustomCall HLO at all. +template +inline ::testing::Matcher CustomCall( + ::testing::Matcher call_target_matcher, M... operands) { + return ::testing::MakeMatcher(new ::xla::testing::HloCustomCallMatcher( + call_target_matcher, {operands...})); +} +// This overload of CustomCall(A, B, C, ...) exists iff A is not convertible to +// ::testing::Matcher. In that case, we want to prefer the overload +// above. +template >::value, + void>::type*> +inline ::testing::Matcher CustomCall( + FirstM operands_first, M... operands_rest) { + return ::testing::MakeMatcher(new ::xla::testing::HloMatcher( + HloOpcode::kCustomCall, {operands_first, operands_rest...})); +} +inline ::testing::Matcher CustomCall() { + return ::testing::MakeMatcher( + new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {})); +} + #undef HLO_MATCHER } // namespace opcode_matchers diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index 1465d1cacdc..1c21703a45e 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -23,6 +23,12 @@ using ::testing::Eq; namespace xla { namespace { +string DescribeHloMatcher(const ::testing::Matcher& m) { + std::stringstream ss; + m.DescribeTo(&ss); + return ss.str(); +} + template string Explain(const T& t, const M& m) { ::testing::StringMatchResultListener listener; @@ -67,5 +73,32 @@ TEST(HloMatchersTest, Test) { "add")); } +TEST(HloMatchersTest, CustomCallMatcher) { + auto c1 = HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3})); + auto c2 = HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3})); + auto call = HloInstruction::CreateCustomCall( + ShapeUtil::MakeShape(F32, {1}), {c1.get(), c2.get()}, "foo_target"); + + EXPECT_THAT(call.get(), op::CustomCall()); + EXPECT_THAT(call.get(), op::CustomCall(c1.get(), c2.get())); + EXPECT_THAT(call.get(), op::CustomCall("foo_target")); + EXPECT_THAT(call.get(), op::CustomCall("foo_target", c1.get(), c2.get())); + EXPECT_THAT(call.get(), op::CustomCall(::testing::StartsWith("foo"))); + EXPECT_THAT(call.get(), + op::CustomCall(::testing::Not(::testing::StartsWith("bar")))); + + // Wrong number of operands. + EXPECT_THAT(call.get(), ::testing::Not(op::CustomCall(c1.get()))); + + // Call target does not match. 
+ EXPECT_THAT(call.get(), + ::testing::Not(op::CustomCall(::testing::StartsWith("bar")))); + + EXPECT_THAT(Explain(call.get(), op::CustomCall("bar")), + R"(custom-call with call target that isn't equal to "bar")"); + EXPECT_THAT(DescribeHloMatcher(op::CustomCall("foo_target")), + R"(custom-call with call target that is equal to "foo_target")"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 99d8dd04e52..60270b0595d 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -38,12 +38,16 @@ HloModule::HloModule(const string& name, : name_(NameUniquer::GetSanitizedName(name)), config_(config), has_entry_computation_handle_(true), - entry_computation_handle_(entry_computation_handle) {} + entry_computation_handle_(entry_computation_handle), + unique_id_(next_unique_module_id_++) {} HloModule::HloModule(const string& name) - : name_(NameUniquer::GetSanitizedName(name)) {} + : name_(NameUniquer::GetSanitizedName(name)), + unique_id_(next_unique_module_id_++) {} HloModule::HloModule(const string& name, const HloModuleConfig& config) - : name_(NameUniquer::GetSanitizedName(name)), config_(config) {} + : name_(NameUniquer::GetSanitizedName(name)), + config_(config), + unique_id_(next_unique_module_id_++) {} HloComputation* HloModule::AddComputationInternal( std::unique_ptr computation, bool is_entry, @@ -564,4 +568,6 @@ uint64 HloModule::RandomNew64() const { return rng_(); } +/* static */ std::atomic HloModule::next_unique_module_id_(0); + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index e377654d024..4bfe8d89ce0 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_ +#include #include #include #include @@ -201,6 +202,10 @@ class HloModule { // this point are guaranteed to be in the range [0..NumUniqueInstructionIds()) int NumUniqueInstructionIds() const { return next_unique_id_; } + // Returns an id that is unique to this module across all modules created over + // the lifetime of this process. + int unique_id() const { return unique_id_; } + private: HloComputation* AddComputationInternal( std::unique_ptr computation, bool is_entry, @@ -227,6 +232,11 @@ class HloModule { NameUniquer computation_name_uniquer_{/*separator=*/"."}; NameUniquer instruction_name_uniquer_{/*separator=*/"."}; int next_unique_id_ = 0; + + // Used to keep track of the next unique module id that should be assigned. + static std::atomic next_unique_module_id_; + // A unique id to label modules with. 
+ int unique_id_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc index cd51fa4e854..7f28a804bfe 100644 --- a/tensorflow/compiler/xla/service/hlo_module_test.cc +++ b/tensorflow/compiler/xla/service/hlo_module_test.cc @@ -188,6 +188,12 @@ TEST_F(HloModuleTest, LargeConstantToString) { module->ToString(HloPrintOptions().set_print_large_constants(true))); } +TEST_F(HloModuleTest, UniqueModuleId) { + auto module_a = CreateNewModule(); + auto module_b = CreateNewModule(); + EXPECT_NE(module_a->unique_id(), module_b->unique_id()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index 53bd46a641a..5120775737b 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "tensorflow/compiler/xla/service/hlo_proto_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" @@ -32,12 +33,28 @@ using ::tensorflow::strings::StrCat; namespace xla { namespace { -void DumpModule(const HloModule& module, - const string& message) { +void DumpModuleGraph(const HloModule& module, const string& message) { hlo_graph_dumper::MaybeDumpHloModule(module, message); VLOG(3) << "HLO " << message << ":"; XLA_VLOG_LINES(3, module.ToString()); } + +void DumpModuleProto(const HloModule& module, const string& dump_to, + const string& pipeline_name, const string& pass_name) { + static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); + static auto* const module_id_to_pass_number = + new tensorflow::gtl::FlatMap(); + + tensorflow::mutex_lock lock(mu); + const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++; + + const string mod_name = SanitizeFileName(tensorflow::strings::Printf( + "module_%04d.%04lld.%s.after_%s", module.unique_id(), pass_number, + pipeline_name.c_str(), pass_name.c_str())); + + TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(MakeHloProto(module), + dump_to, mod_name)); +} } // namespace StatusOr HloPassPipeline::Run(HloModule* module) { @@ -78,6 +95,13 @@ StatusOr HloPassPipeline::Run(HloModule* module) { string message; TF_RETURN_IF_ERROR( run_invariant_checkers(StrCat("before running pipeline: ", name()))); + const string xla_dump_per_pass_hlo_proto_to = + module->config().debug_options().xla_dump_per_pass_hlo_proto_to(); + if (!xla_dump_per_pass_hlo_proto_to.empty()) { + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, name().ToString(), + "pipeline_start"); + } + for (auto& pass : passes_) { if (disabled_passes.count(pass->name().ToString()) > 0) { VLOG(1) << " Skipping HLO pass " << pass->name() @@ -90,17 +114,21 @@ StatusOr HloPassPipeline::Run(HloModule* module) { // Emit label containing: "after foo-pass, before bar-pass". 
message.clear(); StrAppend(&message, prefix, ", before ", pass->name()); - DumpModule(*module, message); + DumpModuleGraph(*module, message); TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module)); TF_RETURN_IF_ERROR( run_invariant_checkers(StrCat("after running pass: ", pass->name()))); + if (!xla_dump_per_pass_hlo_proto_to.empty()) { + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, + name().ToString(), pass->name().ToString()); + } changed |= changed_this_pass; prefix.clear(); StrAppend(&prefix, name(), ": after ", pass->name()); } - DumpModule(*module, prefix + ", pipeline end"); + DumpModuleGraph(*module, prefix + ", pipeline end"); return changed; } diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 204a8bf7486..41b079eb799 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -47,22 +47,11 @@ HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string, return tools::Parse(hlo_string, config); } -/*static*/ StatusOr> -HloRunner::ReadModuleFromHloProtoFile(const std::string& filename, - const DebugOptions& debug_options) { - HloProto proto; - - const Status s = - tensorflow::ReadBinaryProto(tensorflow::Env::Default(), filename, &proto); - - if (!s.ok()) { - const Status s2 = - tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto); - if (!s2.ok()) { - return Status(s2.code(), s.error_message() + "\n" + s2.error_message()); - } - } +namespace { +// Creates an HloModule from the given proto. +StatusOr> HloProtoToModule( + const HloProto& proto, const DebugOptions& debug_options) { TF_ASSIGN_OR_RETURN( HloModuleConfig config, HloModule::CreateModuleConfigFromProto(proto.hlo_module())); @@ -72,9 +61,29 @@ HloRunner::ReadModuleFromHloProtoFile(const std::string& filename, return std::move(module); } +} // namespace + /*static*/ StatusOr> -HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename, +HloRunner::ReadModuleFromBinaryProtoFile(const std::string& filename, const DebugOptions& debug_options) { + HloProto proto; + TF_RETURN_IF_ERROR(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), + filename, &proto)); + return HloProtoToModule(proto, debug_options); +} + +/*static*/ StatusOr> +HloRunner::ReadModuleFromTextProtoFile(const std::string& filename, + const DebugOptions& debug_options) { + HloProto proto; + TF_RETURN_IF_ERROR( + tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto)); + return HloProtoToModule(proto, debug_options); +} + +/*static*/ StatusOr> +HloRunner::ReadModuleFromHloTextFile(const std::string& filename, + const DebugOptions& debug_options) { string hlo_string; TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(), filename, &hlo_string)); @@ -83,19 +92,6 @@ HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename, return tools::Parse(hlo_string, config); } -/*static*/ StatusOr> HloRunner::ReadModule( - const std::string& filename, const DebugOptions& debug_options) { - auto module = HloRunner::ReadModuleFromHloProtoFile(filename, debug_options); - if (module.ok()) { - return module; - } - const std::string e = module.status().error_message(); - module = HloRunner::ReadModuleFromHloTextDumpFile(filename, debug_options); - return module.ok() ? 
std::move(module) - : Status(module.status().code(), - e + "\n" + module.status().error_message()); -} - // Define this in .cc file to avoid having to include eigen or forward declare // these types in the header. struct HloRunner::EigenThreadPoolWrapper { @@ -121,12 +117,14 @@ StatusOr> HloRunner::ExecuteInternal( if (run_hlo_passes) { TF_ASSIGN_OR_RETURN( module, backend().compiler()->RunHloPasses( - std::move(module), backend().default_stream_executor())); + std::move(module), backend().default_stream_executor(), + /*device_allocator=*/nullptr)); } TF_ASSIGN_OR_RETURN( std::unique_ptr executable, backend().compiler()->RunBackend(std::move(module), - backend().default_stream_executor())); + backend().default_stream_executor(), + /*device_allocator=*/nullptr)); se::Stream stream(backend().default_stream_executor()); stream.Init(); diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index d4b221fb52d..cbaebc68bee 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -52,21 +52,15 @@ class HloRunner { const DebugOptions& debug_options); // Reads the proto file in xla.HloProto format, creates and returns the - // HloModule. Will try to parse the filename as binary proto, then try as - // text proto if that fails. - static StatusOr> ReadModuleFromHloProtoFile( + // HloModule. + static StatusOr> ReadModuleFromBinaryProtoFile( + const std::string& filename, const DebugOptions& debug_options); + static StatusOr> ReadModuleFromTextProtoFile( const std::string& filename, const DebugOptions& debug_options); // Reads the hlo text dump file in HloModule::ToString format, creates and // returns the HloModule. - static StatusOr> ReadModuleFromHloTextDumpFile( - const std::string& filename, const DebugOptions& debug_options); - - // Tries to parse the filename specified first as binary proto format, then - // as a textual proto format, then textual IR, then gives up if both fail. - // ReadModuleFromHloProtoFile or ReadModuleFromHloTextDumpFile should be used - // explicitly when you know the format, this if you don't. - static StatusOr> ReadModule( + static StatusOr> ReadModuleFromHloTextFile( const std::string& filename, const DebugOptions& debug_options); // Executes the given module with given literals as input and returns the diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 6e46f945e0a..04d46565466 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -687,7 +687,8 @@ StatusOr HloVerifier::Run(HloModule* module) { instructions[instruction->name()] = instruction; } - TF_RETURN_IF_ERROR(computation->Accept(shape_verifier_.get())); + std::unique_ptr shape_verifier = shape_verifier_factory_(); + TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get())); } return false; diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 5a1d864e03d..26d53dec1e5 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -106,10 +106,14 @@ class ShapeVerifier : public DfsHloVisitor { class HloVerifier : public HloPassInterface { public: // Uses standard shape inference. - explicit HloVerifier() : shape_verifier_(MakeUnique()) {} + explicit HloVerifier() + : shape_verifier_factory_([] { return MakeUnique(); }) {} + // Uses custom shape verification. 
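// Why a factory rather than a single shared ShapeVerifier: a DfsHloVisitor
// keeps per-run state, so reusing one instance across Run() calls leaks state
// between runs (see the ResetsShapeVerifierState test below). A minimal sketch
// of the idea, with made-up names:
//
//   HloVerifier verifier([] { return MakeUnique<MyShapeVerifier>(); });
//   verifier.Run(module_a);   // gets a fresh MyShapeVerifier for this run
//   verifier.Run(module_b);   // gets another fresh one; no carried-over state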
- explicit HloVerifier(std::unique_ptr shape_verifier) - : shape_verifier_(std::move(shape_verifier)) {} + explicit HloVerifier( + std::function()> shape_verifier_factory) + : shape_verifier_factory_(std::move(shape_verifier_factory)) {} + ~HloVerifier() override = default; tensorflow::StringPiece name() const override { return "verifier"; } @@ -121,8 +125,11 @@ class HloVerifier : public HloPassInterface { // CHECKs various invariants of a fusion instruction. Status CheckFusionInstruction(HloInstruction* fusion) const; - // Verifies shapes match inferred expectations. - std::unique_ptr shape_verifier_; + // Creates a ShapeVerifier that checks that shapes match inferred + // expectations. This is a factory function because ShapeVerifier, Note that + // ShapeVerifier, being a DfsHloVisitor, is stateful. We want a clean object + // for each run of the verifier. + std::function()> shape_verifier_factory_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index 2a3b55decc5..c92db0be14d 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -97,5 +97,31 @@ TEST_F(HloVerifierTest, DifferentOperandParents) { HasSubstr("is in a different computation")); } +TEST_F(HloVerifierTest, ResetsShapeVerifierState) { + HloComputation::Builder builder(TestName()); + Shape s1 = ShapeUtil::MakeShape(F32, {1}); + Shape s2 = ShapeUtil::MakeShape(F32, {2}); + + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "param")); + + // Create an add instruction with the incorrect shape. + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(s2, HloOpcode::kAdd, param, param)); + + // In order to trigger the bug we're checking for, the instruction with the + // bad shape can't be the root of the computation. + builder.AddInstruction( + HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + // Run the verifier twice. It should fail both times, because it shouldn't + // carry state in its DFS visitor between runs. + EXPECT_FALSE(verifier().Run(module.get()).status().ok()); + EXPECT_FALSE(verifier().Run(module.get()).status().ok()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc new file mode 100644 index 00000000000..ada21345014 --- /dev/null +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc @@ -0,0 +1,124 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h" + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +namespace { + +// Visitor for removing implicit broadcasts. +class ImplicitBroadcastVisitor : public DfsHloVisitorWithDefault { + public: + Status DefaultAction(HloInstruction* hlo_instruction) override { + return Status::OK(); + } + + Status HandleElementwiseBinary(HloInstruction* hlo) override { + return ReplaceImplicitBroadcastOperands(hlo); + } + + Status HandleClamp(HloInstruction* hlo) override { + // Clamp is the only element-wise ternary operation. + return ReplaceImplicitBroadcastOperands(hlo); + } + + // Returns whether any modification has been made to any visited instruction. + bool changed() const { return changed_; } + + private: + // Iterates through the operands of 'hlo' and replace any operands which are + // implicitly broadcast with the equivalent sequence of broadcast and reshape + // instructions. An operand is considered to be implicitly broadcast if the + // operand shape does have the same dimensions as the shape of 'hlo'. + Status ReplaceImplicitBroadcastOperands(HloInstruction* hlo) { + auto fadd = [hlo](std::unique_ptr x) { + return hlo->parent()->AddInstruction(std::move(x)); + }; + std::vector operands; + bool operands_changed = false; + for (int i = 0; i < hlo->operand_count(); ++i) { + HloInstruction* operand = hlo->mutable_operand(i); + if (!ShapeUtil::SameDimensions(hlo->shape(), operand->shape())) { + HloInstruction* new_operand = hlo->parent()->AddInstruction( + HloInstruction::CreateBroadcastSequence(hlo->shape(), operand, + fadd)); + operands.push_back(new_operand); + operands_changed = true; + } else { + operands.push_back(operand); + } + } + if (operands_changed) { + // Create a new HLO instruction because the HloInstruction::Replace* + // methods check that the shape does not change with the replacement. + HloInstruction* new_hlo = hlo->parent()->AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), operands)); + TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo)); + changed_ = true; + } + return Status::OK(); + } + + bool changed_ = false; +}; + +} // namespace + +StatusOr ImplicitBroadcastRemover::Run(HloModule* module) { + VLOG(1) << "Removing implicit broadcast from module " << module->name(); + XLA_VLOG_LINES(2, + "Before removing implicit broadcasts:\n" + module->ToString()); + + ImplicitBroadcastVisitor visitor; + for (HloComputation* computation : module->computations()) { + TF_RETURN_IF_ERROR(computation->Accept(&visitor)); + } + + if (visitor.changed()) { + // HLO instructions with implicitly broadcast operands are cloned and left + // for dead. Remove them. 
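// For illustration (ours, not part of this change): the overall rewrite this
// pass performs, shown as HLO text with made-up names and shapes. An operand
// whose dimensions differ from the instruction's shape is made explicit:
//
//   before:  %add = f32[2,4] add(f32[2,4] %x, f32[] %y)   // %y implicitly broadcast
//   after:   %y.bcast = f32[2,4] broadcast(f32[] %y), dimensions={}
//            %add     = f32[2,4] add(f32[2,4] %x, f32[2,4] %y.bcast)
//
// Degenerate dimensions (e.g. f32[1,4] against f32[2,4]) get a reshape first
// and then a broadcast, which is what CreateBroadcastSequence emits above.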
+ HloDCE dce; + TF_RETURN_IF_ERROR(dce.Run(module).status()); + } + + XLA_VLOG_LINES(2, + "After removing implicit broadcasts:\n" + module->ToString()); + + return visitor.changed(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h new file mode 100644 index 00000000000..aa325dc8a35 --- /dev/null +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_ + +#include + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// Pass which replaces all implicit broadcasts with their equivalent sequence of +// explicit broadcast and reshape instructions. +class ImplicitBroadcastRemover : public HloPassInterface { + public: + ImplicitBroadcastRemover() {} + ~ImplicitBroadcastRemover() override {} + + tensorflow::StringPiece name() const override { + return "implicit-broadcast-remover"; + } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_ diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc new file mode 100644 index 00000000000..8c7b38dd1bf --- /dev/null +++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc @@ -0,0 +1,176 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" + +namespace op = xla::testing::opcode_matchers; + +namespace xla { +namespace { + +class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase { + protected: + ImplicitBroadcastRemover remover_; +}; + +TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4}); + auto param0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + auto param1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_FALSE(remover_.Run(&module()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), + op::Add(op::Parameter(), op::Parameter())); +} + +TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "scalar_param")); + auto param1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + + EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + + EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter())); + + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); +} + +TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6}); + auto param0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 4, 1}), "p1")); + builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kSubtract, param0, param1)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Subtract(op::Parameter(), + op::Broadcast(op::Reshape(op::Parameter())))); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); +} + +TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = 
ShapeUtil::MakeShape(F32, {1, 4, 1}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "scalar_param")); + auto param1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kSubtract, param0, param1)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, + op::Subtract(op::Broadcast(op::Parameter()), op::Parameter())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); +} + +TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8}); + auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {1, 4, 1, 8}), "p0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 1, 6, 8}), "p1")); + auto param2 = builder.AddInstruction(HloInstruction::CreateParameter( + 2, ShapeUtil::MakeShape(F32, {2, 1, 6, 8}), "p2")); + builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp, + param0, param1, param2)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())), + op::Broadcast(op::Reshape(op::Parameter())), + op::Broadcast(op::Reshape(op::Parameter())))); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape())); +} + +TEST_F(ImplicitBroadcastRemoverTest, + TernaryScalarAndDegenerateDimensionBroadcast) { + auto builder = HloComputation::Builder(TestName()); + + const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6}); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); + auto param1 = builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {1, 4, 6}), "p1")); + auto param2 = + builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "p2")); + builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp, + param0, param1, param2)); + + HloComputation* computation = module().AddEntryComputation(builder.Build()); + + EXPECT_TRUE(remover_.Run(&module()).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()), + op::Broadcast(op::Reshape(op::Parameter())), + op::Parameter())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape())); + EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape())); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index dc63a2224d6..c83880e0309 100644 --- 
a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -70,15 +70,16 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { } StatusOr> InterpreterCompiler::RunHloPasses( - std::unique_ptr hlo_module, - se::StreamExecutor* /*stream_exec*/) { + std::unique_ptr hlo_module, se::StreamExecutor* /*stream_exec*/, + DeviceMemoryAllocator* /*device_allocator*/) { VLOG(1) << "Run hlo passes on graph " << hlo_module->name(); TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get())); return std::move(hlo_module); } StatusOr> InterpreterCompiler::RunBackend( - std::unique_ptr hlo_module, se::StreamExecutor* stream_exec) { + std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* /*device_allocator*/) { TF_RET_CHECK(stream_exec != nullptr); VLOG(1) << "Run backend " << hlo_module->name(); @@ -96,7 +97,8 @@ StatusOr> InterpreterCompiler::RunBackend( StatusOr>> InterpreterCompiler::Compile( std::vector> /*hlo_modules*/, - std::vector> /*stream_execs*/) { + std::vector> /*stream_execs*/, + DeviceMemoryAllocator* /*device_allocator*/) { return tensorflow::errors::Unimplemented( "Compilation of multiple HLO modules is not supported on Interpreter."); } diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h index 278cf518422..c8660c04d86 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.h +++ b/tensorflow/compiler/xla/service/interpreter/compiler.h @@ -45,16 +45,19 @@ class InterpreterCompiler : public Compiler { StatusOr> RunHloPasses( std::unique_ptr hlo_module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( std::unique_ptr hlo_module, - perftools::gputools::StreamExecutor* stream_exec) override; + perftools::gputools::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr>> Compile( std::vector> hlo_modules, std::vector> - stream_exec) override; + stream_exec, + DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> hlo_modules, diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index bbea6bee565..5413b95cfb6 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -1236,7 +1236,8 @@ Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout, // instruction itself. 
Status SetFusionLayouts(HloInstruction* fusion) { TF_RET_CHECK(fusion->opcode() == HloOpcode::kFusion); - for (auto* fused_instruction : fusion->fused_instructions()) { + for (auto* fused_instruction : + fusion->fused_instructions_computation()->MakeInstructionPostOrder()) { if (fused_instruction->opcode() == HloOpcode::kParameter) { const HloInstruction* fusion_operand = fusion->operand(fused_instruction->parameter_number()); @@ -1251,11 +1252,22 @@ Status SetFusionLayouts(HloInstruction* fusion) { ShapeUtil::Compatible(fusion->shape(), fused_instruction->shape())); TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( fusion->shape(), fused_instruction->mutable_shape())); - } else if (fused_instruction->opcode() != HloOpcode::kConstant && - fused_instruction->opcode() != HloOpcode::kGetTupleElement && - fused_instruction->opcode() != HloOpcode::kInfeed) { - // Internal fused instructions with the exception of constants - // and infeed need no layout. + } else if (fused_instruction->opcode() == HloOpcode::kGetTupleElement) { + // A GTE inherits its layout from its operand (which should ultimately be + // a parameter). + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + fused_instruction->operand(0)->shape().tuple_shapes( + fused_instruction->tuple_index()), + fused_instruction->mutable_shape())); + } else if (fused_instruction->opcode() == HloOpcode::kConstant) { + // Give constants the layout of their literal. + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + fused_instruction->literal().shape(), + fused_instruction->mutable_shape())); + } else if (fused_instruction->opcode() == HloOpcode::kInfeed) { + // Nop; leave the infeed layout alone. + } else { + // Other instructions don't have layouts inside of fusion nodes. LayoutUtil::ClearLayout(fused_instruction->mutable_shape()); } } @@ -1367,20 +1379,6 @@ Status LayoutAssignment::RunOnComputation( << ")"; VLOG(2) << " ComputationLayout = " << computation_layout.ToString(); - // Clear existing layouts of the instructions. All layouts must be assigned by - // the LayoutAssignment pass, except for Infeed, Outfeed, Parameters and the - // computation result. The latter two are specified in computation_layout, so - // we only need to keep the existing layouts for Infeed and Outfeed. Clearing - // the layouts here avoids hiding potential bugs in the layout assignment pass - // that may accidently use the existing layout. - for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kInfeed || - instruction->opcode() == HloOpcode::kOutfeed) { - continue; - } - LayoutUtil::ClearLayout(instruction->mutable_shape()); - } - // Construct LayoutConstraints with all layout constraints of the computation. LayoutConstraints constraints(points_to_analysis, computation); @@ -1458,6 +1456,18 @@ StatusOr LayoutAssignment::Run(HloModule* module) { // is handled before its caller computation. This ensures that the layout of // all callers of a computation will agree. for (auto* computation : module->MakeComputationPostOrder()) { + // Clear existing layouts of the instructions. All layouts must be assigned + // by the LayoutAssignment pass, except for those on infeeds, parameters, + // and the computation result. The latter two are specified in + // computation_layout, so we only need to keep the existing layouts for + // infeeds. Clearing the layouts here avoids hiding potential bugs in the + // layout assignment pass that may accidently use the existing layout. 
+ for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() != HloOpcode::kInfeed) { + LayoutUtil::ClearLayout(instruction->mutable_shape()); + } + } + if (computation == module->entry_computation()) { TF_RETURN_IF_ERROR(RunOnComputation( *entry_computation_layout_, *points_to_analysis, diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index d51c0d1dfb7..e269a13459f 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -35,9 +35,11 @@ limitations under the License. #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/gtl/array_slice.h" namespace op = xla::testing::opcode_matchers; @@ -587,5 +589,74 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) { EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(), transpose->shape(), {2, 3, 0, 1})); } + +// A GTE inside of a fusion node inherits the layout of its operand (which +// should, if we keep following operands, eventually be a parameter). +TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) { + const char* module_str = R"( + HloModule test_module + + fused_computation { + fparam = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0) + gte0 = f32[2,2,2] get-tuple-element(fparam), index=0 + gte1 = (f32[2,2,2], f32[2,2,2]) get-tuple-element(fparam), index=1 + gte1a = f32[2,2,2] get-tuple-element(gte1), index=0 + gte1b = f32[2,2,2] get-tuple-element(gte1), index=1 + add = f32[2,2,2] add(gte1a, gte1b) + ROOT fresult = f32[2,2,2] add(gte0, add) + } + + ENTRY entry_computation { + param = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0) + ROOT fusion = + f32[2,2,2] fusion(param), kind=kLoop, calls=fused_computation + } + )"; + + auto module = tools::Parse(module_str).ValueOrDie(); + ComputationLayout computation_layout( + module->entry_computation()->ComputeProgramShape()); + Shape param_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}), + ShapeUtil::MakeTupleShape({ + ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {1, 2, 0}), + ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {2, 0, 1}), + })}); + TF_ASSERT_OK( + computation_layout.mutable_parameter_layout(0)->CopyLayoutFromShape( + param_shape)); + computation_layout.mutable_result_layout()->ResetLayout( + LayoutUtil::MakeLayout({2, 1, 0})); + AssignLayouts(module.get(), &computation_layout); + + HloComputation* fused_computation = *std::find_if( + module->computations().begin(), module->computations().end(), + [](const HloComputation* c) { return c->name() == "fused_computation"; }); + + auto fused_instr = [&](const string& name) { + auto it = std::find_if( + fused_computation->instructions().begin(), + fused_computation->instructions().end(), + [&](const HloInstruction* i) { return i->name() == name; }); + CHECK(it != fused_computation->instructions().end()); + return *it; + }; + + EXPECT_THAT(fused_instr("gte0")->shape().layout().minor_to_major(), + ElementsAre(0, 1, 2)); + EXPECT_THAT( + 
fused_instr("gte1")->shape().tuple_shapes(0).layout().minor_to_major(), + ElementsAre(1, 2, 0)); + EXPECT_THAT( + fused_instr("gte1")->shape().tuple_shapes(1).layout().minor_to_major(), + ElementsAre(2, 0, 1)); + EXPECT_THAT(fused_instr("gte1a")->shape().layout().minor_to_major(), + ElementsAre(1, 2, 0)); + EXPECT_THAT(fused_instr("gte1b")->shape().layout().minor_to_major(), + ElementsAre(2, 0, 1)); + EXPECT_THAT(fused_instr("fresult")->shape().layout().minor_to_major(), + ElementsAre(2, 1, 0)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc index 34f3419269a..f98fc0400a7 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.cc +++ b/tensorflow/compiler/xla/service/llvm_compiler.cc @@ -18,8 +18,8 @@ limitations under the License. namespace xla { StatusOr>> LLVMCompiler::Compile( std::vector> modules, - std::vector> - stream_execs) { + std::vector> stream_execs, + DeviceMemoryAllocator* device_allocator) { std::vector> result; for (size_t i = 0; i < modules.size(); i++) { if (stream_execs[i].size() != 1) { @@ -27,10 +27,12 @@ StatusOr>> LLVMCompiler::Compile( "Model partitioning not implemented for the CPU/GPU compilers!"); } - TF_ASSIGN_OR_RETURN( - modules[i], RunHloPasses(std::move(modules[i]), stream_execs[i][0])); + TF_ASSIGN_OR_RETURN(modules[i], + RunHloPasses(std::move(modules[i]), stream_execs[i][0], + device_allocator)); TF_ASSIGN_OR_RETURN(std::unique_ptr executable, - RunBackend(std::move(modules[i]), stream_execs[i][0])); + RunBackend(std::move(modules[i]), stream_execs[i][0], + device_allocator)); result.push_back(std::move(executable)); } diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h index c5393cef4f9..d74e81bb7f6 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.h +++ b/tensorflow/compiler/xla/service/llvm_compiler.h @@ -60,17 +60,20 @@ class LLVMCompiler : public Compiler { // Bring in // StatusOr> RunBackend( // std::unique_ptr module, - // perftools::gputools::StreamExecutor* stream_exec) + // perftools::gputools::StreamExecutor* stream_exec, + // DeviceMemoryAllocator* device_allocator) // StatusOr> RunHloPasses( // std::unique_ptr module, - // perftools::gputools::StreamExecutor* stream_exec) + // perftools::gputools::StreamExecutor* stream_exec, + // DeviceMemoryAllocator* device_allocator) using Compiler::RunBackend; using Compiler::RunHloPasses; StatusOr>> Compile( std::vector> modules, std::vector> - stream_execs) override; + stream_execs, + DeviceMemoryAllocator* device_allocator) override; protected: ModuleHook user_pre_optimization_hook_; diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h index 9ad7cd82cb8..242062e6167 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h @@ -34,6 +34,12 @@ namespace xla { // Unlike IrEmitter, this creates host functions which emit IR to generate the // output element at the given index. It is used to generate fused operations. +// +// This class handles both vanilla fusion and multi-output fusion. In the MOF +// case, the fusion node ends with a kTuple instruction, and the root generator +// returned by this emitter returns an LLVM struct with N elements, one for each +// element of the arrays in the tuple. It follows that the arrays in the tuple +// must have the same length. 
class FusedIrEmitter : public DfsHloVisitorWithDefault { public: using Generator = llvm_ir::ElementGenerator; diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc index a5f7c850c33..b6b918ec78a 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc @@ -51,37 +51,40 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, shape_(target_array.GetShape()), ir_builder_(ir_builder) {} +static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion( + const ElementGenerator& target_element_generator, + const std::vector& target_arrays, llvm::IRBuilder<>* ir_builder) { + return [=](const llvm_ir::IrArray::Index array_index) { + TF_ASSIGN_OR_RETURN(llvm::Value * target_element, + target_element_generator(array_index)); + CHECK(target_element->getType()->isStructTy()) + << "This BodyEmitter is for multi-output fusion, but target element " + "generator does not produce values of struct type."; + CHECK_EQ(target_element->getType()->getStructNumElements(), + target_arrays.size()); + + for (int64 i = 0; i < target_arrays.size(); ++i) { + target_arrays[i].EmitWriteArrayElement( + array_index, ir_builder->CreateExtractValue(target_element, i), + ir_builder); + } + return Status::OK(); + }; +} + LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, tensorflow::gtl::ArraySlice target_arrays, llvm::IRBuilder<>* ir_builder) - : body_emitter_([=](const llvm_ir::IrArray::Index array_index) - -> ::tensorflow::Status { - // Convert target_element_generator to a BodyEmitter. - TF_ASSIGN_OR_RETURN(llvm::Value * target_element, - target_element_generator(array_index)); - if (target_arrays.size() == 1) { - target_arrays[0].EmitWriteArrayElement(array_index, target_element, - ir_builder); - return tensorflow::Status::OK(); - } - - for (int64 i = 0; i < target_arrays.size(); ++i) { - target_arrays[i].EmitWriteArrayElement( - array_index, ir_builder_->CreateExtractValue(target_element, i), - ir_builder); - } - return tensorflow::Status::OK(); - }), + : body_emitter_(MakeBodyEmitterForMultiOutputFusion( + target_element_generator, + std::vector(target_arrays.begin(), target_arrays.end()), + ir_builder)), + shape_(target_arrays[0].GetShape()), ir_builder_(ir_builder) { - if (target_arrays.size() > 1) { - // The sanity check for multiple outputs. - shape_ = target_arrays[0].GetShape(); - for (int64 i = 1; i < target_arrays.size(); ++i) { - const Shape& element_shape = target_arrays[i].GetShape(); - CHECK(ShapeUtil::SameDimensions(shape_, element_shape)); - } - } else { - shape_ = target_arrays[0].GetShape(); + // Sanity check: In multi-output fusion, all shapes produced must have the + // same dimensions. + for (const IrArray& array : target_arrays) { + CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape())); } } diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h index 1ef1dc24644..0fc528439a0 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h @@ -47,10 +47,16 @@ class LoopEmitter { // element of the given target array. LoopEmitter(const ElementGenerator& target_element_generator, const IrArray& target_array, llvm::IRBuilder<>* ir_builder); - // Same as previous method except emits multiple targets in an array. 
+ + // Constructs a LoopEmitter that emits one element into each of N separate + // arrays on each iteration of the loop. + // + // This is used for multi-output fusion. target_element_generator must + // produce an LLVM struct with N elements. LoopEmitter(const ElementGenerator& target_element_generator, tensorflow::gtl::ArraySlice target_arrays, llvm::IRBuilder<>* ir_builder); + LoopEmitter(const LoopEmitter&) = delete; LoopEmitter& operator=(const LoopEmitter&) = delete; virtual ~LoopEmitter() = default; diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index f30530db08e..07f989d4fae 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/backend.h" @@ -71,7 +72,7 @@ LocalService::LocalService(const ServiceOptions& options, StatusOr> LocalService::CompileExecutable( const ComputationHandle& computation, const tensorflow::gtl::ArraySlice argument_layouts, - const Shape* result_layout, int device_ordinal) { + const ExecutableBuildOptions& build_options) { TF_ASSIGN_OR_RETURN(UserComputation * user_computation, computation_tracker_.Resolve(computation)); VersionedComputationHandle versioned_handle = @@ -112,14 +113,19 @@ StatusOr> LocalService::CompileExecutable( ShapeUtil::HumanString(argument_shape).c_str()); } } - if (result_layout != nullptr) { - TF_RETURN_IF_ERROR( - ValidateResultShapeWithLayout(*result_layout, program_shape->result())); + if (build_options.result_layout() != nullptr) { + TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout( + *build_options.result_layout(), program_shape->result())); } ExecutionOptions execution_options = CreateDefaultExecutionOptions(); - if (result_layout != nullptr) { - *execution_options.mutable_shape_with_output_layout() = *result_layout; + if (build_options.generate_hlo_graph().has_value()) { + execution_options.mutable_debug_options()->set_xla_generate_hlo_graph( + build_options.generate_hlo_graph().value()); + } + if (build_options.result_layout() != nullptr) { + *execution_options.mutable_shape_with_output_layout() = + *build_options.result_layout(); } else { *execution_options.mutable_shape_with_output_layout() = program_shape->result(); @@ -131,11 +137,13 @@ StatusOr> LocalService::CompileExecutable( CreateModuleConfig(*program_shape, argument_layouts, &execution_options, *user_computation)); - TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, - execute_backend_->stream_executor(device_ordinal)); + TF_ASSIGN_OR_RETURN( + se::StreamExecutor * executor, + execute_backend_->stream_executor(build_options.device_ordinal())); return BuildExecutable(versioned_handle, std::move(module_config), - execute_backend_.get(), executor); + execute_backend_.get(), executor, + build_options.device_allocator()); } StatusOr LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) { diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index acbc7268252..15e120685e1 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -18,6 +18,7 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -41,11 +42,13 @@ class LocalService : public Service { // Builds an Executable with the given argument layouts and options. If // result_layout is non-null, then the executable is compiled to produce a - // result of the given layout. + // result of the given layout. If device_allocator is non-null, then the + // compiler may use it to allocate temp space on the device. The compiler is + // responsible for freeing any memory it allocates this way. StatusOr> CompileExecutable( const ComputationHandle& computation, const tensorflow::gtl::ArraySlice argument_layouts, - const Shape* result_layout, int device_ordinal); + const ExecutableBuildOptions& options); // Returns the device ordinal that corresponds to the given replica number. // diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 849df1d8e63..98dfc89867a 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -337,7 +337,8 @@ StatusOr>> Service::BuildExecutables( std::vector versioned_handles, std::vector> module_configs, Backend* backend, - std::vector> executors) { + std::vector> executors, + DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. @@ -383,7 +384,8 @@ StatusOr>> Service::BuildExecutables( TF_ASSIGN_OR_RETURN( std::vector> executables, - backend->compiler()->Compile(std::move(modules), std::move(executors))); + backend->compiler()->Compile(std::move(modules), std::move(executors), + device_allocator)); for (size_t i = 0; i < versioned_handles.size(); ++i) { if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) { @@ -396,8 +398,8 @@ StatusOr>> Service::BuildExecutables( StatusOr> Service::BuildExecutable( const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, - Backend* backend, se::StreamExecutor* executor) { + std::unique_ptr module_config, Backend* backend, + se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this, versioned_handle.ToString().c_str()); @@ -430,11 +432,12 @@ StatusOr> Service::BuildExecutable( TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module)); TF_ASSIGN_OR_RETURN( - module, backend->compiler()->RunHloPasses(std::move(module), executor)); + module, backend->compiler()->RunHloPasses(std::move(module), executor, + device_allocator)); - TF_ASSIGN_OR_RETURN( - std::unique_ptr executable, - backend->compiler()->RunBackend(std::move(module), executor)); + TF_ASSIGN_OR_RETURN(std::unique_ptr executable, + backend->compiler()->RunBackend( + std::move(module), executor, device_allocator)); if (!other_directory_path.empty()) { executable->set_session_module(std::move(session_module)); @@ -445,9 +448,9 @@ StatusOr> Service::BuildExecutable( StatusOr> Service::BuildAndCacheExecutable( const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, - Backend* backend, perftools::gputools::StreamExecutor* executor, - ExecutionProfile* profile) { + std::unique_ptr module_config, Backend* backend, + perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile, + DeviceMemoryAllocator* device_allocator) 
{ std::shared_ptr executable = compilation_cache_.LookUp(versioned_handle, *module_config); @@ -469,7 +472,7 @@ StatusOr> Service::BuildAndCacheExecutable( TF_ASSIGN_OR_RETURN( std::unique_ptr executable_unique_ptr, BuildExecutable(versioned_handle, std::move(module_config), backend, - executor)); + executor, device_allocator)); if (profile != nullptr) { uint64 end_micros = tensorflow::Env::Default()->NowMicros(); @@ -771,10 +774,14 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg, // Build the user computations into HloModules and compile to generate the // executables. + // + // TODO(jlebar): There's currently no way to pass a device allocator to + // ExecuteParallel, so we have to pass a null device_allocator below. TF_ASSIGN_OR_RETURN( std::vector> executables, BuildExecutables(versioned_handles, std::move(module_configs), - execute_backend_.get(), all_executors)); + execute_backend_.get(), all_executors, + /*device_allocator=*/nullptr)); std::vector executable_ptrs; executable_ptrs.reserve(executables.size()); for (const auto& executable : executables) { @@ -1446,9 +1453,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) { handle_status = computation->AddInfeedInstruction(arg->infeed_request()); break; case OpRequest::kOutfeedRequest: - TF_RETURN_IF_ERROR( - computation->AddOutfeedInstruction(arg->outfeed_request())); - return tensorflow::Status::OK(); + handle_status = + computation->AddOutfeedInstruction(arg->outfeed_request()); + break; case OpRequest::kMapRequest: { TF_ASSIGN_OR_RETURN( UserComputation * to_apply, @@ -1612,14 +1619,14 @@ StatusOr> Service::Replicas( } Status Service::MaybeDumpHloModule(const HloModule& module) const { - const string xla_dump_prepass_hlo_proto_to = - module.config().debug_options().xla_dump_prepass_hlo_proto_to(); - if (xla_dump_prepass_hlo_proto_to.empty()) { + const string xla_dump_unoptimized_hlo_proto_to = + module.config().debug_options().xla_dump_unoptimized_hlo_proto_to(); + if (xla_dump_unoptimized_hlo_proto_to.empty()) { return Status::OK(); } HloProto proto = MakeHloProto(module); return protobuf_util::DumpProtoToDirectory( - proto, xla_dump_prepass_hlo_proto_to, module.name()); + proto, xla_dump_unoptimized_hlo_proto_to, module.name()); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index ca77e8fe3a6..6ce24197115 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -280,10 +280,15 @@ class Service : public ServiceInterface { const UserComputation& user_computation); // Builds an Executable for the given parameters. + // + // If device_allocator is not null, the compiler may use it to allocate temp + // buffers, which the compiler is responsible for freeing. The allocator + // given here need not match the allocator used when running the executable. StatusOr> BuildExecutable( const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, - Backend* backend, perftools::gputools::StreamExecutor* executor); + std::unique_ptr module_config, Backend* backend, + perftools::gputools::StreamExecutor* executor, + DeviceMemoryAllocator* device_allocator = nullptr); // Same as BuildExecutable() above, but builds a list of Executables for the // given computations that may interact with each other. 
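// An illustrative sketch, not part of the patch: the comment added above
// spells out the device_allocator contract -- if non-null, the compiler may
// borrow it for compile-time scratch buffers and must free everything it
// allocated before returning, and the allocator used at run time may be a
// different one. A self-contained restatement of that contract
// (ScratchAllocator and CompileStep are invented names, not XLA's
// DeviceMemoryAllocator API):

#include <cstddef>
#include <cstdlib>

class ScratchAllocator {
 public:
  void* Allocate(std::size_t bytes) { return std::malloc(bytes); }
  void Deallocate(void* ptr) { std::free(ptr); }
};

void CompileStep(ScratchAllocator* device_allocator /* may be null */) {
  if (device_allocator == nullptr) {
    // Call sites without an allocator pass nullptr (as the updated tests do)
    // and the compiler takes a path that needs no device scratch memory.
    return;
  }
  void* scratch = device_allocator->Allocate(1 << 20);
  // ... e.g. measure or autotune using `scratch` during compilation ...
  device_allocator->Deallocate(scratch);  // the compiler frees what it allocated
}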
@@ -291,16 +296,17 @@ class Service : public ServiceInterface { std::vector versioned_handles, std::vector> module_configs, Backend* backend, - std::vector> executors); + std::vector> executors, + DeviceMemoryAllocator* device_allocator); // Similar to BuildExecutable, but look in the compilation cache for the // executable first. If the executable is not in the cache, it is built and // inserted into the cache. StatusOr> BuildAndCacheExecutable( const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, - Backend* backend, perftools::gputools::StreamExecutor* executor, - ExecutionProfile* profile); + std::unique_ptr module_config, Backend* backend, + perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile, + DeviceMemoryAllocator* device_allocator = nullptr); // Runs the given executable with the given arguments and register the result // in the allocation tracker. The handle of the result from the tracker is diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index a6d6c8b27f8..4ba6da6ccc4 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -37,6 +37,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" +using tensorflow::str_util::Join; +using tensorflow::strings::Printf; + namespace xla { namespace { @@ -934,7 +937,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}", BinaryOperation_Name(operation).c_str(), ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(), - tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str()); + Join(broadcast_dimensions, ", ").c_str()); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs)); TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs)); @@ -1097,7 +1100,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Map operation requires all operands to have the same shape; got: " "%s", - tensorflow::str_util::Join(pieces, ", ").c_str()); + Join(pieces, ", ").c_str()); } // Check that dimensions.size == arg_shape.dimensions_size() (we currently @@ -1114,7 +1117,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( if (dimensions[i] != i) { return InvalidArgument( "Map requires monotonically increasing dimension numbers, found: %s ", - tensorflow::str_util::Join(dimensions, ", ").c_str()); + Join(dimensions, ", ").c_str()); } } @@ -1914,21 +1917,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( const Shape& arg, tensorflow::gtl::ArraySlice starts, tensorflow::gtl::ArraySlice limits, tensorflow::gtl::ArraySlice strides) { + auto error = [&](const string& message) { + return InvalidArgument( + "%s in slice operation; argument shape: %s; starts: {%s}; limits: " + "{%s}; strides: {%s}", + message.c_str(), ShapeUtil::HumanString(arg).c_str(), + Join(starts, ",").c_str(), Join(limits, ",").c_str(), + Join(strides, ",").c_str()); + }; TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice")); VLOG(2) << tensorflow::strings::Printf( "slicing shape %s starts={%s} limits={%s}", - ShapeUtil::HumanString(arg).c_str(), - tensorflow::str_util::Join(starts, ", ").c_str(), - tensorflow::str_util::Join(limits, ", ").c_str()); + ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(), + Join(limits, ", ").c_str()); if (starts.size() != 
limits.size()) { - return InvalidArgument("slice start and limit sizes differ: %zu vs %zu", - starts.size(), limits.size()); + return error(Printf("slice start and limit sizes differ: %zu vs %zu", + starts.size(), limits.size())); } if (starts.size() != strides.size()) { - return InvalidArgument("slice start and strides sizes differ: %zu vs %zu", - starts.size(), strides.size()); + return error(Printf("slice start and strides sizes differ: %zu vs %zu", + starts.size(), strides.size())); } if (starts.size() != ShapeUtil::Rank(arg)) { @@ -1947,20 +1957,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( start_index); } if (limit_index > arg.dimensions(dimension)) { - return InvalidArgument( - "limit index (%lld) must be less than or equal to dimension " - "size (%lld)", - limit_index, arg.dimensions(dimension)); + return error( + Printf("limit index (%lld) must be less than or equal to dimension " + "size (%lld)", + limit_index, arg.dimensions(dimension))); } VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension, start_index); VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension, limit_index); if (start_index > limit_index) { - return InvalidArgument( - "limit index (%lld) must be greater or equal to " - "start index (%lld) in slice with positive stride", - limit_index, start_index); + return error( + Printf("limit index (%lld) must be greater or equal to " + "start index (%lld) in slice with positive stride", + limit_index, start_index)); } if (stride <= 0) { return InvalidArgument("stride (%lld) must be positive", stride); @@ -1983,7 +1993,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}", ShapeUtil::HumanString(operand_shape).c_str(), ShapeUtil::HumanString(start_indices_shape).c_str(), - tensorflow::str_util::Join(slice_sizes, ", ").c_str()); + Join(slice_sizes, ", ").c_str()); if (ShapeUtil::Rank(start_indices_shape) != 1) { return InvalidArgument( @@ -2280,8 +2290,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( return InvalidArgument( "Reshape dimensions [%s] are not a permutation of the operand " "dimensions (operand shape is %s).", - tensorflow::str_util::Join(dimensions, ",").c_str(), - ShapeUtil::HumanString(operand).c_str()); + Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str()); } return inferred_shape; @@ -2373,8 +2382,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( // The applied function's arity equals the number of arguments. 
if (arg_shapes.size() != to_apply.parameters_size()) { string computation_signature = ShapeUtil::HumanString(to_apply); - string argument_shapes = tensorflow::str_util::Join( - arg_shapes, ", ", [](string* out, const Shape* shape) { + string argument_shapes = + Join(arg_shapes, ", ", [](string* out, const Shape* shape) { tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape)); }); return InvalidArgument( diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 99d87f3b550..026c0211657 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -1512,5 +1512,20 @@ TEST_F(ShapeInferenceTest, Conditional) { "must have the same shape")); } +TEST_F(ShapeInferenceTest, BadSlice) { + auto arg = ShapeUtil::MakeShape(F32, {4}); + StatusOr statusor = + ShapeInference::InferSliceShape(arg, {0}, {5}, {1}); + ASSERT_FALSE(statusor.ok()); + + LOG(INFO) << statusor.status(); + + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("less than or equal to dimension size")) + << statusor.status(); + EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape")) + << statusor.status(); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc index 2ea6507900e..ef9c80b0431 100644 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ b/tensorflow/compiler/xla/service/user_computation.cc @@ -1185,7 +1185,7 @@ StatusOr UserComputation::AddInfeedInstruction( return handle; } -Status UserComputation::AddOutfeedInstruction( +StatusOr UserComputation::AddOutfeedInstruction( const OutfeedRequest& outfeed_request) { tensorflow::mutex_lock lock(mutex_); @@ -1197,8 +1197,6 @@ Status UserComputation::AddOutfeedInstruction( // Verify that operand is valid. TF_RETURN_IF_ERROR(LookUpRequest(outfeed_request.operand()).status()); - // No handle is returned, but a handle must be assigned to this instruction - // for computation versioning. ComputationDataHandle handle = CreateComputationDataHandle(); OperationRequest& request = (*session_computation_.mutable_requests())[handle.handle()]; @@ -1209,7 +1207,7 @@ Status UserComputation::AddOutfeedInstruction( VLOG(1) << "AddOutfeedInstruction (" << GetVersionedHandleInternal() << "), data handle " << handle.handle() << ": " << outfeed_request.ShortDebugString(); - return Status::OK(); + return handle; } StatusOr UserComputation::AddCallInstruction( diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h index 4f92e58877a..54bb24d6d7f 100644 --- a/tensorflow/compiler/xla/service/user_computation.h +++ b/tensorflow/compiler/xla/service/user_computation.h @@ -146,7 +146,8 @@ class UserComputation { const InfeedRequest& infeed_request); // Enqueues an outfeed instruction onto this user computation. - Status AddOutfeedInstruction(const OutfeedRequest& outfeed_request); + StatusOr AddOutfeedInstruction( + const OutfeedRequest& outfeed_request); // Enqueues a call instruction onto this user computation. 
StatusOr AddCallInstruction( diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc index ca02115863e..2fa163953f6 100644 --- a/tensorflow/compiler/xla/service/user_computation_test.cc +++ b/tensorflow/compiler/xla/service/user_computation_test.cc @@ -67,7 +67,8 @@ TEST_F(UserComputationTest, SimpleComputation) { *outfeed_request.mutable_operand() = constant_handle; *outfeed_request.mutable_shape() = kVectorShape; outfeed_request.set_outfeed_config("abc"); - TF_ASSERT_OK(computation.AddOutfeedInstruction(outfeed_request)); + TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle outfeed_handle, + computation.AddOutfeedInstruction(outfeed_request)); auto hlo_resolver = [](const VersionedComputationHandle& handle) { return nullptr; diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index cba73322fa9..d63e16ce2bf 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -475,8 +475,6 @@ StatusOr StringToPrimitiveType(const string& name) { if (LayoutUtil::HasLayout(shape)) { tensorflow::strings::StrAppend(&result, LayoutUtil::HumanString(shape.layout())); - } else { - tensorflow::strings::StrAppend(&result, "{no layout}"); } } return result; diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 4410647f848..d4820d1b6d6 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -578,6 +578,7 @@ xla_test( xla_test( name = "reduce_precision_test", srcs = ["reduce_precision_test.cc"], + tags = ["enable_for_xla_interpreter"], deps = [ "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:literal_util", diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 56fc21d019b..52e14a1f7b9 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -1893,6 +1893,26 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) { error_spec_); } +XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) { + ComputationBuilder builder(client_, TestName()); + auto min_vector = builder.ConstantR1({1, -6, 1, 2, 0, -5}); + auto arg_vector = builder.ConstantR1({2, 10, -5, 1, 4, 10}); + auto max_vector = builder.ConstantR1({3, 0, 25, 5, 123, -1}); + auto clamp = builder.Clamp(min_vector, arg_vector, max_vector); + + ComputeAndCompareR1(&builder, {2, 0, 1, 2, 4, -1}, {}); +} + +XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) { + ComputationBuilder builder(client_, TestName()); + auto min_vector = builder.ConstantR1({1, 2, 1, 2, 0, ~0u - 4}); + auto arg_vector = builder.ConstantR1({2, 10, 5, 1, 4, 10}); + auto max_vector = builder.ConstantR1({3, 5, 25, 5, 123, ~0u}); + auto clamp = builder.Clamp(min_vector, arg_vector, max_vector); + + ComputeAndCompareR1(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {}); +} + XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) { ComputationBuilder builder(client_, TestName()); diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc index e472408dcf7..022641394f1 100644 --- a/tensorflow/compiler/xla/tests/codegen_test_base.cc +++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc @@ -21,9 +21,11 @@ StatusOr> CodegenTestBase::CompileToExecutable( std::unique_ptr hlo_module) { TF_ASSIGN_OR_RETURN(hlo_module, backend().compiler()->RunHloPasses( 
std::move(hlo_module), - backend().default_stream_executor())); + backend().default_stream_executor(), + /*device_allocator=*/nullptr)); return backend().compiler()->RunBackend(std::move(hlo_module), - backend().default_stream_executor()); + backend().default_stream_executor(), + /*device_allocator=*/nullptr); } StatusOr> diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 7c1a993b478..9f5806c5e16 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -230,7 +230,7 @@ template const string& filename, const tensorflow::gtl::optional& error, const std::function& reference_preprocessor) { auto module_or_status = - HloRunner::ReadModule(filename, GetDebugOptionsForTest()); + HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest()); if (!module_or_status.ok()) { return ::testing::AssertionFailure() << "failed reading hlo module from file"; @@ -258,7 +258,7 @@ template const string& filename, const tensorflow::gtl::optional& error, const std::function& reference_preprocessor) { auto module_or_status = - HloRunner::ReadModule(filename, GetDebugOptionsForTest()); + HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest()); if (!module_or_status.ok()) { return ::testing::AssertionFailure() << "failed reading hlo module from file"; diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc index f8205de702f..474d2547aeb 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util.cc @@ -355,9 +355,9 @@ class NearComparator { // temporary files on failure. Returns true if literals match. bool ExpectNear(const Literal& expected, const Literal& actual) { VLOG(1) << "expected:"; - XLA_VLOG_LINES(1, expected.ToString()); + XLA_VLOG_LINES(1, TruncateHugeLiteral(expected)); VLOG(1) << "actual:"; - XLA_VLOG_LINES(1, actual.ToString()); + XLA_VLOG_LINES(1, TruncateHugeLiteral(actual)); // If the shapes mismatch, we simply fail the expectation instead of // printing out data, as it's a type error rather than a value error. 
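// An illustrative sketch, not part of the patch: in the NearComparator hunks
// above and below, a value pair is a miscompare only when its NaN-ness
// differs or it violates BOTH the absolute and the relative bound of the
// ErrorSpec. A self-contained restatement of that predicate (ErrorSpecSketch
// and ValuesNearSketch are invented names):

#include <cmath>

struct ErrorSpecSketch {
  float abs;  // absolute error bound
  float rel;  // relative error bound
};

bool ValuesNearSketch(float expected, float actual,
                      const ErrorSpecSketch& spec) {
  const float abs_diff = std::fabs(actual - expected);
  const float rel_err = abs_diff / std::fabs(expected);
  const bool nan_mismatch = std::isnan(expected) != std::isnan(actual);
  return !(nan_mismatch || (abs_diff >= spec.abs && rel_err >= spec.rel));
}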
@@ -376,7 +376,12 @@ class NearComparator { abs_expected_miscompare_sum_ = 0.0; max_rel_err_ = 0.0; max_abs_err_ = 0.0; + first_linear_index_ = -1; + last_linear_index_ = -1; + max_rel_linear_index_ = -1; + max_abs_linear_index_ = -1; miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED)); + miscompares_.PopulateWithValue(false); multi_index_.resize(expected.shape().dimensions_size(), 0); switch (expected.shape().element_type()) { @@ -404,21 +409,33 @@ class NearComparator { if (num_miscompares_ > 0) { if (!VLOG_IS_ON(1)) { LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape()) - << " " << expected.ToString(); + << " " << TruncateHugeLiteral(expected); LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual.shape()) - << " " << actual.ToString(); + << " " << TruncateHugeLiteral(actual); + LOG(INFO) << "Dumping literals to temp files..."; + WriteLiteralToTempFile(expected, "expected"); + WriteLiteralToTempFile(actual, "actual"); + WriteLiteralToTempFile(miscompares_, "miscompares"); } EXPECT_TRUE(num_miscompares_ == 0) << "\nmax relative mismatch at index " - << LiteralTestUtil::MultiIndexAsString(max_rel_multi_index_) + << LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex( + actual.shape(), max_rel_linear_index_)) << "\nmaximum relative error " << max_rel_err_ << "\nmax absolute mismatch at index " - << LiteralTestUtil::MultiIndexAsString(max_abs_multi_index_) + << LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex( + actual.shape(), max_abs_linear_index_)) << "\nmaximum absolute error " << max_abs_err_ << "\nfirst mismatch at index " - << LiteralTestUtil::MultiIndexAsString(first_multi_index_) + << LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex( + actual.shape(), first_linear_index_)) << "\nlast mismatch at index " - << LiteralTestUtil::MultiIndexAsString(last_multi_index_) + << LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex( + actual.shape(), last_linear_index_)) << "\ntotal absolute error " << abs_diff_sum_ << "\ntotal absolute error of miscompares " << abs_diff_miscompare_sum_ << "\ntotal relative error " @@ -426,10 +443,6 @@ class NearComparator { << "\ntotal relative error of miscompares " << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_) << "\nfailure count " << num_miscompares_; - - WriteLiteralToTempFile(expected, "expected"); - WriteLiteralToTempFile(actual, "actual"); - WriteLiteralToTempFile(miscompares_, "miscompares"); } return num_miscompares_ == 0; } @@ -457,57 +470,93 @@ class NearComparator { return true; } - float abs_diff = std::abs(actual - expected); - float rel_err = abs_diff / std::abs(expected); + const float abs_diff = std::abs(actual - expected); + const float rel_err = abs_diff / std::abs(expected); + const bool nan_mismatch = NanMismatch(expected, actual); + const bool mismatch = + (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel)); + return !mismatch; + } + + // Assumes that expected vs actual fail ExpectValuesNear. 
+ template + void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual, + const Shape& shape, const int64 linear_index) { + const float abs_diff = std::abs(actual - expected); + const float rel_err = abs_diff / std::abs(expected); abs_diff_sum_ += abs_diff; abs_expected_sum_ += std::abs(expected); - if (rel_err > max_rel_err_) { + if (rel_err > max_rel_err_ || std::isnan(rel_err)) { max_rel_err_ = rel_err; - max_rel_multi_index_ = multi_index_; + max_rel_linear_index_ = linear_index; } - if (abs_diff > max_abs_err_) { + if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) { max_abs_err_ = abs_diff; - max_abs_multi_index_ = multi_index_; + max_abs_linear_index_ = linear_index; } - VLOG(10) << tensorflow::strings::Printf( - "index %s abs_diff %f rel_err %f", - LiteralTestUtil::MultiIndexAsString(multi_index_).c_str(), abs_diff, - rel_err); - bool nan_mismatch = NanMismatch(expected, actual); - bool mismatch = - (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel)); - if (mismatch) { - abs_diff_miscompare_sum_ += abs_diff; - abs_expected_miscompare_sum_ += std::abs(expected); - const int64 kMaxFailures = 2; - if (num_miscompares_ < kMaxFailures) { - ::testing::Message msg; - msg << "mismatch at index " - << LiteralTestUtil::MultiIndexAsString(multi_index_) << " abs diff " - << abs_diff << " rel err " << rel_err << " failure #" - << num_miscompares_; - ExpectNear(expected, actual, msg); - } else if (num_miscompares_ == kMaxFailures) { - LOG(ERROR) - << "reached max 'loud' failure count; silently proceeding..."; - } - if (num_miscompares_ == 0) { - first_multi_index_ = multi_index_; - } - num_miscompares_++; - last_multi_index_ = multi_index_; + if (VLOG_IS_ON(10)) { + VLOG(10) << tensorflow::strings::Printf( + "index %s abs_diff %f rel_err %f", + LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex(shape, + linear_index)) + .c_str(), + abs_diff, rel_err); } - return !mismatch; + abs_diff_miscompare_sum_ += abs_diff; + abs_expected_miscompare_sum_ += std::abs(expected); + const int64 kMaxFailures = 2; + if (num_miscompares_ < kMaxFailures) { + const auto multi_index = + IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index); + ::testing::Message msg; + msg << "mismatch at index " + << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff " + << abs_diff << " rel err " << rel_err << " failure #" + << num_miscompares_; + ExpectNear(expected, actual, msg); + } else if (num_miscompares_ == kMaxFailures) { + LOG(ERROR) << "reached max 'loud' failure count; silently proceeding..."; + } + if (num_miscompares_ == 0) { + first_linear_index_ = linear_index; + } + num_miscompares_++; + last_linear_index_ = linear_index; + miscompares_.data()[linear_index] = true; } // Recursive function which compares the two given literals elementwise. template void ExpectLiteralsNear(const Literal& expected, const Literal& actual, int64 dimension) { + // Fast path optimization for the case were layouts match. 
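// (Expanding the fast-path comment above: when the two literals share a
// layout, element i of one flat data buffer lines up with element i of the
// other, so they can be compared linearly, skipping the recursive multi-index
// walk used below when the layouts differ.)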
+ if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) { + tensorflow::gtl::ArraySlice expected_data = + expected.data(); + tensorflow::gtl::ArraySlice actual_data = + actual.data(); + const int64 len = expected_data.size(); + for (int64 i = 0; i < len; ++i) { + const bool near = ExpectValuesNear(expected_data[i], actual_data[i]); + if (!near) { + UpdateAndLogMiscompares(expected_data[i], actual_data[i], + actual.shape(), i); + } + } + return; + } + if (dimension == expected.shape().dimensions_size()) { bool near = ExpectValuesNear(expected.Get(multi_index_), actual.Get(multi_index_)); - miscompares_.Set(multi_index_, !near); + if (!near) { + UpdateAndLogMiscompares( + expected.Get(multi_index_), + actual.Get(multi_index_), actual.shape(), + IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(), + multi_index_)); + } } else { for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) { multi_index_[dimension] = i; @@ -528,6 +577,32 @@ class NearComparator { LOG(ERROR) << "wrote to " << name << " file: " << filename; } + // Gets the total element count. For tuples, this is not the count of tuple + // elements, but the sum of elements of each tuple element. + int64 RecursiveElementCount(const Shape& shape) { + if (ShapeUtil::IsTuple(shape)) { + const int64 tuple_elements = ShapeUtil::TupleElementCount(shape); + int64 total = 0; + for (int64 i = 0; i < tuple_elements; ++i) { + total += + RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); + } + return total; + } else { + return ShapeUtil::ElementsIn(shape); + } + } + + // Calling ToString on a literal with over 100 million elements takes around + // 3 minutes. The utility of printing a literal with >1000 elements is + // questionable, especially when writing the Literal proto to disk is orders + // of magnitude faster. + string TruncateHugeLiteral(const Literal& literal) { + return RecursiveElementCount(literal.shape()) < 1000 + ? literal.ToString() + : "[TRUNCATED, Literal with more than 1000 values]"; + } + ErrorSpec error_; // Number of element miscomparisons encountered so far. 
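// An illustrative sketch, not part of the patch: RecursiveElementCount above
// sums leaf array elements across arbitrarily nested tuple shapes, and
// TruncateHugeLiteral switches to the placeholder string once that sum
// reaches 1000. The recursion restated over a stand-in type (ShapeSketch is
// invented; the real code walks xla::Shape via ShapeUtil):

#include <cstdint>
#include <vector>

struct ShapeSketch {
  int64_t elements = 0;             // leaf: number of array elements
  std::vector<ShapeSketch> tuple;   // non-empty => this is a tuple shape
};

int64_t RecursiveElementCountSketch(const ShapeSketch& shape) {
  if (shape.tuple.empty()) {
    return shape.elements;
  }
  int64_t total = 0;
  for (const ShapeSketch& element : shape.tuple) {
    total += RecursiveElementCountSketch(element);
  }
  return total;
}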
@@ -548,10 +623,10 @@ class NearComparator { double abs_expected_miscompare_sum_; float max_rel_err_; float max_abs_err_; - std::vector first_multi_index_; - std::vector last_multi_index_; - std::vector max_rel_multi_index_; - std::vector max_abs_multi_index_; + int64 first_linear_index_; + int64 last_linear_index_; + int64 max_rel_linear_index_; + int64 max_abs_linear_index_; }; template <> @@ -584,6 +659,23 @@ bool NearComparator::ExpectValuesNear(half expected, half actual) { static_cast(std::move(actual))); } +template <> +void NearComparator::UpdateAndLogMiscompares( + const bfloat16 expected, const bfloat16 actual, const Shape& shape, + const int64 linear_index) { + UpdateAndLogMiscompares(static_cast(expected), + static_cast(actual), shape, linear_index); +} + +template <> +void NearComparator::UpdateAndLogMiscompares(half expected, half actual, + const Shape& shape, + const int64 linear_index) { + UpdateAndLogMiscompares(static_cast(std::move(expected)), + static_cast(std::move(actual)), shape, + linear_index); +} + } // namespace /* static */ ::testing::AssertionResult LiteralTestUtil::Near( diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc index e477784557a..3a421f84582 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc @@ -97,5 +97,29 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { } } +TEST(LiteralTestUtilTest, NearComparatorR1) { + auto a = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + auto b = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); +} + +TEST(LiteralTestUtilTest, NearComparatorR1Nan) { + auto a = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8}); + auto b = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8}); + EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); +} + +TEST(LiteralTestUtil, NearComparatorDifferentLengths) { + auto a = + Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); + auto b = Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7}); + EXPECT_FALSE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001})); + EXPECT_FALSE(LiteralTestUtil::Near(*b, *a, ErrorSpec{0.0001})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index b5b95967ff9..7e92439c494 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -74,7 +74,8 @@ class LLVMCompilerTest : public ::testing::Test { ASSERT_TRUE(compiler ->RunBackend(std::move(hlo_module), - backend_->default_stream_executor()) + backend_->default_stream_executor(), + /*device_allocator=*/nullptr) .ok()); // Test that hooks were called. 
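// An illustrative sketch, not part of the patch: the comparator hunks above
// now store flat indices (first_linear_index_ and friends) and convert back
// to a multi-index with IndexUtil::LinearIndexToMultidimensionalIndex only
// when printing a failure. The decoding, assuming XLA's convention that
// minor_to_major lists dimensions from most-minor to most-major
// (LinearToMultiSketch is an invented name, not the IndexUtil API):

#include <cstdint>
#include <vector>

std::vector<int64_t> LinearToMultiSketch(
    int64_t linear, const std::vector<int64_t>& dims,
    const std::vector<int64_t>& minor_to_major) {
  std::vector<int64_t> multi(dims.size(), 0);
  for (int64_t dim : minor_to_major) {  // peel off most-minor dimensions first
    multi[dim] = linear % dims[dim];
    linear /= dims[dim];
  }
  return multi;
}

// For example, with dims = {2, 3} and minor_to_major = {1, 0} (row-major),
// linear index 4 decodes to the multi-index {1, 1}.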
@@ -98,7 +99,8 @@ class LLVMCompilerTest : public ::testing::Test { executors.push_back({backend_->default_stream_executor()}); executors.push_back({backend_->default_stream_executor()}); - EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors))); + EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors), + /*device_allocator=*/nullptr)); } private: diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc index 3fd83a4c3b1..8cef8dd34dc 100644 --- a/tensorflow/compiler/xla/tests/pad_test.cc +++ b/tensorflow/compiler/xla/tests/pad_test.cc @@ -33,6 +33,14 @@ limitations under the License. namespace xla { namespace { +#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16 +// Tests both F32 and BF16. +static std::array use_bfloat16_params{false, true}; +#else +// Only tests F32. +static std::array use_bfloat16_params{false}; +#endif + class PadTest : public ClientLibraryTestBase { protected: PadTest() { @@ -61,8 +69,22 @@ class PadTest : public ClientLibraryTestBase { PaddingConfig r4_padding_on_dim0_dim1_; }; +class PadTestFloat : public PadTest, + public ::testing::WithParamInterface { + protected: + PadTestFloat() { set_use_bfloat16(GetParam()); } + + ErrorSpec DefaultErrorSpec() const { + if (use_bfloat16()) { + return ErrorSpec(1e-3, 1e-3); + } else { + return ErrorSpec(1e-5, 1e-5); + } + } +}; + // Tests a Pad() with a zero-element input and output. -XLA_TEST_F(PadTest, Pad1DS0ToS0Array) { +XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) { ComputationBuilder b(client_, TestName()); // Set up the padding configuration {low: 0, high: 0, interior: 0}. PaddingConfig padding_config; @@ -71,12 +93,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS0Array) { dimension->set_edge_padding_high(0); dimension->set_interior_padding(0); - b.Pad(b.ConstantR1({}), b.ConstantR0(0.1), padding_config); - ComputeAndCompareR1(&b, {}, {}, ErrorSpec(0.0001)); + b.Pad(AddParam(*Literal::CreateR1({}), &b), + AddParam(*Literal::CreateR0(0.1), &b), padding_config); + ComputeAndCompareR1(&b, {}, {}, DefaultErrorSpec()); } // Tests a Pad() with a zero-element input but a non-zero-element output. -XLA_TEST_F(PadTest, Pad1DS0ToS5Array) { +XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) { ComputationBuilder b(client_, TestName()); // Set up the padding configuration {low: 3, high: 0, interior: 1}. PaddingConfig padding_config; @@ -85,12 +108,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS5Array) { dimension->set_edge_padding_high(4); dimension->set_interior_padding(7); - b.Pad(b.ConstantR1({}), b.ConstantR0(0.1), padding_config); + b.Pad(AddParam(*Literal::CreateR1({}), &b), + AddParam(*Literal::CreateR0(0.1), &b), padding_config); ComputeAndCompareR1(&b, std::vector(5, 0.1), {}, - ErrorSpec(0.0001)); + DefaultErrorSpec()); } -XLA_TEST_F(PadTest, Pad1DS3Array) { +XLA_TEST_P(PadTestFloat, Pad1DS3Array) { ComputationBuilder b(client_, TestName()); // Set up the padding configuration {low: 3, high: 0, interior: 1}. 
PaddingConfig padding_config; @@ -99,21 +123,21 @@ XLA_TEST_F(PadTest, Pad1DS3Array) { dimension->set_edge_padding_high(0); dimension->set_interior_padding(1); - b.Pad(b.ConstantR1({1, 2, 3}), b.ConstantR0(0.1), - padding_config); + b.Pad(AddParam(*Literal::CreateR1({1, 2, 3}), &b), + AddParam(*Literal::CreateR0(0.1), &b), padding_config); std::vector expected({0.1, 0.1, 0.1, 1, 0.1, 2, 0.1, 3}); - ComputeAndCompareR1(&b, expected, {}, ErrorSpec(0.0001)); + ComputeAndCompareR1(&b, expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, Pad4D_2x0x3x2_FloatArray) { +XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) { ComputationBuilder b(client_, TestName()); - b.Pad(b.ConstantR4FromArray4D(Array4D(2, 0, 3, 2)), - b.ConstantR0(1.5), r4_padding_on_dim0_dim1_); + b.Pad(AddParam(Array4D(2, 0, 3, 2), &b), + AddParam(*Literal::CreateR0(1.5), &b), r4_padding_on_dim0_dim1_); ComputeAndCompareR4(&b, Array4D(5, 2, 3, 2, 1.5f), {}, - ErrorSpec(0.0001)); + DefaultErrorSpec()); } -TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) { +TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) { ComputationBuilder b(client_, TestName()); auto input = MakeUnique>(1, 1, 3, 2); Array2D input_xy({ @@ -123,7 +147,7 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) { }); input->FillWithYX(input_xy); - b.Pad(b.ConstantR4FromArray4D(*input), b.ConstantR0(1.5), + b.Pad(AddParam(*input, &b), AddParam(*Literal::CreateR0(1.5), &b), r4_padding_on_dim0_dim1_); auto expected = MakeUnique>(2, 3, 3, 2); @@ -134,15 +158,15 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) { (*expected)(1, 0, 1, 1) = 4.0f; (*expected)(1, 0, 2, 0) = 5.0f; (*expected)(1, 0, 2, 1) = 6.0f; - ComputeAndCompareR4(&b, *expected, {}, ErrorSpec(0.0001)); + ComputeAndCompareR4(&b, *expected, {}, DefaultErrorSpec()); } -TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) { +TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) { ComputationBuilder b(client_, TestName()); const float pad_value = 1.5f; Array4D input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6}); - b.Pad(b.ConstantR4FromArray4D(input), b.ConstantR0(pad_value), + b.Pad(AddParam(input, &b), AddParam(*Literal::CreateR0(pad_value), &b), r4_padding_on_dim0_dim1_); auto expected = MakeUnique>(8, 5, 1, 1); @@ -156,7 +180,7 @@ TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) { ComputeAndCompareR4(&b, *expected, {}, ErrorSpec(0.0001)); } -TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) { +TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) { ComputationBuilder b(client_, TestName()); PaddingConfig padding_config; @@ -184,7 +208,8 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) { auto input = Literal::CreateR4FromArray4D(input_array); input = input->Relayout(layout); - b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config); + b.Pad(AddParam(*input, &b), + AddParam(*Literal::CreateR0(pad_value), &b), padding_config); Array4D expected_array(1, 1, 5, 8); expected_array.Fill(pad_value); @@ -197,7 +222,7 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) { ComputeAndCompareR4(&b, expected_array, {}, ErrorSpec(0.0001)); } -XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) { +XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) { ComputationBuilder b(client_, TestName()); PaddingConfig padding_config; @@ -229,7 +254,8 @@ XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) { auto input = Literal::CreateR4FromArray4D(input_array); input = input->Relayout(layout); - b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config); + 
b.Pad(AddParam(*input, &b), + AddParam(*Literal::CreateR0(pad_value), &b), padding_config); Array4D expected_array(1, 25, 17, 11); expected_array.Fill(pad_value); @@ -249,7 +275,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) { }); input->FillWithYX(input_xy); - b.Pad(b.ConstantR4FromArray4D(*input), b.ConstantR0(35), + b.Pad(AddParam(*input, &b), b.ConstantR0(35), r4_padding_on_dim0_dim1_); auto expected = MakeUnique>(2, 3, 3, 2); @@ -277,8 +303,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) { auto ones = MakeUnique>(2, 3, 3, 2); zeros->Fill(0); ones->Fill(1); - b.Select(padded, b.ConstantR4FromArray4D(*ones), - b.ConstantR4FromArray4D(*zeros)); + b.Select(padded, AddParam(*ones, &b), AddParam(*zeros, &b)); auto expected = MakeUnique>(2, 3, 3, 2); expected->Fill(0); @@ -291,10 +316,12 @@ XLA_TEST_F(PadTest, Pad4DPredArray) { ComputeAndCompareR4(&b, *expected, {}); } -XLA_TEST_F(PadTest, Large2DPad) { +XLA_TEST_P(PadTestFloat, Large2DPad) { ComputationBuilder b(client_, TestName()); - auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {4, 4}), "input"); + auto ones = MakeUnique>(4, 4); + ones->Fill(1.0f); + auto input = AddParam(*ones, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); for (int dim : {0, 1}) { padding_config.mutable_dimensions(dim)->set_edge_padding_low( @@ -302,25 +329,22 @@ XLA_TEST_F(PadTest, Large2DPad) { padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 + 100 * dim); } - auto padded = b.Pad(input, b.ConstantR0(0.0f), padding_config); - - auto ones = MakeUnique>(4, 4); - ones->Fill(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(*ones); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(0.0f), &b), + padding_config); auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f); - ComputeAndCompareR2(&b, *expected, {input_data.get()}); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, AllTypes2DPad) { +XLA_TEST_P(PadTestFloat, AllTypes2DPad) { ComputationBuilder b(client_, TestName()); constexpr int64 in_rows = 35; constexpr int64 in_cols = 35; - auto input = - b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input"); + auto operand = MakeUnique>(in_rows, in_cols); + operand->FillUnique(0.0f); + auto input = AddParam(*operand, &b); + PaddingConfig padding_config = MakeNoPaddingConfig(2); padding_config.mutable_dimensions(0)->set_edge_padding_low(7); padding_config.mutable_dimensions(0)->set_edge_padding_high(5); @@ -328,20 +352,14 @@ XLA_TEST_F(PadTest, AllTypes2DPad) { padding_config.mutable_dimensions(1)->set_edge_padding_low(6); padding_config.mutable_dimensions(1)->set_edge_padding_high(4); padding_config.mutable_dimensions(1)->set_interior_padding(2); - auto padded = b.Pad(input, b.ConstantR0(3.14f), padding_config); - - auto operand = MakeUnique>(in_rows, in_cols); - operand->FillUnique(0.0f); - auto input_literal = Literal::CreateR2FromArray2D(*operand); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(3.14f), &b), + padding_config); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f); - ComputeAndCompareR2(&b, *expected, {input_data.get()}, - ErrorSpec{0.0001}); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, High2DPad) { +XLA_TEST_P(PadTestFloat, High2DPad) { ComputationBuilder b(client_, TestName()); constexpr int64 
in_rows = 129; @@ -349,8 +367,9 @@ XLA_TEST_F(PadTest, High2DPad) { constexpr int64 low_padding = 0; int64 high_padding[2] = {5, 7}; constexpr int64 interior_padding = 0; - auto input = - b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input"); + auto operand = MakeUnique>(in_rows, in_cols); + operand->FillUnique(1.0f); + auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); for (int dim : {0, 1}) { padding_config.mutable_dimensions(dim)->set_edge_padding_low(low_padding); @@ -359,20 +378,15 @@ XLA_TEST_F(PadTest, High2DPad) { padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding); } - auto padded = b.Pad(input, b.ConstantR0(2.718f), padding_config); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), + padding_config); - auto operand = MakeUnique>(in_rows, in_cols); - operand->FillUnique(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(*operand); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); - ComputeAndCompareR2(&b, *expected, {input_data.get()}, - ErrorSpec(0.0001)); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, NegativePadding2D) { +XLA_TEST_P(PadTestFloat, NegativePadding2D) { ComputationBuilder b(client_, TestName()); constexpr int64 in_rows = 129; @@ -380,8 +394,9 @@ XLA_TEST_F(PadTest, NegativePadding2D) { int64 low_padding[2] = {-1, -2}; int64 high_padding[2] = {-3, 4}; constexpr int64 interior_padding = 0; - auto input = - b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input"); + auto operand = MakeUnique>(in_rows, in_cols); + operand->FillUnique(1.0f); + auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); for (int dim : {0, 1}) { padding_config.mutable_dimensions(dim)->set_edge_padding_low( @@ -391,20 +406,15 @@ XLA_TEST_F(PadTest, NegativePadding2D) { padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding); } - auto padded = b.Pad(input, b.ConstantR0(2.718f), padding_config); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), + padding_config); - auto operand = MakeUnique>(in_rows, in_cols); - operand->FillUnique(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(*operand); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); - ComputeAndCompareR2(&b, *expected, {input_data.get()}, - ErrorSpec(0.0001)); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } -XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) { +XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) { ComputationBuilder b(client_, TestName()); constexpr int64 in_rows = 8; @@ -412,8 +422,9 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) { int64 low_padding[2] = {4, -1}; int64 high_padding[2] = {-2, -4}; int64 interior_padding[2] = {1, 2}; - auto input = - b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input"); + auto operand = MakeUnique>(in_rows, in_cols); + operand->FillUnique(1.0f); + auto input = AddParam(*operand, &b); PaddingConfig padding_config = MakeNoPaddingConfig(2); for (int dim : {0, 1}) { padding_config.mutable_dimensions(dim)->set_edge_padding_low( @@ -423,44 +434,40 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) { 
padding_config.mutable_dimensions(dim)->set_interior_padding( interior_padding[dim]); } - auto padded = b.Pad(input, b.ConstantR0(2.718f), padding_config); + auto padded = b.Pad(input, AddParam(*Literal::CreateR0(2.718f), &b), + padding_config); - auto operand = MakeUnique>(in_rows, in_cols); - operand->FillUnique(1.0f); - auto input_literal = Literal::CreateR2FromArray2D(*operand); auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); - ComputeAndCompareR2(&b, *expected, {input_data.get()}, - ErrorSpec(0.0001)); + ComputeAndCompareR2(&b, *expected, {}, DefaultErrorSpec()); } // Regression test for b/31827337. -XLA_TEST_F(PadTest, ReducePad) { +XLA_TEST_P(PadTestFloat, ReducePad) { ComputationBuilder b(client_, TestName()); - auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2, 2}), "input"); + auto ones = MakeUnique>(2, 2, 2, 2); + ones->Fill(1.0); + auto input = AddParam(*ones, &b); - Computation add_f32 = CreateScalarAddComputation(F32, &b); - auto reduce = b.Reduce(input, b.ConstantR0(0.0), add_f32, {0}); + Computation add = CreateScalarAddComputation(FloatType(), &b); + auto reduce = + b.Reduce(input, AddParam(*Literal::CreateR0(0.0), &b), add, {0}); PaddingConfig padding_config = MakeNoPaddingConfig(3); padding_config.mutable_dimensions(0)->set_edge_padding_low(1); padding_config.mutable_dimensions(0)->set_edge_padding_high(1); - auto pad = b.Pad(reduce, b.ConstantR0(0.0), padding_config); - - auto ones = MakeUnique>(2, 2, 2, 2); - ones->Fill(1.0); - auto input_literal = Literal::CreateR4FromArray4D(*ones); - std::unique_ptr input_data = - client_->TransferToServer(*input_literal).ConsumeValueOrDie(); + auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0(0.0f), &b), + padding_config); Array3D expected({{{0.0, 0.0}, {0.0, 0.0}}, {{2.0, 2.0}, {2.0, 2.0}}, {{2.0, 2.0}, {2.0, 2.0}}, {{0.0, 0.0}, {0.0, 0.0}}}); - ComputeAndCompareR3(&b, expected, {input_data.get()}); + ComputeAndCompareR3(&b, expected, {}, DefaultErrorSpec()); } +INSTANTIATE_TEST_CASE_P(PadTestFloatInstantiation, PadTestFloat, + ::testing::ValuesIn(use_bfloat16_params)); + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc index 4756ba09689..dc7ce3253ce 100644 --- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc @@ -249,7 +249,9 @@ INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest, // ReducePrecisionInsertion passes. class ReducePrecisionInsertionTest : public ClientLibraryTestBase {}; -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) { +// The interpreter has no fusion pass, so skip this test. +XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); @@ -276,7 +278,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) { ComputeAndCompareR1(&builder, {0.0f}, {a_data.get()}); } -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) { +// The interpreter has no fusion pass, so skip this test. 
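The PadTest changes above exercise edge (low/high), interior, and negative padding through a PaddingConfig. As a reading aid only (not part of the patch), here is a rough NumPy sketch of the per-dimension behavior that a reference like ReferenceUtil::PadArray2D applies; the helper name and structure below are hypothetical.

```python
import numpy as np

def pad_dim(values, pad_value, low, high, interior):
    # Hypothetical helper, one dimension of an XLA-style PaddingConfig:
    # insert `interior` pad values between neighbors, then apply edge padding,
    # where a negative edge amount trims elements instead of adding them.
    out = []
    for i, v in enumerate(values):
        if i > 0:
            out.extend([pad_value] * interior)
        out.append(v)
    out = [pad_value] * low + out if low >= 0 else out[-low:]
    out = out + [pad_value] * high if high >= 0 else out[:high]
    return np.array(out)

# low=1, high=2, interior=1 turns [1, 2, 3] into [0, 1, 0, 2, 0, 3, 0, 0];
# low=-1 instead drops the first element of the interior-padded result.
print(pad_dim([1, 2, 3], 0, low=1, high=2, interior=1))
print(pad_dim([1, 2, 3], 0, low=-1, high=0, interior=1))
```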
+XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); @@ -300,7 +304,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) { ComputeAndCompareR1(&builder, {-1.00001f}, {a_data.get()}); } -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) { +// The interpreter has no fusion pass, so skip this test. +XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); @@ -322,7 +328,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) { ComputeAndCompareR1(&builder, {-1.0f}, {a_data.get()}); } -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) { +// The interpreter has no fusion pass, so skip this test. +XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); @@ -345,7 +353,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) { ComputeAndCompareR1(&builder, {-1.00001f}, {a_data.get()}); } -XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedFusionContains) { +// The interpreter has no fusion pass, so skip this test. +XLA_TEST_F(ReducePrecisionInsertionTest, + DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) { ComputationBuilder builder(client_, TestName()); std::unique_ptr a_literal = Literal::CreateR1({1.00001}); diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc index a766fa2db0e..50d7b5074d2 100644 --- a/tensorflow/compiler/xla/tests/reduce_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_test.cc @@ -494,6 +494,26 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) { ErrorSpec(0.01, 1e-4)); } +// Test that algebraic simplifier does not incorrectly fold a transpose into a +// reduction operation. 
+XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) { + ComputationBuilder builder(client_, TestName()); + Computation add_f32 = CreateScalarAddComputation(F32, &builder); + const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50}); + ComputationDataHandle input = builder.Parameter(0, input_shape, "input"); + ComputationDataHandle zero = builder.ConstantR0(0.0); + ComputationDataHandle transpose = + builder.Transpose(input, /*permutation=*/{1, 0, 2}); + ComputationDataHandle reduce = + builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0}); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr input_data, + MakeFakeLiteral(input_shape)); + + ComputeAndCompare(&builder, reduce, {std::move(*input_data)}, + ErrorSpec(0.01, 1e-4)); +} + XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) { const int64 rows = 111, cols = 50; diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc index debf2d2d317..4da6ee91607 100644 --- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc +++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc @@ -737,7 +737,61 @@ XLA_TEST_F(ScalarComputationsTest, PowScalar) { ComputeAndCompareR0(&builder, 8.0, {}, error_spec_); } -XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) { +XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(-1), // The lower bound. + builder.ConstantR0(5), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 3, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(-1), // The lower bound. + builder.ConstantR0(2), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 2, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(-1), // The lower bound. + builder.ConstantR0(-5), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, -1, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(1), // The lower bound. + builder.ConstantR0(5), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 3, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(1), // The lower bound. + builder.ConstantR0(2), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 2, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) { + ComputationBuilder builder(client_, TestName()); + builder.Clamp(builder.ConstantR0(1), // The lower bound. + builder.ConstantR0(0), // The operand to be clamped. + builder.ConstantR0(3)); // The upper bound. + + ComputeAndCompareR0(&builder, 1, {}); +} + +XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) { ComputationBuilder builder(client_, TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(5.0f), // The operand to be clamped. 
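The new ReduceTest above guards against the algebraic simplifier folding a transpose into a reduce with the wrong dimension mapping. A quick NumPy sanity check of the equivalence being exercised (illustration only; shapes taken from the test):

```python
import numpy as np

x = np.random.rand(12, 111, 50)
# Reducing dimension 0 of transpose(x, {1, 0, 2}) matches reducing dimension 1
# of x directly; a correct fold must remap the reduced dimension this way.
via_transpose = x.transpose(1, 0, 2).sum(axis=0)   # shape (12, 50)
direct = x.sum(axis=1)                             # shape (12, 50)
np.testing.assert_allclose(via_transpose, direct)
```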
@@ -746,7 +800,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) { ComputeAndCompareR0(&builder, 3.0, {}, error_spec_); } -XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) { +XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) { ComputationBuilder builder(client_, TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(2.5f), // The operand to be clamped. @@ -755,7 +809,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) { ComputeAndCompareR0(&builder, 2.5, {}, error_spec_); } -XLA_TEST_F(ScalarComputationsTest, ClampScalarLow) { +XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) { ComputationBuilder builder(client_, TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(-5.0f), // The operand to be clamped. @@ -852,5 +906,12 @@ XLA_TEST_F(ScalarComputationsTest, SqrtF320) { ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); } +XLA_TEST_F(ScalarComputationsTest, RoundScalar) { + ComputationBuilder builder(client_, TestName()); + builder.Round(builder.ConstantR0(1.4f)); + + ComputeAndCompareR0(&builder, 1.0f, {}, error_spec_); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index 0e90a323583..b060fb13b14 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -24,51 +24,127 @@ namespace xla { namespace { template -void PopulateWithRandomFloatingPointData(Literal* literal) { +void PopulateWithRandomFloatingPointData(Literal* literal, + std::minstd_rand0* engine) { CHECK_EQ(literal->shape().element_type(), primitive_util::NativeToPrimitiveType()); - std::minstd_rand0 engine; - // Create uniform numbers between 1 and 1.125 ot avoid creating denormal + // Create uniform numbers between 1 and 1.125 to avoid creating denormal // numbers. std::uniform_real_distribution generator(1.0f, 1.125f); + const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000; TF_CHECK_OK(literal->Populate( [&](tensorflow::gtl::ArraySlice indices) { - // Generate a random uniforma number from -0.0625 and 0.0625 and bias it - // with a position dependent number with mean 0.037109375. These number + // Generate a random uniform number from -0.0625 and 0.0625 and bias it + // with a position dependent number with mean 0.037109375. These number // should allow for long chains of accumulation without being too close - // to zero or to large to accumulate all numbers accurately. - return (generator(engine) - 1.0625) + - static_cast(Product(indices) % 113 - 47) / - static_cast(256.0f); + // to zero or too large to accumulate all numbers accurately. Only do + // this for large literals where the number of elements is much greater + // than 47 otherwise only negative values are produced. + // + // The value is positionally biased using a product of the indices. Add + // one to each index value to avoid collapsing to zero if any of the + // indices are zero. + int64 index_product = 1; + for (int64 i : indices) { + index_product *= (1 + i); + } + const int64 negative_bias = should_index_bias ? 47 : 0; + FloatT index_bias = + static_cast(index_product % 113 - negative_bias) / + static_cast(256.0f); + return (generator(*engine) - 1.0625) + index_bias; })); } // The standard library does not have a case for bfloat16, unsurprisingly, so we // handle that one specially. 
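The comment above explains why PopulateWithRandomFloatingPointData only applies the negative bias of 47 to large literals. A small illustration of that reasoning (not part of the patch; the Python helper below is hypothetical):

```python
def index_bias(indices, negative_bias):
    # For a tiny literal every index product stays far below 47, so subtracting
    # 47 would make every positional bias (and hence most values) negative; for
    # large literals the products wrap around modulo 113 and both signs occur.
    index_product = 1
    for i in indices:
        index_product *= (1 + i)   # +1 keeps a zero index from zeroing the product
    return (index_product % 113 - negative_bias) / 256.0

print([index_bias((i,), negative_bias=47) for i in range(4)])   # all negative
biases = [index_bias((i,), negative_bias=47) for i in range(200)]
print(min(biases), max(biases))   # roughly -0.18 .. 0.25: both signs present
```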
template <> -void PopulateWithRandomFloatingPointData(Literal* literal) { +void PopulateWithRandomFloatingPointData(Literal* literal, + std::minstd_rand0* engine) { CHECK_EQ(literal->shape().element_type(), BF16); - std::minstd_rand0 engine; std::uniform_real_distribution generator(-0.9f, 1.0f); TF_CHECK_OK(literal->Populate( [&](tensorflow::gtl::ArraySlice /*indices*/) { - return static_cast(generator(engine)); + return static_cast(generator(*engine)); })); } template -void PopulateWithRandomIntegralData(Literal* literal) { +void PopulateWithRandomIntegralData(Literal* literal, + std::minstd_rand0* engine) { CHECK_EQ(literal->shape().element_type(), primitive_util::NativeToPrimitiveType()); - std::minstd_rand0 engine; std::uniform_int_distribution generator( std::numeric_limits::lowest(), std::numeric_limits::max()); TF_CHECK_OK(literal->Populate( [&](tensorflow::gtl::ArraySlice /*indices*/) { - return generator(engine); + return generator(*engine); })); } +// Similar to MakeFakeLiteral but takes a random number generator engine to +// enable reusing the engine across randomly generated literals. +StatusOr> MakeFakeLiteralInternal( + const Shape& shape, std::minstd_rand0* engine) { + if (ShapeUtil::IsTuple(shape)) { + std::vector> elements; + for (const Shape& element_shape : shape.tuple_shapes()) { + TF_ASSIGN_OR_RETURN(std::unique_ptr element, + MakeFakeLiteralInternal(element_shape, engine)); + elements.push_back(std::move(element)); + } + return Literal::MakeTupleOwned(std::move(elements)); + } + std::unique_ptr literal = Literal::CreateFromShape(shape); + switch (shape.element_type()) { + case BF16: + PopulateWithRandomFloatingPointData(literal.get(), engine); + break; + case F32: + PopulateWithRandomFloatingPointData(literal.get(), engine); + break; + case F64: + PopulateWithRandomFloatingPointData(literal.get(), engine); + break; + case S8: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case U8: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case S16: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case U16: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case S32: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case U32: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case S64: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case U64: + PopulateWithRandomIntegralData(literal.get(), engine); + break; + case PRED: { + std::uniform_int_distribution generator(0, 1); + TF_CHECK_OK(literal->Populate( + [&](tensorflow::gtl::ArraySlice /*indices*/) { + return generator(*engine); + })); + break; + } + default: + return Unimplemented("Unsupported type for fake literal generation: %s", + ShapeUtil::HumanString(shape).c_str()); + } + return std::move(literal); +} + // Matches binary addition computations. bool LooksLikeSum(const HloComputation& computation) { const HloInstruction* const root = computation.root_instruction(); @@ -95,15 +171,15 @@ bool NeedsZeroInitValue(const HloUse& use) { // Generate random values that are constrained to the input_shape minus the // output_shape so as not to produce wrapping slices, for instance. 
std::unique_ptr MakeRandomNonwrappingSliceIndex( - const Shape& input_shape, const Shape& slice_shape) { + const Shape& input_shape, const Shape& slice_shape, + std::minstd_rand0* engine) { const int64 rank = ShapeUtil::Rank(input_shape); std::vector start_indices(rank); - std::minstd_rand0 engine; for (int i = 0; i < rank; ++i) { const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) - ShapeUtil::GetDimension(slice_shape, i); std::uniform_int_distribution generator(0, upper_bound); - start_indices[i] = generator(engine); + start_indices[i] = generator(*engine); } return Literal::CreateR1(start_indices); } @@ -150,7 +226,7 @@ std::vector FindConstrainedUses( // zero in the case of init_values for reductions). StatusOr> CreateLiteralForConstrainedUses( const tensorflow::gtl::ArraySlice constrained_uses, - const HloInstruction& param) { + const HloInstruction& param, std::minstd_rand0* engine) { HloInstruction* needs_index = nullptr; HloInstruction* needs_zero = nullptr; for (HloInstruction* use : constrained_uses) { @@ -185,93 +261,39 @@ StatusOr> CreateLiteralForConstrainedUses( } if (needs_index != nullptr) { return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(), - needs_index->shape()); + needs_index->shape(), engine); } else if (needs_zero != nullptr) { return Literal::CreateFromShape(param.shape()); } else { - return MakeFakeLiteral(param.shape()); + return MakeFakeLiteralInternal(param.shape(), engine); } } // Given a module entry parameter, use the dataflow analysis to see if a // special case literal must be created, or if we can generate fake data. StatusOr> MakeConstrainedArgument( - const HloDataflowAnalysis& dataflow, const HloInstruction& param) { + const HloDataflowAnalysis& dataflow, const HloInstruction& param, + std::minstd_rand0* engine) { const auto constrained_uses = FindConstrainedUses(dataflow, param); - return CreateLiteralForConstrainedUses(constrained_uses, param); + return CreateLiteralForConstrainedUses(constrained_uses, param, engine); } } // namespace StatusOr> MakeFakeLiteral(const Shape& shape) { - if (ShapeUtil::IsTuple(shape)) { - std::vector> elements; - for (const Shape& element_shape : shape.tuple_shapes()) { - TF_ASSIGN_OR_RETURN(std::unique_ptr element, - MakeFakeLiteral(element_shape)); - elements.push_back(std::move(element)); - } - return Literal::MakeTupleOwned(std::move(elements)); - } - std::unique_ptr literal = Literal::CreateFromShape(shape); - switch (shape.element_type()) { - case BF16: - PopulateWithRandomFloatingPointData(literal.get()); - break; - case F32: - PopulateWithRandomFloatingPointData(literal.get()); - break; - case F64: - PopulateWithRandomFloatingPointData(literal.get()); - break; - case S8: - PopulateWithRandomIntegralData(literal.get()); - break; - case U8: - PopulateWithRandomIntegralData(literal.get()); - break; - case S16: - PopulateWithRandomIntegralData(literal.get()); - break; - case U16: - PopulateWithRandomIntegralData(literal.get()); - break; - case S32: - PopulateWithRandomIntegralData(literal.get()); - break; - case U32: - PopulateWithRandomIntegralData(literal.get()); - break; - case S64: - PopulateWithRandomIntegralData(literal.get()); - break; - case U64: - PopulateWithRandomIntegralData(literal.get()); - break; - case PRED: { - std::uniform_int_distribution generator(0, 1); - std::minstd_rand0 engine; - TF_CHECK_OK(literal->Populate( - [&](tensorflow::gtl::ArraySlice /*indices*/) { - return generator(engine); - })); - break; - } - default: - return Unimplemented("Unsupported type for 
fake literal generation: %s", - ShapeUtil::HumanString(shape).c_str()); - } - return std::move(literal); + std::minstd_rand0 engine; + return MakeFakeLiteralInternal(shape, &engine); } StatusOr>> MakeFakeArguments( HloModule* const module) { TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(module)); const auto params = module->entry_computation()->parameter_instructions(); + std::minstd_rand0 engine; std::vector> arguments(params.size()); for (int i = 0; i < params.size(); ++i) { - TF_ASSIGN_OR_RETURN(arguments[i], - MakeConstrainedArgument(*dataflow, *params[i])); + TF_ASSIGN_OR_RETURN( + arguments[i], MakeConstrainedArgument(*dataflow, *params[i], &engine)); } return std::move(arguments); } diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc index 5ede37b8737..b82f1c81c84 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc @@ -85,10 +85,12 @@ void RealMain(tensorflow::gtl::ArraySlice args) { for (int i = 0; i < program_shape->parameters_size(); ++i) { layouts.push_back(&program_shape->parameters(i)); } + ExecutableBuildOptions build_options; + build_options.set_device_ordinal(0); + build_options.set_result_layout(program_shape->result()); StatusOr> executable = local_service->CompileExecutable(computation.handle(), layouts, - &program_shape->result(), - /*device_ordinal=*/0); + build_options); const HloModule& module = executable.ValueOrDie()->module(); diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc index 24417a0cb82..05c0fdf97d2 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc @@ -60,10 +60,13 @@ void RealMain(tensorflow::gtl::ArraySlice args, bool compile) { for (int i = 0; i < program_shape->parameters_size(); ++i) { layouts.push_back(&program_shape->parameters(i)); } + + ExecutableBuildOptions build_options; + build_options.set_device_ordinal(0); + build_options.set_result_layout(program_shape->result()); StatusOr> executable = local_service->CompileExecutable(computation.handle(), layouts, - &program_shape->result(), - /*device_ordinal=*/0); + build_options); const HloModule& module = executable.ValueOrDie()->module(); diff --git a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc index 4e02e17db65..8460ae3e499 100644 --- a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc +++ b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc @@ -19,7 +19,7 @@ limitations under the License. // // Reads one serilized Hlo module, convert it into JSON format and dump into // some output directory. some_binaray_proto is obtained by serializing Hlo -// module to disk using --xla_dump_hlo_proto_to debug optoin. +// module to disk using --xla_dump_optimized_hlo_proto_to debug option. 
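MakeFakeArguments above now threads a single std::minstd_rand0 through every MakeFakeLiteralInternal call. Presumably this avoids separate default-seeded engines handing every parameter the identical pseudo-random stream; a rough Python analogue of that pitfall (illustration only, not the library's API):

```python
import random

def fake_values(engine=None):
    # Stand-in for the C++ code: a default-constructed engine always starts
    # from the same seed, so per-call engines repeat the same stream.
    engine = engine if engine is not None else random.Random(0)
    return [engine.random() for _ in range(3)]

print(fake_values() == fake_values())               # True: each call repeats itself
shared = random.Random(0)
print(fake_values(shared) == fake_values(shared))   # False: a shared engine advances
```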
#include #include diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc index 42e7f91f26f..d9c4d094b8e 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc +++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc @@ -2173,7 +2173,7 @@ bool HloParser::ParseConvolutionDimensionNumbers( // // {[2:3:4], [5:6:7], [8:9]} // -// The the parsed result will be: +// The parsed result will be: // // {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}} // diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index b0209050350..1f0c626bbb2 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -339,7 +339,7 @@ std::vector> CommonFactors( string SanitizeFileName(string file_name) { for (char& c : file_name) { - if (c == '/' || c == '\\' || c == '[' || c == ']') { + if (c == '/' || c == '\\' || c == '[' || c == ']' || c == ' ') { c = '_'; } } diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 4bc2d632cd8..08df5b12b3a 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -217,6 +217,24 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); // Passed-varargs variant of the InvalidArgument factory above. Status InvalidArgumentV(const char* format, va_list args); +template +Status UnimplementedStrCat(Args&&... concat) { + return Unimplemented( + "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); +} + +template +Status InternalErrorStrCat(Args&&... concat) { + return InternalError( + "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); +} + +template +Status ResourceExhaustedStrCat(Args&&... concat) { + return ResourceExhausted( + "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); +} + // Splits the lines of the original, replaces leading whitespace with the prefix // given by "indentation", and returns the string joined by newlines again. As a // side effect, any additional trailing whitespace is removed. @@ -342,7 +360,7 @@ T CeilOfRatio(T dividend, T divisor) { } // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio -// then multiplying by the divisor. For example: RoundUpToMultiple(13, 8) => 16 +// then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16 template T RoundUpToNearest(T value, T divisor) { return CeilOfRatio(value, divisor) * divisor; @@ -350,7 +368,7 @@ T RoundUpToNearest(T value, T divisor) { // Rounds the value down to a multiple of the divisor by first calling // FloorOfRatio then multiplying by the divisor. For example: -// RoundUpToMultiple(13, 8) => 8 +// RoundDownToNearest(13, 8) => 8 template T RoundDownToNearest(T value, T divisor) { return FloorOfRatio(value, divisor) * divisor; diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index e1ed08c8480..56162ab44e2 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -82,8 +82,9 @@ message DebugOptions { // Dump all HLO modules as text into the provided directory path. string xla_generate_hlo_text_to = 7; - // Dump compilation artifacts in binary proto into this directory. - string xla_dump_hlo_proto_to = 8; + // Dump Hlo after all hlo passes are executed as proto binary into this + // directory. + string xla_dump_optimized_hlo_proto_to = 8; // Instrument the computation to collect per-HLO cycle counts. 
bool xla_hlo_profile = 9; @@ -179,9 +180,13 @@ message DebugOptions { // ops. bool xla_gpu_use_cudnn_batchnorm = 94; - // Dump compilation artifacts, before hlo passes are executed, in binary proto - // into this directory. - string xla_dump_prepass_hlo_proto_to = 95; + // Dump HLO before any hlo passes are executed as proto binary into this + // directory. + string xla_dump_unoptimized_hlo_proto_to = 95; + + // Dump HLO after each pass as an HloProto in binary file format into this + // directory. + string xla_dump_per_pass_hlo_proto_to = 96; // Extra options to pass to the compilation backend; specific interpretation // of these values is left to the backend. diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index e04ce01f437..3e9664a0187 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -25,6 +25,7 @@ py_library( "//tensorflow/contrib/bayesflow:bayesflow_py", "//tensorflow/contrib/boosted_trees:init_py", "//tensorflow/contrib/cloud:cloud_py", + "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip", "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", "//tensorflow/contrib/coder:coder_ops_py", "//tensorflow/contrib/compiler:compiler_py", @@ -49,6 +50,7 @@ py_library( "//tensorflow/contrib/image:single_image_random_dot_stereograms_py", "//tensorflow/contrib/input_pipeline:input_pipeline_py", "//tensorflow/contrib/integrate:integrate_py", + "//tensorflow/contrib/kafka", "//tensorflow/contrib/keras", "//tensorflow/contrib/kernel_methods", "//tensorflow/contrib/kfac", @@ -77,6 +79,7 @@ py_library( "//tensorflow/contrib/predictor", "//tensorflow/contrib/quantization:quantization_py", "//tensorflow/contrib/quantize:quantize_graph", + "//tensorflow/contrib/py2tf", "//tensorflow/contrib/receptive_field:receptive_field_py", "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py", "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py", @@ -143,6 +146,7 @@ cc_library( "//tensorflow/contrib/factorization:all_ops", "//tensorflow/contrib/framework:all_ops", "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib", + "//tensorflow/contrib/kafka:kafka_ops_op_lib", "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib", "//tensorflow/contrib/nccl:nccl_ops_op_lib", "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib", diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java index dc5b9fb8874..e51e3f747b5 100644 --- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java +++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java @@ -194,6 +194,13 @@ public class TensorFlowInferenceInterface { * @param outputNames A list of output nodes which should be filled by the inference pass. */ public void run(String[] outputNames, boolean enableStats) { + run(outputNames, enableStats, new String[] {}); + } + + /** + * An overloaded version of runInference that allows supplying targetNodeNames as well + */ + public void run(String[] outputNames, boolean enableStats, String[] targetNodeNames) { // Release any Tensors from the previous run calls. closeFetches(); @@ -204,6 +211,11 @@ public class TensorFlowInferenceInterface { runner.fetch(tid.name, tid.outputIndex); } + // Add targets. + for (String t : targetNodeNames) { + runner.addTarget(t); + } + // Run the session. 
try { if (enableStats) { diff --git a/tensorflow/contrib/android/jni/run_stats_jni.cc b/tensorflow/contrib/android/jni/run_stats_jni.cc index 119fa9cd2c3..707853b59be 100644 --- a/tensorflow/contrib/android/jni/run_stats_jni.cc +++ b/tensorflow/contrib/android/jni/run_stats_jni.cc @@ -21,8 +21,8 @@ limitations under the License. #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/util/stat_summarizer.h" -using tensorflow::StatSummarizer; using tensorflow::RunMetadata; +using tensorflow::StatSummarizer; namespace { StatSummarizer* requireHandle(JNIEnv* env, jlong handle) { diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD index 11c3c037c4e..6e0f0a05726 100644 --- a/tensorflow/contrib/bayesflow/BUILD +++ b/tensorflow/contrib/bayesflow/BUILD @@ -217,6 +217,7 @@ cuda_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:random_seed", ], + tags = ["notsan"], ) cuda_py_test( diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py index cbc66b6dc13..d244d2f4f53 100644 --- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py +++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py @@ -19,29 +19,36 @@ from __future__ import division from __future__ import print_function import numpy as np -from scipy import special from scipy import stats from tensorflow.contrib.bayesflow.python.ops import hmc +from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _compute_energy_change +from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _leapfrog_integrator +from tensorflow.contrib.distributions.python.ops import independent as independent_lib +from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gradients_impl +from tensorflow.python.ops import gradients_impl as gradients_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops.distributions import gamma as gamma_lib +from tensorflow.python.ops.distributions import normal as normal_lib from tensorflow.python.platform import test -from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.platform import tf_logging as logging_ops + + +def _reduce_variance(x, axis=None, keepdims=False): + sample_mean = math_ops.reduce_mean(x, axis, keepdims=True) + return math_ops.reduce_mean( + math_ops.squared_difference(x, sample_mean), axis, keepdims) -# TODO(b/66964210): Test float16. class HMCTest(test.TestCase): def setUp(self): self._shape_param = 5. self._rate_param = 10. - self._expected_x = (special.digamma(self._shape_param) - - np.log(self._rate_param)) - self._expected_exp_x = self._shape_param / self._rate_param random_seed.set_random_seed(10003) np.random.seed(10003) @@ -63,63 +70,46 @@ class HMCTest(test.TestCase): self._rate_param * math_ops.exp(x), event_dims) - def _log_gamma_log_prob_grad(self, x, event_dims=()): - """Computes log-pdf and gradient of a log-gamma random variable. - - Args: - x: Value of the random variable. - event_dims: Dimensions not to treat as independent. Default is (), - i.e., all dimensions are independent. - - Returns: - log_prob: The log-pdf up to a normalizing constant. - grad: The gradient of the log-pdf with respect to x. 
- """ - return (math_ops.reduce_sum(self._shape_param * x - - self._rate_param * math_ops.exp(x), - event_dims), - self._shape_param - self._rate_param * math_ops.exp(x)) - - def _n_event_dims(self, x_shape, event_dims): - return np.prod([int(x_shape[i]) for i in event_dims]) - - def _integrator_conserves_energy(self, x, event_dims, sess, + def _integrator_conserves_energy(self, x, independent_chain_ndims, sess, feed_dict=None): - def potential_and_grad(x): - log_prob, grad = self._log_gamma_log_prob_grad(x, event_dims) - return -log_prob, -grad - - step_size = array_ops.placeholder(np.float32, [], name='step_size') - hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps') + step_size = array_ops.placeholder(np.float32, [], name="step_size") + hmc_lf_steps = array_ops.placeholder(np.int32, [], name="hmc_lf_steps") if feed_dict is None: feed_dict = {} feed_dict[hmc_lf_steps] = 1000 + event_dims = math_ops.range(independent_chain_ndims, + array_ops.rank(x)) + m = random_ops.random_normal(array_ops.shape(x)) - potential_0, grad_0 = potential_and_grad(x) - old_energy = potential_0 + 0.5 * math_ops.reduce_sum(m * m, - event_dims) + log_prob_0 = self._log_gamma_log_prob(x, event_dims) + grad_0 = gradients_ops.gradients(log_prob_0, x) + old_energy = -log_prob_0 + 0.5 * math_ops.reduce_sum(m**2., event_dims) - _, new_m, potential_1, _ = ( - hmc.leapfrog_integrator(step_size, hmc_lf_steps, x, - m, potential_and_grad, grad_0)) + new_m, _, log_prob_1, _ = _leapfrog_integrator( + current_momentums=[m], + target_log_prob_fn=lambda x: self._log_gamma_log_prob(x, event_dims), + current_state_parts=[x], + step_sizes=[step_size], + num_leapfrog_steps=hmc_lf_steps, + current_target_log_prob=log_prob_0, + current_grads_target_log_prob=grad_0) + new_m = new_m[0] - new_energy = potential_1 + 0.5 * math_ops.reduce_sum(new_m * new_m, + new_energy = -log_prob_1 + 0.5 * math_ops.reduce_sum(new_m * new_m, event_dims) x_shape = sess.run(x, feed_dict).shape - n_event_dims = self._n_event_dims(x_shape, event_dims) - feed_dict[step_size] = 0.1 / n_event_dims - old_energy_val, new_energy_val = sess.run([old_energy, new_energy], - feed_dict) - logging.vlog(1, 'average energy change: {}'.format( - abs(old_energy_val - new_energy_val).mean())) + event_size = np.prod(x_shape[independent_chain_ndims:]) + feed_dict[step_size] = 0.1 / event_size + old_energy_, new_energy_ = sess.run([old_energy, new_energy], + feed_dict) + logging_ops.vlog(1, "average energy relative change: {}".format( + (1. - new_energy_ / old_energy_).mean())) + self.assertAllClose(old_energy_, new_energy_, atol=0., rtol=0.02) - self.assertAllEqual(np.ones_like(new_energy_val, dtype=np.bool), - abs(old_energy_val - new_energy_val) < 1.) - - def _integrator_conserves_energy_wrapper(self, event_dims): + def _integrator_conserves_energy_wrapper(self, independent_chain_ndims): """Tests the long-term energy conservation of the leapfrog integrator. The leapfrog integrator is symplectic, so for sufficiently small step @@ -127,135 +117,167 @@ class HMCTest(test.TestCase): the energy of the system blowing up or collapsing. Args: - event_dims: A tuple of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. + independent_chain_ndims: Python `int` scalar representing the number of + dims associated with independent chains. 
""" with self.test_session() as sess: - x_ph = array_ops.placeholder(np.float32, name='x_ph') - - feed_dict = {x_ph: np.zeros([50, 10, 2])} - self._integrator_conserves_energy(x_ph, event_dims, sess, feed_dict) + x_ph = array_ops.placeholder(np.float32, name="x_ph") + feed_dict = {x_ph: np.random.rand(50, 10, 2)} + self._integrator_conserves_energy(x_ph, independent_chain_ndims, + sess, feed_dict) def testIntegratorEnergyConservationNullShape(self): - self._integrator_conserves_energy_wrapper([]) + self._integrator_conserves_energy_wrapper(0) def testIntegratorEnergyConservation1(self): - self._integrator_conserves_energy_wrapper([1]) + self._integrator_conserves_energy_wrapper(1) def testIntegratorEnergyConservation2(self): - self._integrator_conserves_energy_wrapper([2]) + self._integrator_conserves_energy_wrapper(2) - def testIntegratorEnergyConservation12(self): - self._integrator_conserves_energy_wrapper([1, 2]) + def testIntegratorEnergyConservation3(self): + self._integrator_conserves_energy_wrapper(3) - def testIntegratorEnergyConservation012(self): - self._integrator_conserves_energy_wrapper([0, 1, 2]) - - def _chain_gets_correct_expectations(self, x, event_dims, sess, - feed_dict=None): + def _chain_gets_correct_expectations(self, x, independent_chain_ndims, + sess, feed_dict=None): def log_gamma_log_prob(x): + event_dims = math_ops.range(independent_chain_ndims, + array_ops.rank(x)) return self._log_gamma_log_prob(x, event_dims) - step_size = array_ops.placeholder(np.float32, [], name='step_size') - hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps') - hmc_n_steps = array_ops.placeholder(np.int32, [], name='hmc_n_steps') + num_results = array_ops.placeholder( + np.int32, [], name="num_results") + step_size = array_ops.placeholder( + np.float32, [], name="step_size") + num_leapfrog_steps = array_ops.placeholder( + np.int32, [], name="num_leapfrog_steps") if feed_dict is None: feed_dict = {} - feed_dict.update({step_size: 0.1, - hmc_lf_steps: 2, - hmc_n_steps: 300}) + feed_dict.update({num_results: 150, + step_size: 0.1, + num_leapfrog_steps: 2}) - sample_chain, acceptance_prob_chain = hmc.chain([hmc_n_steps], - step_size, - hmc_lf_steps, - x, log_gamma_log_prob, - event_dims) + samples, kernel_results = hmc.sample_chain( + num_results=num_results, + target_log_prob_fn=log_gamma_log_prob, + current_state=x, + step_size=step_size, + num_leapfrog_steps=num_leapfrog_steps, + num_burnin_steps=150, + seed=42) - acceptance_probs, samples = sess.run([acceptance_prob_chain, sample_chain], - feed_dict) - samples = samples[feed_dict[hmc_n_steps] // 2:] - expected_x_est = samples.mean() - expected_exp_x_est = np.exp(samples).mean() + expected_x = (math_ops.digamma(self._shape_param) + - np.log(self._rate_param)) - logging.vlog(1, 'True E[x, exp(x)]: {}\t{}'.format( - self._expected_x, self._expected_exp_x)) - logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format( - expected_x_est, expected_exp_x_est)) - self.assertNear(expected_x_est, self._expected_x, 2e-2) - self.assertNear(expected_exp_x_est, self._expected_exp_x, 2e-2) - self.assertTrue((acceptance_probs > 0.5).all()) - self.assertTrue((acceptance_probs <= 1.0).all()) + expected_exp_x = self._shape_param / self._rate_param - def _chain_gets_correct_expectations_wrapper(self, event_dims): + acceptance_probs_, samples_, expected_x_ = sess.run( + [kernel_results.acceptance_probs, samples, expected_x], + feed_dict) + + actual_x = samples_.mean() + actual_exp_x = np.exp(samples_).mean() + + logging_ops.vlog(1, "True E[x, 
exp(x)]: {}\t{}".format( + expected_x_, expected_exp_x)) + logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format( + actual_x, actual_exp_x)) + self.assertNear(actual_x, expected_x_, 2e-2) + self.assertNear(actual_exp_x, expected_exp_x, 2e-2) + self.assertTrue((acceptance_probs_ > 0.5).all()) + self.assertTrue((acceptance_probs_ <= 1.0).all()) + + def _chain_gets_correct_expectations_wrapper(self, independent_chain_ndims): with self.test_session() as sess: - x_ph = array_ops.placeholder(np.float32, name='x_ph') - - feed_dict = {x_ph: np.zeros([50, 10, 2])} - self._chain_gets_correct_expectations(x_ph, event_dims, sess, - feed_dict) + x_ph = array_ops.placeholder(np.float32, name="x_ph") + feed_dict = {x_ph: np.random.rand(50, 10, 2)} + self._chain_gets_correct_expectations(x_ph, independent_chain_ndims, + sess, feed_dict) def testHMCChainExpectationsNullShape(self): - self._chain_gets_correct_expectations_wrapper([]) + self._chain_gets_correct_expectations_wrapper(0) def testHMCChainExpectations1(self): - self._chain_gets_correct_expectations_wrapper([1]) + self._chain_gets_correct_expectations_wrapper(1) def testHMCChainExpectations2(self): - self._chain_gets_correct_expectations_wrapper([2]) + self._chain_gets_correct_expectations_wrapper(2) - def testHMCChainExpectations12(self): - self._chain_gets_correct_expectations_wrapper([1, 2]) - - def _kernel_leaves_target_invariant(self, initial_draws, event_dims, + def _kernel_leaves_target_invariant(self, initial_draws, + independent_chain_ndims, sess, feed_dict=None): def log_gamma_log_prob(x): + event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x)) return self._log_gamma_log_prob(x, event_dims) def fake_log_prob(x): """Cooled version of the target distribution.""" return 1.1 * log_gamma_log_prob(x) - step_size = array_ops.placeholder(np.float32, [], name='step_size') + step_size = array_ops.placeholder(np.float32, [], name="step_size") if feed_dict is None: feed_dict = {} feed_dict[step_size] = 0.4 - sample, acceptance_probs, _, _ = hmc.kernel(step_size, 5, initial_draws, - log_gamma_log_prob, event_dims) - bad_sample, bad_acceptance_probs, _, _ = hmc.kernel( - step_size, 5, initial_draws, fake_log_prob, event_dims) - (acceptance_probs_val, bad_acceptance_probs_val, initial_draws_val, - updated_draws_val, fake_draws_val) = sess.run([acceptance_probs, - bad_acceptance_probs, - initial_draws, sample, - bad_sample], feed_dict) + sample, kernel_results = hmc.kernel( + target_log_prob_fn=log_gamma_log_prob, + current_state=initial_draws, + step_size=step_size, + num_leapfrog_steps=5, + seed=43) + + bad_sample, bad_kernel_results = hmc.kernel( + target_log_prob_fn=fake_log_prob, + current_state=initial_draws, + step_size=step_size, + num_leapfrog_steps=5, + seed=44) + + [ + acceptance_probs_, + bad_acceptance_probs_, + initial_draws_, + updated_draws_, + fake_draws_, + ] = sess.run([ + kernel_results.acceptance_probs, + bad_kernel_results.acceptance_probs, + initial_draws, + sample, + bad_sample, + ], feed_dict) + # Confirm step size is small enough that we usually accept. - self.assertGreater(acceptance_probs_val.mean(), 0.5) - self.assertGreater(bad_acceptance_probs_val.mean(), 0.5) + self.assertGreater(acceptance_probs_.mean(), 0.5) + self.assertGreater(bad_acceptance_probs_.mean(), 0.5) + # Confirm step size is large enough that we sometimes reject. 
- self.assertLess(acceptance_probs_val.mean(), 0.99) - self.assertLess(bad_acceptance_probs_val.mean(), 0.99) - _, ks_p_value_true = stats.ks_2samp(initial_draws_val.flatten(), - updated_draws_val.flatten()) - _, ks_p_value_fake = stats.ks_2samp(initial_draws_val.flatten(), - fake_draws_val.flatten()) - logging.vlog(1, 'acceptance rate for true target: {}'.format( - acceptance_probs_val.mean())) - logging.vlog(1, 'acceptance rate for fake target: {}'.format( - bad_acceptance_probs_val.mean())) - logging.vlog(1, 'K-S p-value for true target: {}'.format(ks_p_value_true)) - logging.vlog(1, 'K-S p-value for fake target: {}'.format(ks_p_value_fake)) + self.assertLess(acceptance_probs_.mean(), 0.99) + self.assertLess(bad_acceptance_probs_.mean(), 0.99) + + _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(), + updated_draws_.flatten()) + _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(), + fake_draws_.flatten()) + + logging_ops.vlog(1, "acceptance rate for true target: {}".format( + acceptance_probs_.mean())) + logging_ops.vlog(1, "acceptance rate for fake target: {}".format( + bad_acceptance_probs_.mean())) + logging_ops.vlog(1, "K-S p-value for true target: {}".format( + ks_p_value_true)) + logging_ops.vlog(1, "K-S p-value for fake target: {}".format( + ks_p_value_fake)) # Make sure that the MCMC update hasn't changed the empirical CDF much. self.assertGreater(ks_p_value_true, 1e-3) # Confirm that targeting the wrong distribution does # significantly change the empirical CDF. self.assertLess(ks_p_value_fake, 1e-6) - def _kernel_leaves_target_invariant_wrapper(self, event_dims): + def _kernel_leaves_target_invariant_wrapper(self, independent_chain_ndims): """Tests that the kernel leaves the target distribution invariant. Draws some independent samples from the target distribution, @@ -267,86 +289,116 @@ class HMCTest(test.TestCase): does change the target distribution. (And that we can detect that.) Args: - event_dims: A tuple of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. + independent_chain_ndims: Python `int` scalar representing the number of + dims associated with independent chains. 
""" with self.test_session() as sess: initial_draws = np.log(np.random.gamma(self._shape_param, size=[50000, 2, 2])) initial_draws -= np.log(self._rate_param) - x_ph = array_ops.placeholder(np.float32, name='x_ph') + x_ph = array_ops.placeholder(np.float32, name="x_ph") feed_dict = {x_ph: initial_draws} - self._kernel_leaves_target_invariant(x_ph, event_dims, sess, - feed_dict) - - def testKernelLeavesTargetInvariantNullShape(self): - self._kernel_leaves_target_invariant_wrapper([]) + self._kernel_leaves_target_invariant(x_ph, independent_chain_ndims, + sess, feed_dict) def testKernelLeavesTargetInvariant1(self): - self._kernel_leaves_target_invariant_wrapper([1]) + self._kernel_leaves_target_invariant_wrapper(1) def testKernelLeavesTargetInvariant2(self): - self._kernel_leaves_target_invariant_wrapper([2]) + self._kernel_leaves_target_invariant_wrapper(2) - def testKernelLeavesTargetInvariant12(self): - self._kernel_leaves_target_invariant_wrapper([1, 2]) + def testKernelLeavesTargetInvariant3(self): + self._kernel_leaves_target_invariant_wrapper(3) - def _ais_gets_correct_log_normalizer(self, init, event_dims, sess, - feed_dict=None): + def _ais_gets_correct_log_normalizer(self, init, independent_chain_ndims, + sess, feed_dict=None): def proposal_log_prob(x): - return math_ops.reduce_sum(-0.5 * x * x - 0.5 * np.log(2*np.pi), - event_dims) + event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x)) + return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi), + axis=event_dims) def target_log_prob(x): + event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x)) return self._log_gamma_log_prob(x, event_dims) if feed_dict is None: feed_dict = {} - w, _, _ = hmc.ais_chain(200, 0.5, 2, init, target_log_prob, - proposal_log_prob, event_dims) + num_steps = 200 - w_val = sess.run(w, feed_dict) - init_shape = sess.run(init, feed_dict).shape - normalizer_multiplier = np.prod([init_shape[i] for i in event_dims]) + _, ais_weights, _ = hmc.sample_annealed_importance_chain( + proposal_log_prob_fn=proposal_log_prob, + num_steps=num_steps, + target_log_prob_fn=target_log_prob, + step_size=0.5, + current_state=init, + num_leapfrog_steps=2, + seed=45) - true_normalizer = -self._shape_param * np.log(self._rate_param) - true_normalizer += special.gammaln(self._shape_param) - true_normalizer *= normalizer_multiplier + event_shape = array_ops.shape(init)[independent_chain_ndims:] + event_size = math_ops.reduce_prod(event_shape) - n_weights = np.prod(w_val.shape) - normalized_w = np.exp(w_val - true_normalizer) - standard_error = np.std(normalized_w) / np.sqrt(n_weights) - logging.vlog(1, 'True normalizer {}, estimated {}, n_weights {}'.format( - true_normalizer, np.log(normalized_w.mean()) + true_normalizer, - n_weights)) - self.assertNear(normalized_w.mean(), 1.0, 4.0 * standard_error) + log_true_normalizer = ( + -self._shape_param * math_ops.log(self._rate_param) + + math_ops.lgamma(self._shape_param)) + log_true_normalizer *= math_ops.cast(event_size, log_true_normalizer.dtype) - def _ais_gets_correct_log_normalizer_wrapper(self, event_dims): + log_estimated_normalizer = (math_ops.reduce_logsumexp(ais_weights) + - np.log(num_steps)) + + ratio_estimate_true = math_ops.exp(ais_weights - log_true_normalizer) + ais_weights_size = array_ops.size(ais_weights) + standard_error = math_ops.sqrt( + _reduce_variance(ratio_estimate_true) + / math_ops.cast(ais_weights_size, ratio_estimate_true.dtype)) + + [ + ratio_estimate_true_, + log_true_normalizer_, + log_estimated_normalizer_, + 
standard_error_, + ais_weights_size_, + event_size_, + ] = sess.run([ + ratio_estimate_true, + log_true_normalizer, + log_estimated_normalizer, + standard_error, + ais_weights_size, + event_size, + ], feed_dict) + + logging_ops.vlog(1, " log_true_normalizer: {}\n" + " log_estimated_normalizer: {}\n" + " ais_weights_size: {}\n" + " event_size: {}\n".format( + log_true_normalizer_, + log_estimated_normalizer_, + ais_weights_size_, + event_size_)) + self.assertNear(ratio_estimate_true_.mean(), 1., 4. * standard_error_) + + def _ais_gets_correct_log_normalizer_wrapper(self, independent_chain_ndims): """Tests that AIS yields reasonable estimates of normalizers.""" with self.test_session() as sess: - x_ph = array_ops.placeholder(np.float32, name='x_ph') - + x_ph = array_ops.placeholder(np.float32, name="x_ph") initial_draws = np.random.normal(size=[30, 2, 1]) - feed_dict = {x_ph: initial_draws} - - self._ais_gets_correct_log_normalizer(x_ph, event_dims, sess, - feed_dict) - - def testAISNullShape(self): - self._ais_gets_correct_log_normalizer_wrapper([]) + self._ais_gets_correct_log_normalizer( + x_ph, + independent_chain_ndims, + sess, + feed_dict={x_ph: initial_draws}) def testAIS1(self): - self._ais_gets_correct_log_normalizer_wrapper([1]) + self._ais_gets_correct_log_normalizer_wrapper(1) def testAIS2(self): - self._ais_gets_correct_log_normalizer_wrapper([2]) + self._ais_gets_correct_log_normalizer_wrapper(2) - def testAIS12(self): - self._ais_gets_correct_log_normalizer_wrapper([1, 2]) + def testAIS3(self): + self._ais_gets_correct_log_normalizer_wrapper(3) def testNanRejection(self): """Tests that an update that yields NaN potentials gets rejected. @@ -359,24 +411,29 @@ class HMCTest(test.TestCase): """ def _unbounded_exponential_log_prob(x): """An exponential distribution with log-likelihood NaN for x < 0.""" - per_element_potentials = array_ops.where(x < 0, - np.nan * array_ops.ones_like(x), - -x) + per_element_potentials = array_ops.where( + x < 0., + array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)), + -x) return math_ops.reduce_sum(per_element_potentials) with self.test_session() as sess: initial_x = math_ops.linspace(0.01, 5, 10) - updated_x, acceptance_probs, _, _ = hmc.kernel( - 2., 5, initial_x, _unbounded_exponential_log_prob, [0]) - initial_x_val, updated_x_val, acceptance_probs_val = sess.run( - [initial_x, updated_x, acceptance_probs]) + updated_x, kernel_results = hmc.kernel( + target_log_prob_fn=_unbounded_exponential_log_prob, + current_state=initial_x, + step_size=2., + num_leapfrog_steps=5, + seed=46) + initial_x_, updated_x_, acceptance_probs_ = sess.run( + [initial_x, updated_x, kernel_results.acceptance_probs]) - logging.vlog(1, 'initial_x = {}'.format(initial_x_val)) - logging.vlog(1, 'updated_x = {}'.format(updated_x_val)) - logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val)) + logging_ops.vlog(1, "initial_x = {}".format(initial_x_)) + logging_ops.vlog(1, "updated_x = {}".format(updated_x_)) + logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_)) - self.assertAllEqual(initial_x_val, updated_x_val) - self.assertEqual(acceptance_probs_val, 0.) + self.assertAllEqual(initial_x_, updated_x_) + self.assertEqual(acceptance_probs_, 0.) 
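For reference (illustration only, not part of the patch): the closed-form log normalizer used in _ais_gets_correct_log_normalizer above follows from substituting u = exp(x) in the unnormalized density exp(shape*x - rate*exp(x)), which integrates to Gamma(shape) / rate^shape. A quick numerical check with the values from HMCTest.setUp:

```python
import numpy as np
from scipy import integrate, special

shape_param, rate_param = 5., 10.
log_z_closed_form = special.gammaln(shape_param) - shape_param * np.log(rate_param)
z_numeric, _ = integrate.quad(
    lambda x: np.exp(shape_param * x - rate_param * np.exp(x)), -20., 5.)
np.testing.assert_allclose(np.log(z_numeric), log_z_closed_form, rtol=1e-4)
```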
def testNanFromGradsDontPropagate(self): """Test that update with NaN gradients does not cause NaN in results.""" @@ -385,60 +442,195 @@ class HMCTest(test.TestCase): with self.test_session() as sess: initial_x = math_ops.linspace(0.01, 5, 10) - updated_x, acceptance_probs, new_log_prob, new_grad = hmc.kernel( - 2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0]) - initial_x_val, updated_x_val, acceptance_probs_val = sess.run( - [initial_x, updated_x, acceptance_probs]) + updated_x, kernel_results = hmc.kernel( + target_log_prob_fn=_nan_log_prob_with_nan_gradient, + current_state=initial_x, + step_size=2., + num_leapfrog_steps=5, + seed=47) + initial_x_, updated_x_, acceptance_probs_ = sess.run( + [initial_x, updated_x, kernel_results.acceptance_probs]) - logging.vlog(1, 'initial_x = {}'.format(initial_x_val)) - logging.vlog(1, 'updated_x = {}'.format(updated_x_val)) - logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val)) + logging_ops.vlog(1, "initial_x = {}".format(initial_x_)) + logging_ops.vlog(1, "updated_x = {}".format(updated_x_)) + logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_)) - self.assertAllEqual(initial_x_val, updated_x_val) - self.assertEqual(acceptance_probs_val, 0.) + self.assertAllEqual(initial_x_, updated_x_) + self.assertEqual(acceptance_probs_, 0.) self.assertAllFinite( - gradients_impl.gradients(updated_x, initial_x)[0].eval()) - self.assertTrue( - gradients_impl.gradients(new_grad, initial_x)[0] is None) + gradients_ops.gradients(updated_x, initial_x)[0].eval()) + self.assertAllEqual([True], [g is None for g in gradients_ops.gradients( + kernel_results.proposed_grads_target_log_prob, initial_x)]) + self.assertAllEqual([False], [g is None for g in gradients_ops.gradients( + kernel_results.proposed_grads_target_log_prob, + kernel_results.proposed_state)]) # Gradients of the acceptance probs and new log prob are not finite. - _ = new_log_prob # Prevent unused arg error. 
# self.assertAllFinite( - # gradients_impl.gradients(acceptance_probs, initial_x)[0].eval()) + # gradients_ops.gradients(acceptance_probs, initial_x)[0].eval()) # self.assertAllFinite( - # gradients_impl.gradients(new_log_prob, initial_x)[0].eval()) + # gradients_ops.gradients(new_log_prob, initial_x)[0].eval()) + + def _testChainWorksDtype(self, dtype): + states, kernel_results = hmc.sample_chain( + num_results=10, + target_log_prob_fn=lambda x: -math_ops.reduce_sum(x**2., axis=-1), + current_state=np.zeros(5).astype(dtype), + step_size=0.01, + num_leapfrog_steps=10, + seed=48) + with self.test_session() as sess: + states_, acceptance_probs_ = sess.run( + [states, kernel_results.acceptance_probs]) + self.assertEqual(dtype, states_.dtype) + self.assertEqual(dtype, acceptance_probs_.dtype) def testChainWorksIn64Bit(self): - def log_prob(x): - return - math_ops.reduce_sum(x * x, axis=-1) - states, acceptance_probs = hmc.chain( - n_iterations=10, - step_size=np.float64(0.01), - n_leapfrog_steps=10, - initial_x=np.zeros(5).astype(np.float64), - target_log_prob_fn=log_prob, - event_dims=[-1]) - with self.test_session() as sess: - states_, acceptance_probs_ = sess.run([states, acceptance_probs]) - self.assertEqual(np.float64, states_.dtype) - self.assertEqual(np.float64, acceptance_probs_.dtype) + self._testChainWorksDtype(np.float64) def testChainWorksIn16Bit(self): - def log_prob(x): - return - math_ops.reduce_sum(x * x, axis=-1) - states, acceptance_probs = hmc.chain( - n_iterations=10, - step_size=np.float16(0.01), - n_leapfrog_steps=10, - initial_x=np.zeros(5).astype(np.float16), - target_log_prob_fn=log_prob, - event_dims=[-1]) + self._testChainWorksDtype(np.float16) + + +class _EnergyComputationTest(object): + + def testHandlesNanFromPotential(self): with self.test_session() as sess: - states_, acceptance_probs_ = sess.run([states, acceptance_probs]) - self.assertEqual(np.float16, states_.dtype) - self.assertEqual(np.float16, acceptance_probs_.dtype) + x = [1, np.inf, -np.inf, np.nan] + target_log_prob, proposed_target_log_prob = [ + self.dtype(x.flatten()) for x in np.meshgrid(x, x)] + num_chains = len(target_log_prob) + dummy_momentums = [-1, 1] + momentums = [self.dtype([dummy_momentums] * num_chains)] + proposed_momentums = [self.dtype([dummy_momentums] * num_chains)] + + target_log_prob = ops.convert_to_tensor(target_log_prob) + momentums = [ops.convert_to_tensor(momentums[0])] + proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob) + proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])] + + energy = _compute_energy_change( + target_log_prob, + momentums, + proposed_target_log_prob, + proposed_momentums, + independent_chain_ndims=1) + grads = gradients_ops.gradients(energy, momentums) + + [actual_energy, grads_] = sess.run([energy, grads]) + + # Ensure energy is `inf` (note: that's positive inf) in weird cases and + # finite otherwise. + expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1)) + self.assertAllEqual(expected_energy, actual_energy) + + # Ensure gradient is finite. 
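The _EnergyComputationTest cases below this change check that a non-finite potential or kinetic term yields a +inf energy change, which is what forces the proposal to be rejected. A tiny sketch of that effect (illustration only; the helper is hypothetical, not the library's API):

```python
import numpy as np

def acceptance_prob(energy_change):
    # Treat any non-finite energy change as +inf so that exp(-inf) = 0 and the
    # Metropolis acceptance probability for the proposal becomes zero.
    energy_change = np.where(np.isfinite(energy_change), energy_change, np.inf)
    return np.minimum(1., np.exp(-energy_change))

print(acceptance_prob(np.array([0.1, np.nan, np.inf, -np.inf])))
# -> [0.9048  0.  0.  0.]
```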
+ self.assertAllEqual(np.ones_like(grads_).astype(np.bool), + np.isfinite(grads_)) + + def testHandlesNanFromKinetic(self): + with self.test_session() as sess: + x = [1, np.inf, -np.inf, np.nan] + momentums, proposed_momentums = [ + [np.reshape(self.dtype(x), [-1, 1])] + for x in np.meshgrid(x, x)] + num_chains = len(momentums[0]) + target_log_prob = np.ones(num_chains, self.dtype) + proposed_target_log_prob = np.ones(num_chains, self.dtype) + + target_log_prob = ops.convert_to_tensor(target_log_prob) + momentums = [ops.convert_to_tensor(momentums[0])] + proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob) + proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])] + + energy = _compute_energy_change( + target_log_prob, + momentums, + proposed_target_log_prob, + proposed_momentums, + independent_chain_ndims=1) + grads = gradients_ops.gradients(energy, momentums) + + [actual_energy, grads_] = sess.run([energy, grads]) + + # Ensure energy is `inf` (note: that's positive inf) in weird cases and + # finite otherwise. + expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1)) + self.assertAllEqual(expected_energy, actual_energy) + + # Ensure gradient is finite. + g = grads_[0].reshape([len(x), len(x)])[:, 0] + self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isfinite(g)) + + # The remaining gradients are nan because the momentum was itself nan or + # inf. + g = grads_[0].reshape([len(x), len(x)])[:, 1:] + self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isnan(g)) -if __name__ == '__main__': +class EnergyComputationTest16(test.TestCase, _EnergyComputationTest): + dtype = np.float16 + + +class EnergyComputationTest32(test.TestCase, _EnergyComputationTest): + dtype = np.float32 + + +class EnergyComputationTest64(test.TestCase, _EnergyComputationTest): + dtype = np.float64 + + +class _HMCHandlesLists(object): + + def testStateParts(self): + with self.test_session() as sess: + dist_x = normal_lib.Normal(loc=self.dtype(0), scale=self.dtype(1)) + dist_y = independent_lib.Independent( + gamma_lib.Gamma(concentration=self.dtype([1, 2]), + rate=self.dtype([0.5, 0.75])), + reinterpreted_batch_ndims=1) + def target_log_prob(x, y): + return dist_x.log_prob(x) + dist_y.log_prob(y) + x0 = [dist_x.sample(seed=1), dist_y.sample(seed=2)] + samples, _ = hmc.sample_chain( + num_results=int(2e3), + target_log_prob_fn=target_log_prob, + current_state=x0, + step_size=0.85, + num_leapfrog_steps=3, + num_burnin_steps=int(250), + seed=49) + actual_means = [math_ops.reduce_mean(s, axis=0) for s in samples] + actual_vars = [_reduce_variance(s, axis=0) for s in samples] + expected_means = [dist_x.mean(), dist_y.mean()] + expected_vars = [dist_x.variance(), dist_y.variance()] + [ + actual_means_, + actual_vars_, + expected_means_, + expected_vars_, + ] = sess.run([ + actual_means, + actual_vars, + expected_means, + expected_vars, + ]) + self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16) + self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.40) + + +class HMCHandlesLists16(_HMCHandlesLists, test.TestCase): + dtype = np.float16 + + +class HMCHandlesLists32(_HMCHandlesLists, test.TestCase): + dtype = np.float32 + + +class HMCHandlesLists64(_HMCHandlesLists, test.TestCase): + dtype = np.float64 + + +if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py index fdc12e3b214..d44fe6529a7 100644 --- 
a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py +++ b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py @@ -31,8 +31,7 @@ __all__ = [ ] -def custom_gradient(fx, gx, x, axis=(), - fx_gx_manually_stopped=False, +def custom_gradient(fx, gx, x, axis=(), fx_gx_manually_stopped=False, name=None): """Enables specifying a custom gradient. @@ -43,7 +42,8 @@ def custom_gradient(fx, gx, x, axis=(), h(x) = x * stop_gradient(g(x)) + stop_gradient(f(x) - x * g(x)) ``` - is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] = stop_gradient(g(x)).` + is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] = + stop_gradient(g(x)).` In addition to scalar-domain/scalar-range functions, this function also supports tensor-domain/scalar-range functions. However, in the latter case it diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc.py b/tensorflow/contrib/bayesflow/python/ops/hmc.py index 977d42fc16b..7fd5652c5c3 100644 --- a/tensorflow/contrib/bayesflow/python/ops/hmc.py +++ b/tensorflow/contrib/bayesflow/python/ops/hmc.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm. -""" +"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.""" from __future__ import absolute_import from __future__ import division @@ -24,11 +23,9 @@ from tensorflow.contrib.bayesflow.python.ops.hmc_impl import * # pylint: disabl from tensorflow.python.util import all_util _allowed_symbols = [ - 'chain', - 'kernel', - 'leapfrog_integrator', - 'leapfrog_step', - 'ais_chain' + "sample_chain", + "sample_annealed_importance_chain", + "kernel", ] all_util.remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py index 5685a942e98..f7a11c21d8a 100644 --- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py +++ b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py @@ -14,17 +14,16 @@ # ============================================================================== """Hamiltonian Monte Carlo, a gradient-based MCMC algorithm. 
-@@chain -@@update -@@leapfrog_integrator -@@leapfrog_step -@@ais_chain +@@sample_chain +@@sample_annealed_importance_chain +@@kernel """ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import numpy as np from tensorflow.python.framework import dtypes @@ -32,168 +31,292 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops -from tensorflow.python.ops import gradients_impl +from tensorflow.python.ops import gradients_impl as gradients_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops -from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.ops.distributions import util as distributions_util __all__ = [ - 'chain', - 'kernel', - 'leapfrog_integrator', - 'leapfrog_step', - 'ais_chain' + "sample_chain", + "sample_annealed_importance_chain", + "kernel", ] -def _make_potential_and_grad(target_log_prob_fn): - def potential_and_grad(x): - log_prob_result = -target_log_prob_fn(x) - grad_result = gradients_impl.gradients(math_ops.reduce_sum(log_prob_result), - x)[0] - return log_prob_result, grad_result - return potential_and_grad +KernelResults = collections.namedtuple( + "KernelResults", + [ + "acceptance_probs", + "current_grads_target_log_prob", # "Current result" means "accepted". + "current_target_log_prob", # "Current result" means "accepted". + "energy_change", + "is_accepted", + "proposed_grads_target_log_prob", + "proposed_state", + "proposed_target_log_prob", + "random_positive", + ]) -def chain(n_iterations, step_size, n_leapfrog_steps, initial_x, - target_log_prob_fn, event_dims=(), name=None): +def _make_dummy_kernel_results( + dummy_state, + dummy_target_log_prob, + dummy_grads_target_log_prob): + return KernelResults( + acceptance_probs=dummy_target_log_prob, + current_grads_target_log_prob=dummy_grads_target_log_prob, + current_target_log_prob=dummy_target_log_prob, + energy_change=dummy_target_log_prob, + is_accepted=array_ops.ones_like(dummy_target_log_prob, dtypes.bool), + proposed_grads_target_log_prob=dummy_grads_target_log_prob, + proposed_state=dummy_state, + proposed_target_log_prob=dummy_target_log_prob, + random_positive=dummy_target_log_prob, + ) + + +def sample_chain( + num_results, + target_log_prob_fn, + current_state, + step_size, + num_leapfrog_steps, + num_burnin_steps=0, + num_steps_between_results=0, + seed=None, + current_target_log_prob=None, + current_grads_target_log_prob=None, + name=None): """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains. - Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) - algorithm that takes a series of gradient-informed steps to produce - a Metropolis proposal. This function samples from an HMC Markov - chain whose initial state is `initial_x` and whose stationary - distribution has log-density `target_log_prob_fn()`. - - This function can update multiple chains in parallel. It assumes - that all dimensions of `initial_x` not specified in `event_dims` are - independent, and should therefore be updated independently. The - output of `target_log_prob_fn()` should sum log-probabilities across - all event dimensions. 
Slices along dimensions not in `event_dims` - may have different target distributions; this is up to + Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm + that takes a series of gradient-informed steps to produce a Metropolis + proposal. This function samples from an HMC Markov chain at `current_state` + and whose stationary distribution has log-unnormalized-density `target_log_prob_fn()`. - This function basically just wraps `hmc.kernel()` in a tf.scan() loop. + This function samples from multiple chains in parallel. It assumes that the + the leftmost dimensions of (each) `current_state` (part) index an independent + chain. The function `target_log_prob_fn()` sums log-probabilities across + event dimensions (i.e., current state (part) rightmost dimensions). Each + element of the output of `target_log_prob_fn()` represents the (possibly + unnormalized) log-probability of the joint distribution over (all) the current + state (parts). - Args: - n_iterations: Integer number of Markov chain updates to run. - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `initial_x`. Larger step sizes lead to faster progress, but - too-large step sizes make rejection exponentially more likely. - When possible, it's often helpful to match per-variable step - sizes to the standard deviations of the target distribution in - each variable. - n_leapfrog_steps: Integer number of steps to run the leapfrog - integrator for. Total progress per HMC step is roughly - proportional to step_size * n_leapfrog_steps. - initial_x: Tensor of initial state(s) of the Markov chain(s). - target_log_prob_fn: Python callable which takes an argument like `initial_x` - and returns its (possibly unnormalized) log-density under the target - distribution. - event_dims: List of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. - name: Python `str` name prefixed to Ops created by this function. + The `current_state` can be represented as a single `Tensor` or a `list` of + `Tensors` which collectively represent the current state. When specifying a + `list`, one must also specify a list of `step_size`s. - Returns: - acceptance_probs: Tensor with the acceptance probabilities for each - iteration. Has shape matching `target_log_prob_fn(initial_x)`. - chain_states: Tensor with the state of the Markov chain at each iteration. - Has shape `[n_iterations, initial_x.shape[0],...,initial_x.shape[-1]`. + Only one out of every `num_steps_between_samples + 1` steps is included in the + returned results. This "thinning" comes at a cost of reduced statistical + power, while reducing memory requirements and autocorrelation. For more + discussion see [1]. + + [1]: "Statistically efficient thinning of a Markov chain sampler." + Art B. Owen. April 2017. + http://statweb.stanford.edu/~owen/reports/bestthinning.pdf #### Examples: - ```python - # Sampling from a standard normal (note `log_joint()` is unnormalized): - def log_joint(x): - return tf.reduce_sum(-0.5 * tf.square(x)) - chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint, - event_dims=[0]) - # Discard first half of chain as warmup/burn-in - warmed_up = chain[500:] - mean_est = tf.reduce_mean(warmed_up, 0) - var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est) - ``` + ##### Sample from a diagonal-variance Gaussian. 
```python - # Sampling from a diagonal-variance Gaussian: - variances = tf.linspace(1., 3., 10) - def log_joint(x): - return tf.reduce_sum(-0.5 / variances * tf.square(x)) - chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint, - event_dims=[0]) - # Discard first half of chain as warmup/burn-in - warmed_up = chain[500:] - mean_est = tf.reduce_mean(warmed_up, 0) - var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est) + tfd = tf.contrib.distributions + + def make_likelihood(true_variances): + return tfd.MultivariateNormalDiag( + scale_diag=tf.sqrt(true_variances)) + + dims = 10 + dtype = np.float32 + true_variances = tf.linspace(dtype(1), dtype(3), dims) + likelihood = make_likelihood(true_variances) + + states, kernel_results = hmc.sample_chain( + num_results=1000, + target_log_prob_fn=likelihood.log_prob, + current_state=tf.zeros(dims), + step_size=0.5, + num_leapfrog_steps=2, + num_burnin_steps=500) + + # Compute sample stats. + sample_mean = tf.reduce_mean(states, axis=0) + sample_var = tf.reduce_mean( + tf.squared_difference(states, sample_mean), + axis=0) ``` + ##### Sampling from factor-analysis posteriors with known factors. + + I.e., + + ```none + for i=1..n: + w[i] ~ Normal(0, eye(d)) # prior + x[i] ~ Normal(loc=matmul(w[i], F)) # likelihood + ``` + + where `F` denotes factors. + ```python - # Sampling from factor-analysis posteriors with known factors W: - # mu[i, j] ~ Normal(0, 1) - # x[i] ~ Normal(matmul(mu[i], W), I) - def log_joint(mu, x, W): - prior = -0.5 * tf.reduce_sum(tf.square(mu), 1) - x_mean = tf.matmul(mu, W) - likelihood = -0.5 * tf.reduce_sum(tf.square(x - x_mean), 1) - return prior + likelihood - chain, acceptance_probs = hmc.chain(1000, 0.1, 2, - tf.zeros([x.shape[0], W.shape[0]]), - lambda mu: log_joint(mu, x, W), - event_dims=[1]) - # Discard first half of chain as warmup/burn-in - warmed_up = chain[500:] - mean_est = tf.reduce_mean(warmed_up, 0) - var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est) + tfd = tf.contrib.distributions + + def make_prior(dims, dtype): + return tfd.MultivariateNormalDiag( + loc=tf.zeros(dims, dtype)) + + def make_likelihood(weights, factors): + return tfd.MultivariateNormalDiag( + loc=tf.tensordot(weights, factors, axes=[[0], [-1]])) + + # Setup data. + num_weights = 10 + num_factors = 4 + num_chains = 100 + dtype = np.float32 + + prior = make_prior(num_weights, dtype) + weights = prior.sample(num_chains) + factors = np.random.randn(num_factors, num_weights).astype(dtype) + x = make_likelihood(weights, factors).sample(num_chains) + + def target_log_prob(w): + # Target joint is: `f(w) = p(w, x | factors)`. + return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x) + + # Get `num_results` samples from `num_chains` independent chains. + chains_states, kernels_results = hmc.sample_chain( + num_results=1000, + target_log_prob_fn=target_log_prob, + current_state=tf.zeros([num_chains, dims], dtype), + step_size=0.1, + num_leapfrog_steps=2, + num_burnin_steps=500) + + # Compute sample stats. + sample_mean = tf.reduce_mean(chains_states, axis=[0, 1]) + sample_var = tf.reduce_mean( + tf.squared_difference(chains_states, sample_mean), + axis=[0, 1]) ``` - ```python - # Sampling from the posterior of a Bayesian regression model.: + Args: + num_results: Integer number of Markov chain draws. 
+ target_log_prob_fn: Python callable which takes an argument like + `current_state` (or `*current_state` if it's a list) and returns its + (possibly unnormalized) log-density under the target distribution. + current_state: `Tensor` or Python `list` of `Tensor`s representing the + current state(s) of the Markov chain(s). The first `r` dimensions index + independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`. + step_size: `Tensor` or Python `list` of `Tensor`s representing the step size + for the leapfrog integrator. Must broadcast with the shape of + `current_state`. Larger step sizes lead to faster progress, but too-large + step sizes make rejection exponentially more likely. When possible, it's + often helpful to match per-variable step sizes to the standard deviations + of the target distribution in each variable. + num_leapfrog_steps: Integer number of steps to run the leapfrog integrator + for. Total progress per HMC step is roughly proportional to `step_size * + num_leapfrog_steps`. + num_burnin_steps: Integer number of chain steps to take before starting to + collect results. + Default value: 0 (i.e., no burn-in). + num_steps_between_results: Integer number of chain steps between collecting + a result. Only one out of every `num_steps_between_samples + 1` steps is + included in the returned results. This "thinning" comes at a cost of + reduced statistical power, while reducing memory requirements and + autocorrelation. For more discussion see [1]. + Default value: 0 (i.e., no subsampling). + seed: Python integer to seed the random number generator. + current_target_log_prob: (Optional) `Tensor` representing the value of + `target_log_prob_fn` at the `current_state`. The only reason to specify + this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). + current_grads_target_log_prob: (Optional) Python list of `Tensor`s + representing gradient of `target_log_prob` at the `current_state` and wrt + the `current_state`. Must have same shape as `current_state`. The only + reason to specify this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). + name: Python `str` name prefixed to Ops created by this function. + Default value: `None` (i.e., "hmc_sample_chain"). - # Run 100 chains in parallel, each with a different initialization. - initial_beta = tf.random_normal([100, x.shape[1]]) - chain, acceptance_probs = hmc.chain(1000, 0.1, 10, initial_beta, - log_joint_partial, event_dims=[1]) - # Discard first halves of chains as warmup/burn-in - warmed_up = chain[500:] - # Averaging across samples within a chain and across chains - mean_est = tf.reduce_mean(warmed_up, [0, 1]) - var_est = tf.reduce_mean(tf.square(warmed_up), [0, 1]) - tf.square(mean_est) - ``` + Returns: + accepted_states: Tensor or Python list of `Tensor`s representing the + state(s) of the Markov chain(s) at each result step. Has same shape as + input `current_state` but with a prepended `num_results`-size dimension. + kernel_results: `collections.namedtuple` of internal calculations used to + advance the chain. 
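The docstring above describes thinning via `num_steps_between_results`, but neither example exercises it. The sketch below is an editorial addition, not part of this change; the standard-normal target, step size, and seed are assumptions chosen purely for illustration.

```python
# Minimal sketch of burn-in plus thinning with the new `sample_chain`
# signature. The target, step size, and seed here are illustrative
# assumptions, not values taken from this diff.
import tensorflow as tf
from tensorflow.contrib.bayesflow.python.ops import hmc  # module touched by this diff

def target_log_prob(x):
  # Unnormalized standard-normal log-density, summed over the event dimension.
  return -0.5 * tf.reduce_sum(x**2., axis=-1)

states, kernel_results = hmc.sample_chain(
    num_results=500,
    target_log_prob_fn=target_log_prob,
    current_state=tf.zeros(4),
    step_size=0.5,
    num_leapfrog_steps=3,
    num_burnin_steps=200,           # discard warm-up iterations
    num_steps_between_results=4,    # keep 1 of every 5 post-burn-in steps
    seed=42)

with tf.Session() as sess:
  states_, acceptance_probs_ = sess.run(
      [states, kernel_results.acceptance_probs])
  print(states_.shape)              # (500, 4): `num_results` is prepended
  print(acceptance_probs_.mean())   # crude acceptance-rate diagnostic
```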
""" - with ops.name_scope(name, 'hmc_chain', [n_iterations, step_size, - n_leapfrog_steps, initial_x]): - initial_x = ops.convert_to_tensor(initial_x, name='initial_x') - non_event_shape = array_ops.shape(target_log_prob_fn(initial_x)) + with ops.name_scope( + name, "hmc_sample_chain", + [num_results, current_state, step_size, num_leapfrog_steps, + num_burnin_steps, num_steps_between_results, seed, + current_target_log_prob, current_grads_target_log_prob]): + with ops.name_scope("initialize"): + [ + current_state, + step_size, + current_target_log_prob, + current_grads_target_log_prob, + ] = _prepare_args( + target_log_prob_fn, current_state, step_size, + current_target_log_prob, current_grads_target_log_prob) + def _run_chain(num_steps, current_state, seed, kernel_results): + """Runs the chain(s) for `num_steps`.""" + def _loop_body(iter_, current_state, kernel_results): + return [iter_ + 1] + list(kernel( + target_log_prob_fn, + current_state, + step_size, + num_leapfrog_steps, + seed, + kernel_results.current_target_log_prob, + kernel_results.current_grads_target_log_prob)) + return control_flow_ops.while_loop( + cond=lambda iter_, *args: iter_ < num_steps, + body=_loop_body, + loop_vars=[0, current_state, kernel_results])[1:] # Lop-off "iter_". - def body(a, _): - updated_x, acceptance_probs, log_prob, grad = kernel( - step_size, n_leapfrog_steps, a[0], target_log_prob_fn, event_dims, - a[2], a[3]) - return updated_x, acceptance_probs, log_prob, grad + def _scan_body(args_list, _): + """Closure which implements `tf.scan` body.""" + current_state, kernel_results = args_list + return _run_chain(num_steps_between_results + 1, current_state, seed, + kernel_results) + + current_state, kernel_results = _run_chain( + num_burnin_steps, + current_state, + distributions_util.gen_new_seed( + seed, salt="hmc_sample_chain_burnin"), + _make_dummy_kernel_results( + current_state, + current_target_log_prob, + current_grads_target_log_prob)) - potential_and_grad = _make_potential_and_grad(target_log_prob_fn) - potential, grad = potential_and_grad(initial_x) return functional_ops.scan( - body, array_ops.zeros(n_iterations, dtype=initial_x.dtype), - (initial_x, - array_ops.zeros(non_event_shape, dtype=initial_x.dtype), - -potential, -grad))[:2] + fn=_scan_body, + elems=array_ops.zeros(num_results, dtype=dtypes.bool), # Dummy arg. + initializer=[current_state, kernel_results]) -def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x, - target_log_prob_fn, proposal_log_prob_fn, event_dims=(), - name=None): +def sample_annealed_importance_chain( + proposal_log_prob_fn, + num_steps, + target_log_prob_fn, + current_state, + step_size, + num_leapfrog_steps, + seed=None, + name=None): """Runs annealed importance sampling (AIS) to estimate normalizing constants. - This routine uses Hamiltonian Monte Carlo to sample from a series of + This function uses Hamiltonian Monte Carlo to sample from a series of distributions that slowly interpolates between an initial "proposal" - distribution + distribution: `exp(proposal_log_prob_fn(x) - proposal_log_normalizer)` - and the target distribution + and the target distribution: `exp(target_log_prob_fn(x) - target_log_normalizer)`, @@ -202,113 +325,183 @@ def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x, normalizing constants of the initial distribution and the target distribution: - E[exp(w)] = exp(target_log_normalizer - proposal_log_normalizer). - - Args: - n_iterations: Integer number of Markov chain updates to run. 
More - iterations means more expense, but smoother annealing between q - and p, which in turn means exponentially lower variance for the - normalizing constant estimator. - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `initial_x`. Larger step sizes lead to faster progress, but - too-large step sizes make rejection exponentially more likely. - When possible, it's often helpful to match per-variable step - sizes to the standard deviations of the target distribution in - each variable. - n_leapfrog_steps: Integer number of steps to run the leapfrog - integrator for. Total progress per HMC step is roughly - proportional to step_size * n_leapfrog_steps. - initial_x: Tensor of initial state(s) of the Markov chain(s). Must - be a sample from q, or results will be incorrect. - target_log_prob_fn: Python callable which takes an argument like `initial_x` - and returns its (possibly unnormalized) log-density under the target - distribution. - proposal_log_prob_fn: Python callable that returns the log density of the - initial distribution. - event_dims: List of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. - name: Python `str` name prefixed to Ops created by this function. - - Returns: - ais_weights: Tensor with the estimated weight(s). Has shape matching - `target_log_prob_fn(initial_x)`. - chain_states: Tensor with the state(s) of the Markov chain(s) the final - iteration. Has shape matching `initial_x`. - acceptance_probs: Tensor with the acceptance probabilities for the final - iteration. Has shape matching `target_log_prob_fn(initial_x)`. + `E[exp(ais_weights)] = exp(target_log_normalizer - proposal_log_normalizer)`. #### Examples: - ```python - # Estimating the normalizing constant of a log-gamma distribution: - def proposal_log_prob(x): - # Standard normal log-probability. This is properly normalized. - return tf.reduce_sum(-0.5 * tf.square(x) - 0.5 * np.log(2 * np.pi), 1) - def target_log_prob(x): - # Unnormalized log-gamma(2, 3) distribution. - # True normalizer is (lgamma(2) - 2 * log(3)) * x.shape[1] - return tf.reduce_sum(2. * x - 3. * tf.exp(x), 1) - # Run 100 AIS chains in parallel - initial_x = tf.random_normal([100, 20]) - w, _, _ = hmc.ais_chain(1000, 0.2, 2, initial_x, target_log_prob, - proposal_log_prob, event_dims=[1]) - log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100) - ``` + ##### Estimate the normalizing constant of a log-gamma distribution. ```python - # Estimating the marginal likelihood of a Bayesian regression model: - base_measure = -0.5 * np.log(2 * np.pi) - def proposal_log_prob(x): - # Standard normal log-probability. This is properly normalized. - return tf.reduce_sum(-0.5 * tf.square(x) + base_measure, 1) - def regression_log_joint(beta, x, y): - # This function returns a vector whose ith element is log p(beta[i], y | x). - # Each row of beta corresponds to the state of an independent Markov chain. 
- log_prior = tf.reduce_sum(-0.5 * tf.square(beta) + base_measure, 1) - means = tf.matmul(beta, x, transpose_b=True) - log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means) + - base_measure, 1) - return log_prior + log_likelihood - def log_joint_partial(beta): - return regression_log_joint(beta, x, y) + tfd = tf.contrib.distributions + # Run 100 AIS chains in parallel - initial_beta = tf.random_normal([100, x.shape[1]]) - w, beta_samples, _ = hmc.ais_chain(1000, 0.1, 2, initial_beta, - log_joint_partial, proposal_log_prob, - event_dims=[1]) - log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100) + num_chains = 100 + dims = 20 + dtype = np.float32 + + proposal = tfd.MultivatiateNormalDiag( + loc=tf.zeros([dims], dtype=dtype)) + + target = tfd.TransformedDistribution( + distribution=tfd.Gamma(concentration=dtype(2), + rate=dtype(3)), + bijector=tfd.bijectors.Invert(tfd.bijectors.Exp()), + event_shape=[dims]) + + chains_state, ais_weights, kernels_results = ( + hmc.sample_annealed_importance_chain( + proposal_log_prob_fn=proposal.log_prob, + num_steps=1000, + target_log_prob_fn=target.log_prob, + step_size=0.2, + current_state=proposal.sample(num_chains), + num_leapfrog_steps=2)) + + log_estimated_normalizer = (tf.reduce_logsumexp(ais_weights) + - np.log(num_chains)) + log_true_normalizer = tf.lgamma(2.) - 2. * tf.log(3.) ``` + + ##### Estimate marginal likelihood of a Bayesian regression model. + + ```python + tfd = tf.contrib.distributions + + def make_prior(dims, dtype): + return tfd.MultivariateNormalDiag( + loc=tf.zeros(dims, dtype)) + + def make_likelihood(weights, x): + return tfd.MultivariateNormalDiag( + loc=tf.tensordot(weights, x, axes=[[0], [-1]])) + + # Run 100 AIS chains in parallel + num_chains = 100 + dims = 10 + dtype = np.float32 + + # Make training data. + x = np.random.randn(num_chains, dims).astype(dtype) + true_weights = np.random.randn(dims).astype(dtype) + y = np.dot(x, true_weights) + np.random.randn(num_chains) + + # Setup model. + prior = make_prior(dims, dtype) + def target_log_prob_fn(weights): + return prior.log_prob(weights) + make_likelihood(weights, x).log_prob(y) + + proposal = tfd.MultivariateNormalDiag( + loc=tf.zeros(dims, dtype)) + + weight_samples, ais_weights, kernel_results = ( + hmc.sample_annealed_importance_chain( + num_steps=1000, + proposal_log_prob_fn=proposal.log_prob, + target_log_prob_fn=target_log_prob_fn + current_state=tf.zeros([num_chains, dims], dtype), + step_size=0.1, + num_leapfrog_steps=2)) + log_normalizer_estimate = (tf.reduce_logsumexp(ais_weights) + - np.log(num_chains)) + ``` + + Args: + proposal_log_prob_fn: Python callable that returns the log density of the + initial distribution. + num_steps: Integer number of Markov chain updates to run. More + iterations means more expense, but smoother annealing between q + and p, which in turn means exponentially lower variance for the + normalizing constant estimator. + target_log_prob_fn: Python callable which takes an argument like + `current_state` (or `*current_state` if it's a list) and returns its + (possibly unnormalized) log-density under the target distribution. + current_state: `Tensor` or Python `list` of `Tensor`s representing the + current state(s) of the Markov chain(s). The first `r` dimensions index + independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`. + step_size: `Tensor` or Python `list` of `Tensor`s representing the step size + for the leapfrog integrator. Must broadcast with the shape of + `current_state`. 
Larger step sizes lead to faster progress, but too-large + step sizes make rejection exponentially more likely. When possible, it's + often helpful to match per-variable step sizes to the standard deviations + of the target distribution in each variable. + num_leapfrog_steps: Integer number of steps to run the leapfrog integrator + for. Total progress per HMC step is roughly proportional to `step_size * + num_leapfrog_steps`. + seed: Python integer to seed the random number generator. + name: Python `str` name prefixed to Ops created by this function. + Default value: `None` (i.e., "hmc_sample_annealed_importance_chain"). + + Returns: + accepted_state: `Tensor` or Python list of `Tensor`s representing the + state(s) of the Markov chain(s) at the final iteration. Has same shape as + input `current_state`. + ais_weights: Tensor with the estimated weight(s). Has shape matching + `target_log_prob_fn(current_state)`. """ - with ops.name_scope(name, 'hmc_ais_chain', - [n_iterations, step_size, n_leapfrog_steps, initial_x]): - non_event_shape = array_ops.shape(target_log_prob_fn(initial_x)) + def make_convex_combined_log_prob_fn(iter_): + def _fn(*args): + p = proposal_log_prob_fn(*args) + t = target_log_prob_fn(*args) + dtype = p.dtype.base_dtype + beta = (math_ops.cast(iter_ + 1, dtype) + / math_ops.cast(num_steps, dtype)) + return (1. - beta) * p + beta * t + return _fn - beta_series = math_ops.linspace(0., 1., n_iterations+1)[1:] - def _body(a, beta): # pylint: disable=missing-docstring - def log_prob_beta(x): - return ((1 - beta) * proposal_log_prob_fn(x) + - beta * target_log_prob_fn(x)) - last_x = a[0] - w = a[2] - w += (1. / n_iterations) * (target_log_prob_fn(last_x) - - proposal_log_prob_fn(last_x)) - # TODO(b/66917083): There's an opportunity for gradient reuse here. 
- updated_x, acceptance_probs, _, _ = kernel(step_size, n_leapfrog_steps, - last_x, log_prob_beta, - event_dims) - return updated_x, acceptance_probs, w + with ops.name_scope( + name, "hmc_sample_annealed_importance_chain", + [num_steps, current_state, step_size, num_leapfrog_steps, seed]): + with ops.name_scope("initialize"): + [ + current_state, + step_size, + current_log_prob, + current_grads_log_prob, + ] = _prepare_args( + make_convex_combined_log_prob_fn(iter_=0), + current_state, + step_size, + description="convex_combined_log_prob") + def _loop_body(iter_, ais_weights, current_state, kernel_results): + """Closure which implements `tf.while_loop` body.""" + current_state_parts = (list(current_state) + if _is_list_like(current_state) + else [current_state]) + ais_weights += ((target_log_prob_fn(*current_state_parts) + - proposal_log_prob_fn(*current_state_parts)) + / math_ops.cast(num_steps, ais_weights.dtype)) + return [iter_ + 1, ais_weights] + list(kernel( + make_convex_combined_log_prob_fn(iter_), + current_state, + step_size, + num_leapfrog_steps, + seed, + kernel_results.current_target_log_prob, + kernel_results.current_grads_target_log_prob)) - x, acceptance_probs, w = functional_ops.scan( - _body, beta_series, - (initial_x, array_ops.zeros(non_event_shape, dtype=initial_x.dtype), - array_ops.zeros(non_event_shape, dtype=initial_x.dtype))) - return w[-1], x[-1], acceptance_probs[-1] + [ais_weights, current_state, kernel_results] = control_flow_ops.while_loop( + cond=lambda iter_, *args: iter_ < num_steps, + body=_loop_body, + loop_vars=[ + 0, # iter_ + array_ops.zeros_like(current_log_prob), # ais_weights + current_state, + _make_dummy_kernel_results(current_state, + current_log_prob, + current_grads_log_prob), + ])[1:] # Lop-off "iter_". + + return [current_state, ais_weights, kernel_results] -def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(), - x_log_prob=None, x_grad=None, name=None): +def kernel(target_log_prob_fn, + current_state, + step_size, + num_leapfrog_steps, + seed=None, + current_target_log_prob=None, + current_grads_target_log_prob=None, + name=None): """Runs one iteration of Hamiltonian Monte Carlo. Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) @@ -316,334 +509,625 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(), a Metropolis proposal. This function applies one step of HMC to randomly update the variable `x`. - This function can update multiple chains in parallel. It assumes - that all dimensions of `x` not specified in `event_dims` are - independent, and should therefore be updated independently. The - output of `target_log_prob_fn()` should sum log-probabilities across - all event dimensions. Slices along dimensions not in `event_dims` - may have different target distributions; for example, if - `event_dims == (1,)`, then `x[0, :]` could have a different target - distribution from x[1, :]. This is up to `target_log_prob_fn()`. - - Args: - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `x`. Larger step sizes lead to faster progress, but - too-large step sizes make rejection exponentially more likely. - When possible, it's often helpful to match per-variable step - sizes to the standard deviations of the target distribution in - each variable. - n_leapfrog_steps: Integer number of steps to run the leapfrog - integrator for. Total progress per HMC step is roughly - proportional to step_size * n_leapfrog_steps. 
- x: Tensor containing the value(s) of the random variable(s) to update. - target_log_prob_fn: Python callable which takes an argument like `initial_x` - and returns its (possibly unnormalized) log-density under the target - distribution. - event_dims: List of dimensions that should not be treated as - independent. This allows for multiple chains to be run independently - in parallel. Default is (), i.e., all dimensions are independent. - x_log_prob (optional): Tensor containing the cached output of a previous - call to `target_log_prob_fn()` evaluated at `x` (such as that provided by - a previous call to `kernel()`). Providing `x_log_prob` and - `x_grad` saves one gradient computation per call to `kernel()`. - x_grad (optional): Tensor containing the cached gradient of - `target_log_prob_fn()` evaluated at `x` (such as that provided by - a previous call to `kernel()`). Providing `x_log_prob` and - `x_grad` saves one gradient computation per call to `kernel()`. - name: Python `str` name prefixed to Ops created by this function. - - Returns: - updated_x: The updated variable(s) x. Has shape matching `initial_x`. - acceptance_probs: Tensor with the acceptance probabilities for the final - iteration. This is useful for diagnosing step size problems etc. Has - shape matching `target_log_prob_fn(initial_x)`. - new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`. - new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at - `updated_x`. + This function can update multiple chains in parallel. It assumes that all + leftmost dimensions of `current_state` index independent chain states (and are + therefore updated independently). The output of `target_log_prob_fn()` should + sum log-probabilities across all event dimensions. Slices along the rightmost + dimensions may have different target distributions; for example, + `current_state[0, :]` could have a different target distribution from + `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of + independent chains is `tf.size(target_log_prob_fn(*current_state))`.) #### Examples: + ##### Simple chain with warm-up. 
+ ```python + tfd = tf.contrib.distributions + # Tuning acceptance rates: + dtype = np.float32 target_accept_rate = 0.631 - def target_log_prob(x): - # Standard normal - return tf.reduce_sum(-0.5 * tf.square(x)) - initial_x = tf.zeros([10]) - initial_log_prob = target_log_prob(initial_x) - initial_grad = tf.gradients(initial_log_prob, initial_x)[0] - # Algorithm state - x = tf.Variable(initial_x, name='x') - step_size = tf.Variable(1., name='step_size') - last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob') - last_grad = tf.Variable(initial_grad, name='last_grad') - # Compute updates - new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x, - target_log_prob, - event_dims=[0], - x_log_prob=last_log_prob) - x_update = tf.assign(x, new_x) - log_prob_update = tf.assign(last_log_prob, log_prob) - grad_update = tf.assign(last_grad, grad) - step_size_update = tf.assign(step_size, - tf.where(acceptance_prob > target_accept_rate, - step_size * 1.01, step_size / 1.01)) - adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update] - sampling_updates = [x_update, log_prob_update, grad_update] + num_warmup_iter = 500 + num_chain_iter = 500 + + x = tf.get_variable(name="x", initializer=dtype(1)) + step_size = tf.get_variable(name="step_size", initializer=dtype(1)) + + target = tfd.Normal(loc=dtype(0), scale=dtype(1)) + + new_x, other_results = hmc.kernel( + target_log_prob_fn=target.log_prob, + current_state=x, + step_size=step_size, + num_leapfrog_steps=3)[:4] + + x_update = x.assign(new_x) + + step_size_update = step_size.assign_add( + step_size * tf.where( + other_results.acceptance_probs > target_accept_rate, + 0.01, -0.01)) + + warmup = tf.group([x_update, step_size_update]) + + tf.global_variables_initializer().run() + + sess.graph.finalize() # No more graph building. - sess = tf.Session() - sess.run(tf.global_variables_initializer()) # Warm up the sampler and adapt the step size - for i in xrange(500): - sess.run(adaptive_updates) + for _ in xrange(num_warmup_iter): + sess.run(warmup) + # Collect samples without adapting step size - samples = np.zeros([500, 10]) - for i in xrange(500): - x_val, _ = sess.run([new_x, sampling_updates]) - samples[i] = x_val + samples = np.zeros([num_chain_iter]) + for i in xrange(num_chain_iter): + _, x_, target_log_prob_, grad_ = sess.run([ + x_update, + x, + other_results.target_log_prob, + other_results.grads_target_log_prob]) + samples[i] = x_ + + print(samples.mean(), samples.std()) + ``` + + ##### Sample from more complicated posterior. 
+ + I.e., + + ```none + W ~ MVN(loc=0, scale=sigma * eye(dims)) + for i=1...num_samples: + X[i] ~ MVN(loc=0, scale=eye(dims)) + eps[i] ~ Normal(loc=0, scale=1) + Y[i] = X[i].T * W + eps[i] ``` ```python - # Empirical-Bayes estimation of a hyperparameter by MCMC-EM: + tfd = tf.contrib.distributions - # Problem setup - N = 150 - D = 10 - x = np.random.randn(N, D).astype(np.float32) - true_sigma = 0.5 - true_beta = true_sigma * np.random.randn(D).astype(np.float32) - y = x.dot(true_beta) + np.random.randn(N).astype(np.float32) + def make_training_data(num_samples, dims, sigma): + dt = np.asarray(sigma).dtype + zeros = tf.zeros(dims, dtype=dt) + x = tfd.MultivariateNormalDiag( + loc=zeros).sample(num_samples, seed=1) + w = tfd.MultivariateNormalDiag( + loc=zeros, + scale_identity_multiplier=sigma).sample(seed=2) + noise = tfd.Normal( + loc=dt(0), + scale=dt(1)).sample(num_samples, seed=3) + y = tf.tensordot(x, w, axes=[[1], [0]]) + noise + return y, x, w + + def make_prior(sigma, dims): + # p(w | sigma) + return tfd.MultivariateNormalDiag( + loc=tf.zeros([dims], dtype=sigma.dtype), + scale_identity_multiplier=sigma) + + def make_likelihood(x, w): + # p(y | x, w) + return tfd.MultivariateNormalDiag( + loc=tf.tensordot(x, w, axes=[[1], [0]])) + + # Setup assumptions. + dtype = np.float32 + num_samples = 150 + dims = 10 + num_iters = int(5e3) + + true_sigma = dtype(0.5) + y, x, true_weights = make_training_data(num_samples, dims, true_sigma) + + # Estimate of `log(true_sigma)`. + log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0)) + sigma = tf.exp(log_sigma) + + # State of the Markov chain. + weights = tf.get_variable( + name="weights", + initializer=np.random.randn(dims).astype(dtype)) + + prior = make_prior(sigma, dims) + + def joint_log_prob_fn(w): + # f(w) = log p(w, y | x) + return prior.log_prob(w) + make_likelihood(x, w).log_prob(y) + + weights_update = weights.assign( + hmc.kernel(target_log_prob_fn=joint_log_prob, + current_state=weights, + step_size=0.1, + num_leapfrog_steps=5)[0]) + + with tf.control_dependencies([weights_update]): + loss = -prior.log_prob(weights) - def log_prior(beta, log_sigma): - return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) - - log_sigma) - def regression_log_joint(beta, log_sigma, x, y): - # This function returns log p(beta | log_sigma) + log p(y | x, beta). 
- means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True) - means = tf.squeeze(means) - log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means)) - return log_prior(beta, log_sigma) + log_likelihood - def log_joint_partial(beta): - return regression_log_joint(beta, log_sigma, x, y) - # Our estimate of log(sigma) - log_sigma = tf.Variable(0., name='log_sigma') - # The state of the Markov chain - beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta') - new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial, - event_dims=[0]) - beta_update = tf.assign(beta, new_beta) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01) - with tf.control_dependencies([beta_update]): - log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma), - var_list=[log_sigma]) + log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma]) - sess = tf.Session() - sess.run(tf.global_variables_initializer()) - log_sigma_history = np.zeros(1000) - for i in xrange(1000): - log_sigma_val, _ = sess.run([log_sigma, log_sigma_update]) - log_sigma_history[i] = log_sigma_val - # Should converge to something close to true_sigma - plt.plot(np.exp(log_sigma_history)) + sess.graph.finalize() # No more graph building. + + tf.global_variables_initializer().run() + + sigma_history = np.zeros(num_iters, dtype) + weights_history = np.zeros([num_iters, dims], dtype) + + for i in xrange(num_iters): + _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights]) + weights_history[i, :] = weights_ + sigma_history[i] = sigma_ + + true_weights_ = sess.run(true_weights) + + # Should converge to something close to true_sigma. + plt.plot(sigma_history); + plt.ylabel("sigma"); + plt.xlabel("iteration"); ``` - """ - with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]): - potential_and_grad = _make_potential_and_grad(target_log_prob_fn) - x = ops.convert_to_tensor(x, name='x') - - x_shape = array_ops.shape(x) - m = random_ops.random_normal(x_shape, dtype=x.dtype) - - kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims) - - if (x_log_prob is not None) and (x_grad is not None): - log_potential_0, grad_0 = -x_log_prob, -x_grad # pylint: disable=invalid-unary-operand-type - else: - if x_log_prob is not None: - logging.warn('x_log_prob was provided, but x_grad was not,' - ' so x_log_prob was not used.') - if x_grad is not None: - logging.warn('x_grad was provided, but x_log_prob was not,' - ' so x_grad was not used.') - log_potential_0, grad_0 = potential_and_grad(x) - - new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator( - step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0) - - kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims) - - energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0 - # Treat NaN as infinite energy (and therefore guaranteed rejection). - energy_change = array_ops.where( - math_ops.is_nan(energy_change), - array_ops.fill(array_ops.shape(energy_change), - energy_change.dtype.as_numpy_dtype(np.inf)), - energy_change) - acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.)) - accepted = ( - random_ops.random_uniform( - array_ops.shape(acceptance_probs), dtype=x.dtype) - < acceptance_probs) - new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0) - - # TODO(b/65738010): This should work, but it doesn't for now. 
- # reduced_shape = math_ops.reduced_shape(x_shape, event_dims) - reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims, - keep_dims=True)) - accepted = array_ops.reshape(accepted, reduced_shape) - accepted = math_ops.logical_or( - accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool)) - new_x = array_ops.where(accepted, new_x, x) - new_grad = -array_ops.where(accepted, grad_1, grad_0) - - # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect - # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate). This - # should be fixed. - return new_x, acceptance_probs, new_log_prob, new_grad - - -def leapfrog_integrator(step_size, n_steps, initial_position, initial_momentum, - potential_and_grad, initial_grad, name=None): - """Applies `n_steps` steps of the leapfrog integrator. - - This just wraps `leapfrog_step()` in a `tf.while_loop()`, reusing - gradient computations where possible. Args: - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `initial_position`. Larger step sizes lead to faster progress, but - too-large step sizes lead to larger discretization error and - worse energy conservation. - n_steps: Number of steps to run the leapfrog integrator. - initial_position: Tensor containing the value(s) of the position variable(s) - to update. - initial_momentum: Tensor containing the value(s) of the momentum variable(s) - to update. - potential_and_grad: Python callable that takes a position tensor like - `initial_position` and returns the potential energy and its gradient at - that position. - initial_grad: Tensor with the value of the gradient of the potential energy - at `initial_position`. + target_log_prob_fn: Python callable which takes an argument like + `current_state` (or `*current_state` if it's a list) and returns its + (possibly unnormalized) log-density under the target distribution. + current_state: `Tensor` or Python `list` of `Tensor`s representing the + current state(s) of the Markov chain(s). The first `r` dimensions index + independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`. + step_size: `Tensor` or Python `list` of `Tensor`s representing the step size + for the leapfrog integrator. Must broadcast with the shape of + `current_state`. Larger step sizes lead to faster progress, but too-large + step sizes make rejection exponentially more likely. When possible, it's + often helpful to match per-variable step sizes to the standard deviations + of the target distribution in each variable. + num_leapfrog_steps: Integer number of steps to run the leapfrog integrator + for. Total progress per HMC step is roughly proportional to `step_size * + num_leapfrog_steps`. + seed: Python integer to seed the random number generator. + current_target_log_prob: (Optional) `Tensor` representing the value of + `target_log_prob_fn` at the `current_state`. The only reason to + specify this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). + current_grads_target_log_prob: (Optional) Python list of `Tensor`s + representing gradient of `current_target_log_prob` at the `current_state` + and wrt the `current_state`. Must have same shape as `current_state`. The + only reason to specify this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). name: Python `str` name prefixed to Ops created by this function. + Default value: `None` (i.e., "hmc_kernel"). 
Returns: - updated_position: Updated value of the position. - updated_momentum: Updated value of the momentum. - new_potential: Potential energy of the new position. Has shape matching - `potential_and_grad(initial_position)`. - new_grad: Gradient from potential_and_grad() evaluated at the new position. - Has shape matching `initial_position`. + accepted_state: Tensor or Python list of `Tensor`s representing the state(s) + of the Markov chain(s) at each result step. Has same shape as + `current_state`. + acceptance_probs: Tensor with the acceptance probabilities for each + iteration. Has shape matching `target_log_prob_fn(current_state)`. + accepted_target_log_prob: `Tensor` representing the value of + `target_log_prob_fn` at `accepted_state`. + accepted_grads_target_log_prob: Python `list` of `Tensor`s representing the + gradient of `accepted_target_log_prob` wrt each `accepted_state`. - Example: Simple quadratic potential. + Raises: + ValueError: if there isn't one `step_size` or a list with same length as + `current_state`. + """ + with ops.name_scope( + name, "hmc_kernel", + [current_state, step_size, num_leapfrog_steps, seed, + current_target_log_prob, current_grads_target_log_prob]): + with ops.name_scope("initialize"): + [current_state_parts, step_sizes, current_target_log_prob, + current_grads_target_log_prob] = _prepare_args( + target_log_prob_fn, current_state, step_size, + current_target_log_prob, current_grads_target_log_prob, + maybe_expand=True) + independent_chain_ndims = distributions_util.prefer_static_rank( + current_target_log_prob) + def init_momentum(s): + return random_ops.random_normal( + shape=array_ops.shape(s), + dtype=s.dtype.base_dtype, + seed=distributions_util.gen_new_seed( + seed, salt="hmc_kernel_momentums")) + current_momentums = [init_momentum(s) for s in current_state_parts] + + [ + proposed_momentums, + proposed_state_parts, + proposed_target_log_prob, + proposed_grads_target_log_prob, + ] = _leapfrog_integrator(current_momentums, + target_log_prob_fn, + current_state_parts, + step_sizes, + num_leapfrog_steps, + current_target_log_prob, + current_grads_target_log_prob) + + energy_change = _compute_energy_change(current_target_log_prob, + current_momentums, + proposed_target_log_prob, + proposed_momentums, + independent_chain_ndims) + + # u < exp(min(-energy, 0)), where u~Uniform[0,1) + # ==> -log(u) >= max(e, 0) + # ==> -log(u) >= e + # (Perhaps surprisingly, we don't have a better way to obtain a random + # uniform from positive reals, i.e., `tf.random_uniform(minval=0, + # maxval=np.inf)` won't work.) 
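As a side note on the comment above, a standalone NumPy check (an editorial sketch, not part of the diff) makes the equivalence concrete: comparing the exponential variate `-log(u)` against the energy change gives the same accept/reject decision as the usual `u < exp(min(-energy_change, 0))` test, and a NaN energy change fails the comparison, i.e. is rejected.

```python
# Editorial sketch: `-log(u) >= energy_change`, with u ~ Uniform[0, 1), makes
# the same decision as `u < exp(min(-energy_change, 0))`; the two can differ
# only on the measure-zero boundary u == exp(-energy_change).
import numpy as np

rng = np.random.RandomState(0)
u = rng.uniform(size=10000)
energy_change = 3. * rng.randn(10000)

accept_exponential = -np.log(u) >= energy_change
accept_metropolis = u < np.exp(np.minimum(-energy_change, 0.))
assert (accept_exponential == accept_metropolis).all()

# A NaN energy change fails the comparison, so it is always rejected.
assert not (-np.log(0.5) >= np.nan)
```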
+ random_uniform = random_ops.random_uniform( + shape=array_ops.shape(energy_change), + dtype=energy_change.dtype, + seed=seed) + random_positive = -math_ops.log(random_uniform) + is_accepted = random_positive >= energy_change + + accepted_target_log_prob = array_ops.where(is_accepted, + proposed_target_log_prob, + current_target_log_prob) + + accepted_state_parts = [_choose(is_accepted, + proposed_state_part, + current_state_part, + independent_chain_ndims) + for current_state_part, proposed_state_part + in zip(current_state_parts, proposed_state_parts)] + + accepted_grads_target_log_prob = [ + _choose(is_accepted, + proposed_grad, + grad, + independent_chain_ndims) + for proposed_grad, grad + in zip(proposed_grads_target_log_prob, current_grads_target_log_prob)] + + maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0] + return [ + maybe_flatten(accepted_state_parts), + KernelResults( + acceptance_probs=math_ops.exp(math_ops.minimum(-energy_change, 0.)), + current_grads_target_log_prob=accepted_grads_target_log_prob, + current_target_log_prob=accepted_target_log_prob, + energy_change=energy_change, + is_accepted=is_accepted, + proposed_grads_target_log_prob=proposed_grads_target_log_prob, + proposed_state=maybe_flatten(proposed_state_parts), + proposed_target_log_prob=proposed_target_log_prob, + random_positive=random_positive, + ), + ] + + +def _leapfrog_integrator(current_momentums, + target_log_prob_fn, + current_state_parts, + step_sizes, + num_leapfrog_steps, + current_target_log_prob=None, + current_grads_target_log_prob=None, + name=None): + """Applies `num_leapfrog_steps` of the leapfrog integrator. + + Assumes a simple quadratic kinetic energy function: `0.5 ||momentum||**2`. + + #### Examples: + + ##### Simple quadratic potential. ```python - def potential_and_grad(position): - return tf.reduce_sum(0.5 * tf.square(position)), position + tfd = tf.contrib.distributions + + dims = 10 + num_iter = int(1e3) + dtype = np.float32 + position = tf.placeholder(np.float32) momentum = tf.placeholder(np.float32) - potential, grad = potential_and_grad(position) - new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_integrator( - 0.1, 3, position, momentum, potential_and_grad, grad) - sess = tf.Session() - position_val = np.random.randn(10) - momentum_val = np.random.randn(10) - potential_val, grad_val = sess.run([potential, grad], - {position: position_val}) - positions = np.zeros([100, 10]) - for i in xrange(100): - position_val, momentum_val, potential_val, grad_val = sess.run( - [new_position, new_momentum, new_potential, new_grad], - {position: position_val, momentum: momentum_val}) - positions[i] = position_val - # Should trace out sinusoidal dynamics. - plt.plot(positions[:, 0]) + [ + new_momentums, + new_positions, + ] = hmc._leapfrog_integrator( + current_momentums=[momentum], + target_log_prob_fn=tfd.MultivariateNormalDiag( + loc=tf.zeros(dims, dtype)).log_prob, + current_state_parts=[position], + step_sizes=0.1, + num_leapfrog_steps=3)[:2] + + sess.graph.finalize() # No more graph building. + + momentum_ = np.random.randn(dims).astype(dtype) + position_ = np.random.randn(dims).astype(dtype) + + positions = np.zeros([num_iter, dims], dtype) + for i in xrange(num_iter): + position_, momentum_ = sess.run( + [new_momentums[0], new_position[0]], + feed_dict={position: position_, momentum: momentum_}) + positions[i] = position_ + + plt.plot(positions[:, 0]); # Sinusoidal. 
``` - """ - def leapfrog_wrapper(step_size, x, m, grad, l): - x, m, _, grad = leapfrog_step(step_size, x, m, potential_and_grad, grad) - return step_size, x, m, grad, l + 1 - - def counter_fn(a, b, c, d, counter): # pylint: disable=unused-argument - return counter < n_steps - - with ops.name_scope(name, 'leapfrog_integrator', - [step_size, n_steps, initial_position, initial_momentum, - initial_grad]): - _, new_x, new_m, new_grad, _ = control_flow_ops.while_loop( - counter_fn, leapfrog_wrapper, [step_size, initial_position, - initial_momentum, initial_grad, - array_ops.constant(0)], back_prop=False) - # We're counting on the runtime to eliminate this redundant computation. - new_potential, new_grad = potential_and_grad(new_x) - return new_x, new_m, new_potential, new_grad - - -def leapfrog_step(step_size, position, momentum, potential_and_grad, grad, - name=None): - """Applies one step of the leapfrog integrator. - - Assumes a simple quadratic kinetic energy function: 0.5 * ||momentum||^2. Args: - step_size: Scalar step size or array of step sizes for the - leapfrog integrator. Broadcasts to the shape of - `position`. Larger step sizes lead to faster progress, but - too-large step sizes lead to larger discretization error and - worse energy conservation. - position: Tensor containing the value(s) of the position variable(s) - to update. - momentum: Tensor containing the value(s) of the momentum variable(s) - to update. - potential_and_grad: Python callable that takes a position tensor like - `position` and returns the potential energy and its gradient at that - position. - grad: Tensor with the value of the gradient of the potential energy - at `position`. + current_momentums: Tensor containing the value(s) of the momentum + variable(s) to update. + target_log_prob_fn: Python callable which takes an argument like + `*current_state_parts` and returns its (possibly unnormalized) log-density + under the target distribution. + current_state_parts: Python `list` of `Tensor`s representing the current + state(s) of the Markov chain(s). The first `independent_chain_ndims` of + the `Tensor`(s) index different chains. + step_sizes: Python `list` of `Tensor`s representing the step size for the + leapfrog integrator. Must broadcast with the shape of + `current_state_parts`. Larger step sizes lead to faster progress, but + too-large step sizes make rejection exponentially more likely. When + possible, it's often helpful to match per-variable step sizes to the + standard deviations of the target distribution in each variable. + num_leapfrog_steps: Integer number of steps to run the leapfrog integrator + for. Total progress per HMC step is roughly proportional to `step_size * + num_leapfrog_steps`. + current_target_log_prob: (Optional) `Tensor` representing the value of + `target_log_prob_fn(*current_state_parts)`. The only reason to specify + this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). + current_grads_target_log_prob: (Optional) Python list of `Tensor`s + representing gradient of `target_log_prob_fn(*current_state_parts`) wrt + `current_state_parts`. Must have same shape as `current_state_parts`. The + only reason to specify this argument is to reduce TF graph size. + Default value: `None` (i.e., compute as needed). name: Python `str` name prefixed to Ops created by this function. + Default value: `None` (i.e., "hmc_leapfrog_integrator"). Returns: - updated_position: Updated value of the position. - updated_momentum: Updated value of the momentum. 
- new_potential: Potential energy of the new position. Has shape matching - `potential_and_grad(position)`. - new_grad: Gradient from potential_and_grad() evaluated at the new position. - Has shape matching `position`. + proposed_momentums: Updated value of the momentum. + proposed_state_parts: Tensor or Python list of `Tensor`s representing the + state(s) of the Markov chain(s) at each result step. Has same shape as + input `current_state_parts`. + proposed_target_log_prob: `Tensor` representing the value of + `target_log_prob_fn` at `accepted_state`. + proposed_grads_target_log_prob: Gradient of `proposed_target_log_prob` wrt + `accepted_state`. - Example: Simple quadratic potential. - - ```python - def potential_and_grad(position): - # Simple quadratic potential - return tf.reduce_sum(0.5 * tf.square(position)), position - position = tf.placeholder(np.float32) - momentum = tf.placeholder(np.float32) - potential, grad = potential_and_grad(position) - new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_step( - 0.1, position, momentum, potential_and_grad, grad) - - sess = tf.Session() - position_val = np.random.randn(10) - momentum_val = np.random.randn(10) - potential_val, grad_val = sess.run([potential, grad], - {position: position_val}) - positions = np.zeros([100, 10]) - for i in xrange(100): - position_val, momentum_val, potential_val, grad_val = sess.run( - [new_position, new_momentum, new_potential, new_grad], - {position: position_val, momentum: momentum_val}) - positions[i] = position_val - # Should trace out sinusoidal dynamics. - plt.plot(positions[:, 0]) - ``` + Raises: + ValueError: if `len(momentums) != len(state_parts)`. + ValueError: if `len(state_parts) != len(step_sizes)`. + ValueError: if `len(state_parts) != len(grads_target_log_prob)`. + TypeError: if `not target_log_prob.dtype.is_floating`. """ - with ops.name_scope(name, 'leapfrog_step', [step_size, position, momentum, - grad]): - momentum -= 0.5 * step_size * grad - position += step_size * momentum - potential, grad = potential_and_grad(position) - momentum -= 0.5 * step_size * grad + def _loop_body(step, + current_momentums, + current_state_parts, + ignore_current_target_log_prob, # pylint: disable=unused-argument + current_grads_target_log_prob): + return [step + 1] + list(_leapfrog_step(current_momentums, + target_log_prob_fn, + current_state_parts, + step_sizes, + current_grads_target_log_prob)) - return position, momentum, potential, grad + with ops.name_scope( + name, "hmc_leapfrog_integrator", + [current_momentums, current_state_parts, step_sizes, num_leapfrog_steps, + current_target_log_prob, current_grads_target_log_prob]): + if len(current_momentums) != len(current_state_parts): + raise ValueError("`momentums` must be in one-to-one correspondence " + "with `state_parts`") + num_leapfrog_steps = ops.convert_to_tensor(num_leapfrog_steps, + name="num_leapfrog_steps") + current_target_log_prob, current_grads_target_log_prob = ( + _maybe_call_fn_and_grads( + target_log_prob_fn, + current_state_parts, + current_target_log_prob, + current_grads_target_log_prob)) + return control_flow_ops.while_loop( + cond=lambda iter_, *args: iter_ < num_leapfrog_steps, + body=_loop_body, + loop_vars=[ + 0, # iter_ + current_momentums, + current_state_parts, + current_target_log_prob, + current_grads_target_log_prob, + ], + back_prop=False)[1:] # Lop-off "iter_". 
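`_leapfrog_integrator` above iterates `_leapfrog_step` (defined next), which applies the standard half-step momentum, full-step position, half-step momentum update using the gradient of the target log-density and the quadratic kinetic energy `0.5 ||momentum||**2` assumed in the docstring. The following is a minimal NumPy sketch of that scheme, not the TensorFlow implementation in this patch; `leapfrog` and `grad_log_prob` are illustrative names only.

```python
import numpy as np

def leapfrog(position, momentum, step_size, num_leapfrog_steps, grad_log_prob):
  """Runs `num_leapfrog_steps` leapfrog updates, mirroring `_leapfrog_step`."""
  grad = grad_log_prob(position)
  for _ in range(num_leapfrog_steps):
    momentum = momentum + 0.5 * step_size * grad   # half-step momentum
    position = position + step_size * momentum     # full-step position
    grad = grad_log_prob(position)
    momentum = momentum + 0.5 * step_size * grad   # half-step momentum
  return position, momentum

# Standard-normal target, as in the docstring example:
# log p(x) = -0.5 * ||x||**2 + const, so grad log p(x) = -x.
position = np.random.randn(10).astype(np.float32)
momentum = np.random.randn(10).astype(np.float32)
position, momentum = leapfrog(position, momentum, step_size=0.1,
                              num_leapfrog_steps=3, grad_log_prob=lambda x: -x)
```

For this quadratic target the resulting dynamics are simple harmonic motion, which is why the docstring example expects the plotted position coordinate to trace out a sinusoid.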
+ + +def _leapfrog_step(current_momentums, + target_log_prob_fn, + current_state_parts, + step_sizes, + current_grads_target_log_prob, + name=None): + """Applies one step of the leapfrog integrator.""" + with ops.name_scope( + name, "_leapfrog_step", + [current_momentums, current_state_parts, step_sizes, + current_grads_target_log_prob]): + proposed_momentums = [m + 0.5 * ss * g for m, ss, g + in zip(current_momentums, + step_sizes, + current_grads_target_log_prob)] + proposed_state_parts = [x + ss * m for x, ss, m + in zip(current_state_parts, + step_sizes, + proposed_momentums)] + proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts) + if not proposed_target_log_prob.dtype.is_floating: + raise TypeError("`target_log_prob_fn` must produce a `Tensor` " + "with `float` `dtype`.") + proposed_grads_target_log_prob = gradients_ops.gradients( + proposed_target_log_prob, proposed_state_parts) + if any(g is None for g in proposed_grads_target_log_prob): + raise ValueError( + "Encountered `None` gradient. Does your target `target_log_prob_fn` " + "access all `tf.Variable`s via `tf.get_variable`?\n" + " current_state_parts: {}\n" + " proposed_state_parts: {}\n" + " proposed_grads_target_log_prob: {}".format( + current_state_parts, + proposed_state_parts, + proposed_grads_target_log_prob)) + proposed_momentums = [m + 0.5 * ss * g for m, ss, g + in zip(proposed_momentums, + step_sizes, + proposed_grads_target_log_prob)] + return [ + proposed_momentums, + proposed_state_parts, + proposed_target_log_prob, + proposed_grads_target_log_prob, + ] + + +def _compute_energy_change(current_target_log_prob, + current_momentums, + proposed_target_log_prob, + proposed_momentums, + independent_chain_ndims, + name=None): + """Helper to `kernel` which computes the energy change.""" + with ops.name_scope( + name, "compute_energy_change", + ([current_target_log_prob, proposed_target_log_prob, + independent_chain_ndims] + + current_momentums + proposed_momentums)): + # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy + # since they're a mouthful and lets us inline more. + lk0, lk1 = [], [] + for current_momentum, proposed_momentum in zip(current_momentums, + proposed_momentums): + axis = math_ops.range(independent_chain_ndims, + array_ops.rank(current_momentum)) + lk0.append(_log_sum_sq(current_momentum, axis)) + lk1.append(_log_sum_sq(proposed_momentum, axis)) + + lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1), + axis=-1) + lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1), + axis=-1) + lp0 = -current_target_log_prob # log_potential + lp1 = -proposed_target_log_prob # proposed_log_potential + x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)], + axis=-1) + + # The sum is NaN if any element is NaN or we see both +Inf and -Inf. + # Thus we will replace such rows with infinite energy change which implies + # rejection. Recall that float-comparisons with NaN are always False. 
+ is_sum_determinate = ( + math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) & + math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1)) + is_sum_determinate = array_ops.tile( + is_sum_determinate[..., array_ops.newaxis], + multiples=array_ops.concat([ + array_ops.ones(array_ops.rank(is_sum_determinate), + dtype=dtypes.int32), + [4], + ], axis=0)) + x = array_ops.where(is_sum_determinate, + x, + array_ops.fill(array_ops.shape(x), + value=x.dtype.as_numpy_dtype(np.inf))) + + return math_ops.reduce_sum(x, axis=-1) + + +def _choose(is_accepted, + accepted, + rejected, + independent_chain_ndims, + name=None): + """Helper to `kernel` which expand_dims `is_accepted` to apply tf.where.""" + def _expand_is_accepted_like(x): + with ops.name_scope("_choose"): + expand_shape = array_ops.concat([ + array_ops.shape(is_accepted), + array_ops.ones([array_ops.rank(x) - array_ops.rank(is_accepted)], + dtype=dtypes.int32), + ], axis=0) + multiples = array_ops.concat([ + array_ops.ones([array_ops.rank(is_accepted)], dtype=dtypes.int32), + array_ops.shape(x)[independent_chain_ndims:], + ], axis=0) + m = array_ops.tile(array_ops.reshape(is_accepted, expand_shape), + multiples) + m.set_shape(x.shape) + return m + with ops.name_scope(name, "_choose", values=[ + is_accepted, accepted, rejected, independent_chain_ndims]): + return array_ops.where(_expand_is_accepted_like(accepted), + accepted, + rejected) + + +def _maybe_call_fn_and_grads(fn, + fn_arg_list, + fn_result=None, + grads_fn_result=None, + description="target_log_prob"): + """Helper which computes `fn_result` and `grads` if needed.""" + fn_arg_list = (list(fn_arg_list) if _is_list_like(fn_arg_list) + else [fn_arg_list]) + if fn_result is None: + fn_result = fn(*fn_arg_list) + if not fn_result.dtype.is_floating: + raise TypeError("`{}` must be a `Tensor` with `float` `dtype`.".format( + description)) + if grads_fn_result is None: + grads_fn_result = gradients_ops.gradients( + fn_result, fn_arg_list) + if len(fn_arg_list) != len(grads_fn_result): + raise ValueError("`{}` must be in one-to-one correspondence with " + "`grads_{}`".format(*[description]*2)) + if any(g is None for g in grads_fn_result): + raise ValueError("Encountered `None` gradient.") + return fn_result, grads_fn_result + + +def _prepare_args(target_log_prob_fn, state, step_size, + target_log_prob=None, grads_target_log_prob=None, + maybe_expand=False, description="target_log_prob"): + """Helper which processes input args to meet list-like assumptions.""" + state_parts = list(state) if _is_list_like(state) else [state] + state_parts = [ops.convert_to_tensor(s, name="state") + for s in state_parts] + target_log_prob, grads_target_log_prob = _maybe_call_fn_and_grads( + target_log_prob_fn, + state_parts, + target_log_prob, + grads_target_log_prob, + description) + step_sizes = list(step_size) if _is_list_like(step_size) else [step_size] + step_sizes = [ + ops.convert_to_tensor( + s, name="step_size", dtype=target_log_prob.dtype) + for s in step_sizes] + if len(step_sizes) == 1: + step_sizes *= len(state_parts) + if len(state_parts) != len(step_sizes): + raise ValueError("There should be exactly one `step_size` or it should " + "have same length as `current_state`.") + if maybe_expand: + maybe_flatten = lambda x: x + else: + maybe_flatten = lambda x: x if _is_list_like(state) else x[0] + return [ + maybe_flatten(state_parts), + maybe_flatten(step_sizes), + target_log_prob, + grads_target_log_prob, + ] + + +def _is_list_like(x): + """Helper which returns `True` if input is 
`list`-like.""" + return isinstance(x, (tuple, list)) + + +def _log_sum_sq(x, axis=None): + """Computes log(sum(x**2)).""" + return math_ops.reduce_logsumexp(2. * math_ops.log(math_ops.abs(x)), axis) diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc index 4b5d5ba0de6..754b7bc3270 100644 --- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc @@ -48,8 +48,9 @@ class CreateTreeEnsembleVariableOp : public OpKernel { if (!result->InitFromSerialized(tree_ensemble_config_t->scalar()(), stamp_token)) { result->Unref(); - OP_REQUIRES(context, false, errors::InvalidArgument( - "Unable to parse tree ensemble config.")); + OP_REQUIRES( + context, false, + errors::InvalidArgument("Unable to parse tree ensemble config.")); } // Only create one, if one does not exist already. Report status for all diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc index f8086b0c2bb..b3fe38614e0 100644 --- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc @@ -47,8 +47,8 @@ namespace boosted_trees { using boosted_trees::learner::LearnerConfig; using boosted_trees::learner::LearningRateConfig; using boosted_trees::learner::LearningRateDropoutDrivenConfig; -using boosted_trees::models::MultipleAdditiveTrees; using boosted_trees::models::DecisionTreeEnsembleResource; +using boosted_trees::models::MultipleAdditiveTrees; using boosted_trees::utils::DropoutUtils; using boosted_trees::utils::TensorUtils; diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc index 88f30064076..0f4c2298f56 100644 --- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc @@ -36,13 +36,12 @@ namespace tensorflow { using ::boosted_trees::QuantileConfig; -using boosted_trees::utils::TensorUtils; using boosted_trees::QuantileStreamResource; +using boosted_trees::utils::TensorUtils; namespace { const char* const kExampleWeightsName = "example_weights"; const char* const kMaxElementsName = "max_elements"; -const char* const kHandleName = "handle"; const char* const kNextStampTokenName = "next_stamp_token"; const char* const kStampTokenName = "stamp_token"; const char* const kAreBucketsReadyName = "are_buckets_ready"; @@ -52,7 +51,6 @@ const char* const kNumSparseFeaturesName = "num_sparse_features"; const char* const kSparseBucketsName = "sparse_buckets"; const char* const kSparseValuesName = "sparse_values"; const char* const kSparseIndicesName = "sparse_indices"; -const char* const kSparseStreamsStateName = "sparse_streams_state"; const char* const kSparseSummariesName = "sparse_summaries"; const char* const kSparseConfigName = "sparse_config"; const char* const kSparseOutputTensorName = "sparse_quantiles"; @@ -60,7 +58,6 @@ const char* const kSparseOutputTensorName = "sparse_quantiles"; const char* const kDenseBucketsName = "dense_buckets"; const char* const kDenseConfigName = "dense_config"; const char* const kDenseOutputTensorName = "dense_quantiles"; -const char* const kDenseStreamsStateName = "dense_streams_state"; const char* const kDenseSummariesName = "dense_summaries"; const char* const kDenseValuesName = "dense_values"; const char* const kNumDenseFeaturesName = "num_dense_features"; @@ -387,7 +384,7 @@ class 
MakeQuantileSummariesOp : public OpKernel { protobuf::Arena arena; ::boosted_trees::QuantileSummaryState* summary_proto = protobuf::Arena::CreateMessage< - ::boosted_trees::QuantileSummaryState>(&arena); + ::boosted_trees::QuantileSummaryState>(&arena); const auto& summary = stream.GetFinalSummary(); CopySummaryToProto(summary, summary_proto); // Output to tensor. diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc index 18b4abd654e..44a8ffaf4b2 100644 --- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc @@ -34,10 +34,10 @@ namespace tensorflow { +using boosted_trees::learner::LearnerConfig_MultiClassStrategy; using boosted_trees::learner::SplitInfo; using boosted_trees::learner::stochastic::GradientStats; using boosted_trees::learner::stochastic::NodeStats; -using boosted_trees::learner::LearnerConfig_MultiClassStrategy; namespace { const int32 DUMMY_FEATURE_DIMENSION = -1; @@ -47,9 +47,8 @@ class BaseBuildSplitOp : public OpKernel { public: explicit BaseBuildSplitOp(OpKernelConstruction* const context) : OpKernel(context) { - OP_REQUIRES_OK( - context, - context->GetAttr("feature_column_group_id", &feature_column_group_id_)); + OP_REQUIRES_OK(context, context->GetAttr("feature_column_group_id", + &feature_column_group_id_)); OP_REQUIRES_OK(context, context->GetAttr("l1_regularization", &l1_regularization_)); OP_REQUIRES_OK(context, diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc index a9a229c8ae0..90a0655201f 100644 --- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc @@ -134,10 +134,9 @@ void SerializeScalarAccumulatorToOutput( OpKernelContext* context) { int64 num_slots = accumulator_resource.values().size(); Tensor* partition_ids_t = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output("output_partition_ids", TensorShape({num_slots}), - &partition_ids_t)); + OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids", + TensorShape({num_slots}), + &partition_ids_t)); auto partition_ids = partition_ids_t->vec(); // Feature ids tensor has ids of feature columns and their dimensions. 
@@ -149,15 +148,14 @@ void SerializeScalarAccumulatorToOutput( Tensor* gradients_t = nullptr; OP_REQUIRES_OK( - context, - context->allocate_output("output_gradients", TensorShape({num_slots}), - &gradients_t)); + context, context->allocate_output( + "output_gradients", TensorShape({num_slots}), &gradients_t)); auto gradients = gradients_t->vec(); Tensor* hessians_t = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output( - "output_hessians", TensorShape({num_slots}), &hessians_t)); + OP_REQUIRES_OK( + context, context->allocate_output("output_hessians", + TensorShape({num_slots}), &hessians_t)); auto hessians = hessians_t->vec(); int i = 0; @@ -177,10 +175,9 @@ void SerializeTensorAccumulatorToOutput( OpKernelContext* context) { int64 num_slots = accumulator_resource.values().size(); Tensor* partition_ids_t = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output("output_partition_ids", TensorShape({num_slots}), - &partition_ids_t)); + OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids", + TensorShape({num_slots}), + &partition_ids_t)); auto partition_ids = partition_ids_t->vec(); Tensor* feature_ids_t = nullptr; @@ -202,9 +199,8 @@ void SerializeTensorAccumulatorToOutput( int64 num_hessian_elements = hessian_shape.num_elements(); hessian_shape.InsertDim(0, num_slots); Tensor* hessians_t = nullptr; - OP_REQUIRES_OK( - context, - context->allocate_output("output_hessians", hessian_shape, &hessians_t)); + OP_REQUIRES_OK(context, context->allocate_output("output_hessians", + hessian_shape, &hessians_t)); auto hessians = hessians_t->flat_outer_dims(); int i = 0; diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc index f867e77d3ef..8bca132acfd 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc @@ -17,8 +17,8 @@ #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/platform/test.h" -using tensorflow::test::AsTensor; using std::vector; +using tensorflow::test::AsTensor; namespace tensorflow { namespace boosted_trees { diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h index 1c4181f1b13..8ad97fedc92 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h @@ -15,9 +15,9 @@ #ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_ #define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_ +#include #include #include -#include #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h" #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h" diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc index cbe26ba918d..705b65e9db9 100644 --- a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc +++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc @@ -22,9 +22,9 @@ namespace tensorflow { namespace boosted_trees { namespace testutil { +using boosted_trees::trees::DenseFloatBinarySplit; using tensorflow::boosted_trees::trees::DecisionTreeConfig; using 
tensorflow::boosted_trees::trees::TreeNode; -using boosted_trees::trees::DenseFloatBinarySplit; namespace { diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc index 9de3e32b097..609519e8b11 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc @@ -25,8 +25,8 @@ namespace boosted_trees { namespace utils { namespace { -using test::AsTensor; using errors::InvalidArgument; +using test::AsTensor; class BatchFeaturesTest : public ::testing::Test {}; diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc index 38f0151255b..db34db998a7 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc @@ -23,10 +23,10 @@ #include "tensorflow/core/lib/random/simple_philox.h" #include "tensorflow/core/platform/logging.h" +using tensorflow::Status; using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig; using tensorflow::random::PhiloxRandom; using tensorflow::random::SimplePhilox; -using tensorflow::Status; namespace tensorflow { namespace boosted_trees { diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc index ce7632e5898..02f972c8e00 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc @@ -26,9 +26,9 @@ #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/env.h" +using std::unordered_set; using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig; using tensorflow::boosted_trees::trees::DecisionTreeEnsembleConfig; -using std::unordered_set; namespace tensorflow { namespace boosted_trees { diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc index bb57dcf8ae7..ae99d53a2cf 100644 --- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc +++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc @@ -19,8 +19,8 @@ namespace tensorflow { namespace boosted_trees { -using shape_inference::InferenceContext; using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; using shape_inference::ShapeHandle; REGISTER_RESOURCE_HANDLE_OP(QuantileStreamResource); diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc index 0d27ddaf3a1..5d0ebbf73ce 100644 --- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc +++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc @@ -18,9 +18,9 @@ namespace tensorflow { +using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; -using shape_inference::DimensionHandle; REGISTER_OP("BuildDenseInequalitySplits") .Attr("feature_column_group_id: int") diff --git a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc index 0354f7853cb..179505eef01 100644 --- a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc +++ b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc @@ -19,9 +19,9 @@ namespace tensorflow { namespace boosted_trees { +using 
shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; -using shape_inference::DimensionHandle; REGISTER_RESOURCE_HANDLE_OP(StatsAccumulatorScalarResource); diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py index eefa7ef0dcc..81f58de28cb 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py @@ -183,11 +183,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase): self.assertEqual(num_quantiles + 1, len(buckets)) self.assertAllEqual([2030, 2040, 2050, 2060], buckets) - def _testStreamingQuantileBucketsHelper(self, inputs): + def _testStreamingQuantileBucketsHelper( + self, inputs, num_quantiles=3, expected_buckets=None): """Helper to test quantile buckets on different inputs.""" - # Use 3 quantiles, 4 boundaries for simplicity. - num_quantiles = 3 # set generate_quantiles to True since the test will generate fewer # boundaries otherwise. with self.test_session() as sess: @@ -213,7 +212,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase): buckets, are_ready_flush = (sess.run( [buckets, are_ready_flush])) self.assertEqual(True, are_ready_flush) + # By default, use 3 quantiles, 4 boundaries for simplicity. self.assertEqual(num_quantiles + 1, len(buckets)) + if expected_buckets: + self.assertAllEqual(buckets, expected_buckets) def testStreamingQuantileBucketsRepeatedSingleValue(self): inputs = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] @@ -231,6 +233,28 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase): inputs = [5] self._testStreamingQuantileBucketsHelper(inputs) + def testStreamingQuantileBucketsEqualDistributionInSequence(self): + # Input pattern is of the form [1, 1, 1, 2, 2, 2, 3, 3, 3, ...] + ones = 100 * [1] + inputs = [] + for i in range(1, 101): + inputs += [i * k for k in ones] + # Expect 100 equally spaced buckets. + expected_buckets = range(1, 101) + self._testStreamingQuantileBucketsHelper( + inputs, num_quantiles=99, expected_buckets=expected_buckets) + + def testStreamingQuantileBucketsEqualDistributionInterleaved(self): + # Input pattern is of the form [1, 2, 3, 1, 2, 3, 1, 2, 3, ...] + sequence = range(1, 101) + inputs = [] + for _ in range(1, 101): + inputs += sequence + # Expect 100 equally spaced buckets. + expected_buckets = range(1, 101) + self._testStreamingQuantileBucketsHelper( + inputs, num_quantiles=99, expected_buckets=expected_buckets) + def testStreamingQuantileBuckets(self): """Sets up the quantile summary op test as follows. diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py index b281a4c6d1c..7a5f329b7ab 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py +++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py @@ -81,32 +81,32 @@ def _scheduled_stamp_resource_op_runner(batch, stamp): if not batch: return arg_keys = set(batch[0].args.keys()) - grouped_args = collections.defaultdict(list) + grouped_args = collections.OrderedDict() resource_handles = [] # Check that the set of arguments is the same across all the scheduled ops. 
for op in batch: if set(op.args.keys()) != arg_keys: raise ValueError("Mismatching arguments: %s, %s.", op.args, arg_keys) for key in arg_keys: - grouped_args[key].append(op.args[key]) + grouped_args.setdefault(key, []).append(op.args[key]) resource_handles.append(op.resource_handle) # Move all the inputs to the op device in one RPC. - grouped_args = { - k: _move_tensors(v, resource_handles[0].device) - for k, v in grouped_args.items() - } + grouped_args = collections.OrderedDict( + (k, _move_tensors(v, resource_handles[0].device)) + for k, v in sorted(grouped_args.items())) with ops.device(resource_handles[0].device): return batch[0].op(resource_handles, stamp, **grouped_args) def run_handler_scheduled_ops(per_handler_ops, stamp, worker_device): """Given a dictionary of ops for each handler, runs them in batch.""" - batched_ops = collections.defaultdict(list) + batched_ops = collections.OrderedDict() # Group the ops by their batching_key. Ops that share the same batching key # can be executed together. - for handler in sorted(per_handler_ops.keys()): + for handler in per_handler_ops.keys(): for op in per_handler_ops[handler]: - batched_ops[(op.batching_key(), op.batch_runner_fn())].append(op) + key = (op.batching_key(), op.batch_runner_fn()) + batched_ops.setdefault(key, []).append(op) op_results = {} for batch in batched_ops.values(): # Run each of the batched ops using its runner. diff --git a/tensorflow/contrib/boosted_trees/python/training/__init__.py b/tensorflow/contrib/boosted_trees/python/training/__init__.py new file mode 100644 index 00000000000..b569ac5fdb6 --- /dev/null +++ b/tensorflow/contrib/boosted_trees/python/training/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""training module under boosted_trees.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py new file mode 100644 index 00000000000..c1750117cd7 --- /dev/null +++ b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""functions module under boosted_trees.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index b95956dae2a..f0b66dcbbe1 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import copy from tensorflow.contrib import learn @@ -163,7 +164,7 @@ def extract_features(features, feature_columns): scope = "gbdt" with variable_scope.variable_scope(scope): feature_columns = list(feature_columns) - transformed_features = {} + transformed_features = collections.OrderedDict() for fc in feature_columns: # pylint: disable=protected-access if isinstance(fc, feature_column_lib._EmbeddingColumn): @@ -681,13 +682,13 @@ class GradientBoostedDecisionTreeModel(object): control_flow_ops.no_op)) # Update handler stats. - handler_reads = {} + handler_reads = collections.OrderedDict() for handler in handlers: handler_reads[handler] = handler.scheduled_reads() handler_results = batch_ops_utils.run_handler_scheduled_ops( handler_reads, ensemble_stamp, worker_device) - per_handler_updates = {} + per_handler_updates = collections.OrderedDict() # Two values per handler. First one is if the handler is active for the # current layer. The second one is if the handler is going to be active # for the next layer. diff --git a/tensorflow/contrib/boosted_trees/python/utils/__init__.py b/tensorflow/contrib/boosted_trees/python/utils/__init__.py new file mode 100644 index 00000000000..6ceb150c265 --- /dev/null +++ b/tensorflow/contrib/boosted_trees/python/utils/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""utils module under boosted_trees.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h index 59f23332983..fea6b15640d 100644 --- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h +++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h @@ -399,6 +399,6 @@ const string kTestEmptyRow = R"({ }]}]})"; } // namespace -} // namepsace tensorflow +} // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_ diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD index 15abd2be038..80e18a43a71 100644 --- a/tensorflow/contrib/cluster_resolver/BUILD +++ b/tensorflow/contrib/cluster_resolver/BUILD @@ -34,6 +34,7 @@ py_library( ":cluster_resolver_py", ":gce_cluster_resolver_py", ":tpu_cluster_resolver_py", + "//tensorflow/python:util", ], ) diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py index d17501e87e7..b4d8cd4a7cf 100644 --- a/tensorflow/contrib/cluster_resolver/__init__.py +++ b/tensorflow/contrib/cluster_resolver/__init__.py @@ -26,3 +26,15 @@ from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver # pylint: enable=wildcard-import,unused-import + +from tensorflow.python.util.all_util import remove_undocumented + +_allowed_symbols = [ + 'ClusterResolver', + 'SimpleClusterResolver', + 'UnionClusterResolver', + 'GceClusterResolver', + 'TPUClusterResolver', +] + +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index 2e75ac226ea..a6a6e642e4e 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -143,7 +143,8 @@ class TPUClusterResolver(ClusterResolver): request = self._service.projects().locations().nodes().get(name=full_name) response = request.execute() - instance_url = '%s:%s' % (response['ipAddress'], response['port']) - worker_list.append(instance_url) + if 'health' in response and response['health'] == 'HEALTHY': + instance_url = '%s:%s' % (response['ipAddress'], response['port']) + worker_list.append(instance_url) return ClusterSpec({self._job_name: worker_list}) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py index 0c4730613af..4fd34629cf7 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py @@ -105,7 +105,8 @@ class TPUClusterResolverTest(test.TestCase): tpu_map = { 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { 'ipAddress': '10.1.2.3', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' } } @@ -126,7 +127,8 @@ class 
TPUClusterResolverTest(test.TestCase): tpu_map = { 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { 'ipAddress': '10.1.2.3', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' } } @@ -147,11 +149,13 @@ class TPUClusterResolverTest(test.TestCase): tpu_map = { 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { 'ipAddress': '10.1.2.3', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' }, 'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': { 'ipAddress': '10.4.5.6', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' } } @@ -169,15 +173,54 @@ class TPUClusterResolverTest(test.TestCase): """ self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) + def testHealthyTpuNodeRetrieval(self): + tpu_map = { + 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { + 'ipAddress': '10.1.2.3', + 'port': '8470', + 'health': 'HEALTHY' + }, + 'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': { + 'ipAddress': '10.4.5.6', + 'port': '8470', + }, + 'projects/test-project/locations/us-central1-c/nodes/test-tpu-3': { + 'ipAddress': '10.7.8.9', + 'port': '8470', + 'health': 'UNHEALTHY' + } + } + + tpu_cluster_resolver = TPUClusterResolver( + project='test-project', + zone='us-central1-c', + tpu_names=['test-tpu-2', 'test-tpu-1', 'test-tpu-3'], + credentials=None, + service=self.mock_service_client(tpu_map=tpu_map)) + + actual_cluster_spec = tpu_cluster_resolver.cluster_spec() + expected_proto = """ + job { + name: 'tpu_worker' + tasks { + key: 0 + value: '10.1.2.3:8470' + } + } + """ + self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto) + def testGetMasterMultipleEntries(self): tpu_map = { 'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': { 'ipAddress': '10.1.2.3', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' }, 'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': { 'ipAddress': '10.4.5.6', - 'port': '8470' + 'port': '8470', + 'health': 'HEALTHY' } } diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake index aedb793d2ae..fd05fa6d472 100644 --- a/tensorflow/contrib/cmake/external/protobuf.cmake +++ b/tensorflow/contrib/cmake/external/protobuf.cmake @@ -16,7 +16,7 @@ include (ExternalProject) set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src) set(PROTOBUF_URL https://github.com/google/protobuf.git) -set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9) +set(PROTOBUF_TAG 396336eb961b75f03b25824fe86cf6490fb75e3a) if(WIN32) set(protobuf_STATIC_LIBRARIES diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index 1260b81b07a..3bae1edadb8 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -6,6 +6,7 @@ tensorflow/core/example tensorflow/core/framework tensorflow/core/lib tensorflow/core/lib/core +tensorflow/core/profiler tensorflow/core/protobuf tensorflow/core/util tensorflow/examples @@ -216,6 +217,8 @@ tensorflow/contrib/input_pipeline/python/ops tensorflow/contrib/integrate tensorflow/contrib/integrate/python tensorflow/contrib/integrate/python/ops +tensorflow/contrib/kafka/python +tensorflow/contrib/kafka/python/ops tensorflow/contrib/keras tensorflow/contrib/keras/api tensorflow/contrib/keras/api/keras diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake index 138993db352..c42bc35ce7e 100644 --- 
a/tensorflow/contrib/cmake/tf_core_ops.cmake +++ b/tensorflow/contrib/cmake/tf_core_ops.cmake @@ -30,6 +30,7 @@ set(tf_op_lib_names "list_ops" "lookup_ops" "logging_ops" + "manip_ops" "math_ops" "nn_ops" "no_op" diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 8862390d2b6..34c466fa01e 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -307,7 +307,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name) # containing the wrappers. add_custom_command( OUTPUT ${GENERATE_PYTHON_OP_LIB_DESTINATION} - COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION} + COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION} DEPENDS ${tf_python_op_lib_name}_gen_python ) @@ -335,6 +335,7 @@ GENERATE_PYTHON_OP_LIB("list_ops") GENERATE_PYTHON_OP_LIB("logging_ops") GENERATE_PYTHON_OP_LIB("lookup_ops") GENERATE_PYTHON_OP_LIB("nn_ops") +GENERATE_PYTHON_OP_LIB("manip_ops") GENERATE_PYTHON_OP_LIB("parsing_ops") GENERATE_PYTHON_OP_LIB("random_ops") GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops" diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index f67698eb99a..77ea914380d 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -31,6 +31,7 @@ from __future__ import division from __future__ import print_function import argparse +import codecs import io import os import re @@ -103,7 +104,7 @@ def main(): for lib_path in args.input: proc = subprocess.Popen([DUMPBIN, "/nologo", "/linkermember:1", lib_path], stdout=subprocess.PIPE) - for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"): + for line in codecs.getreader("utf-8")(proc.stdout): cols = line.split() if len(cols) < 2: continue @@ -131,7 +132,7 @@ def main(): # We compare on undname but use the decorated name from candidates. dupes = 0 proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE) - for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")): + for idx, line in enumerate(codecs.getreader("utf-8")(proc.stdout)): decorated = candidates[idx] if decorated in taken: # Symbol is already in output, done. diff --git a/tensorflow/contrib/coder/README.md b/tensorflow/contrib/coder/README.md index e1e867db5aa..c6c379c4588 100644 --- a/tensorflow/contrib/coder/README.md +++ b/tensorflow/contrib/coder/README.md @@ -30,7 +30,7 @@ following sense: around, - The number of CDF axes does not extend, i.e., `CDF.ndim == data.ndim + 1`. 
-In the previous example where data has shape (10, 10), the followings are +In the previous example where data has shape (10, 10), the following are acceptable CDF shapes: - (10, 10, 65) diff --git a/tensorflow/contrib/coder/kernels/range_coder.cc b/tensorflow/contrib/coder/kernels/range_coder.cc index f4f076b6c4e..21b35155ff3 100644 --- a/tensorflow/contrib/coder/kernels/range_coder.cc +++ b/tensorflow/contrib/coder/kernels/range_coder.cc @@ -276,7 +276,7 @@ void RangeEncoder::Finalize(string* sink) { } } else if (base_ != 0) { // If base == 0, then pick 0 from [base, base + size) and no zeros are - // explcitly written. + // explicitly written. // // Otherwise, pick (base + (2^16 - base[16:0])), i.e., round up base to the // next multiple of 2^16. As 2^16 < size, this value should be in the diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py index 2108e42bce4..29a593f6bcf 100644 --- a/tensorflow/contrib/compiler/jit_test.py +++ b/tensorflow/contrib/compiler/jit_test.py @@ -24,6 +24,7 @@ from tensorflow.python.framework import function from tensorflow.python.framework import op_def_registry from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed +from tensorflow.python.framework import test_util from tensorflow.python.ops import gradients from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops @@ -169,6 +170,7 @@ class JITTest(test.TestCase): self.assertEqual(b"jit_scope_0", func_attrs["_XlaScope"].s) +@test_util.with_c_api class CompilationEnabledInGradientTest(test.TestCase): def testCompilationInGradient(self): @@ -188,7 +190,7 @@ class CompilationEnabledInGradientTest(test.TestCase): for cg in c_grad_ops: self.assertTrue(cg.get_attr("_XlaCompile")) for ncg in nc_grad_ops: - with self.assertRaisesRegexp(ValueError, "No attr named"): + with self.assertRaisesRegexp(ValueError, "[Nn]o attr named"): ncg.get_attr("_XlaCompile") # d/dx (x ** 4) = 4 * (x ** 3) diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc index 9e41e678571..1a79bf066c3 100644 --- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc +++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc @@ -251,9 +251,8 @@ REGISTER_OP("CudnnRNNParamsToCanonical") TF_RETURN_IF_ERROR(c->GetAttr("num_params", &num_params)); // Set shape for weight matrices for (int i = 0; i < num_params; i++) { - c->set_output(i, - c->Matrix(InferenceContext::kUnknownDim, - InferenceContext::kUnknownDim)); + c->set_output(i, c->Matrix(InferenceContext::kUnknownDim, + InferenceContext::kUnknownDim)); } // Set shape for bias vectors for (int i = 0; i < num_params; i++) { @@ -300,6 +299,7 @@ upcoming training or inferences. num_params: number of parameter sets for all layers. Each layer may contain multiple parameter sets, with each set consisting of a weight matrix and a bias vector. 
-)doc", kCudnnRNNCommonAttrs)); +)doc", + kCudnnRNNCommonAttrs)); } // namespace tensorflow diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py index 4fc5ff1bd18..56c562a3bad 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py @@ -20,6 +20,7 @@ from __future__ import print_function import time +from six.moves import xrange from tensorflow.contrib import rnn as contrib_rnn from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from tensorflow.contrib.rnn.python.ops import lstm_ops diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 1cf0202fd88..04a21f2b0f5 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -126,6 +126,7 @@ py_library( "//tensorflow/python:client_testlib", "//tensorflow/python:errors", "//tensorflow/python:framework_ops", + "//tensorflow/python:lookup_ops", "//tensorflow/python:platform", "//tensorflow/python:sparse_tensor", "//tensorflow/python:training", diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index 015f69c5673..0c2827b1e49 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -744,6 +744,23 @@ class BatchDatasetSerializationTest( lambda: self._build_dataset_dense_to_sparse(diff_comp), num_outputs) + def _sparse(self, i): + return sparse_tensor.SparseTensorValue( + indices=[[0]], values=(i * [1]), dense_shape=[1]) + + def _build_dataset_sparse(self, batch_size=5): + return dataset_ops.Dataset.range(10).map(self._sparse).batch(batch_size) + + def testSparseCore(self): + self.run_core_tests(self._build_dataset_sparse, + lambda: self._build_dataset_sparse(2), 2) + + def _build_dataset_nested_sparse(self): + return dataset_ops.Dataset.range(10).map(self._sparse).batch(5).batch(2) + + def testNestedSparseCore(self): + self.run_core_tests(self._build_dataset_nested_sparse, None, 1) + class PaddedBatchDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py index 4d984bb4d76..6de93059d8c 100644 --- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py @@ -41,8 +41,7 @@ class GroupByWindowTest(test.TestCase): dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x) .apply( grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), - 4)) - .make_initializable_iterator()) + 4)).make_initializable_iterator()) init_op = iterator.initializer get_next = iterator.get_next() @@ -53,7 +52,8 @@ class GroupByWindowTest(test.TestCase): while True: result = sess.run(get_next) self.assertTrue( - all(x % 2 == 0 for x in result) or all(x % 2 == 1) + all(x % 2 == 0 + for x in result) or all(x % 2 == 1) for x in result) counts.append(result.shape[0]) @@ -116,8 +116,8 @@ class GroupByWindowTest(test.TestCase): iterator = ( dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply( - 
grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32)) - .make_initializable_iterator()) + grouping.group_by_window(lambda x, _: x % 2, reduce_func, + 32)).make_initializable_iterator()) init_op = iterator.initializer get_next = iterator.get_next() @@ -136,7 +136,8 @@ class GroupByWindowTest(test.TestCase): window.padded_batch( 4, padded_shapes=tensor_shape.TensorShape([None])), window.padded_batch( - 4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),)) + 4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])), + )) iterator = ( dataset_ops.Dataset.from_tensor_slices(components) @@ -200,9 +201,10 @@ class BucketTest(test.TestCase): # dynamically and does not rely on static shape information about # the arguments. return dataset_ops.Dataset.zip( - (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch( - 32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape([None]), - tensor_shape.TensorShape([3]))))) + (dataset_ops.Dataset.from_tensors(bucket), + window.padded_batch( + 32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape( + [None]), tensor_shape.TensorShape([3]))))) def testSingleBucket(self): @@ -307,12 +309,13 @@ class BucketTest(test.TestCase): def _dynamic_pad_fn(bucket, window, _): return dataset_ops.Dataset.zip( - (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch( - 32, { - "x": tensor_shape.TensorShape([]), - "y": tensor_shape.TensorShape([None]), - "z": tensor_shape.TensorShape([3]) - }))) + (dataset_ops.Dataset.from_tensors(bucket), + window.padded_batch( + 32, { + "x": tensor_shape.TensorShape([]), + "y": tensor_shape.TensorShape([None]), + "z": tensor_shape.TensorShape([3]) + }))) input_dataset = ( dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn) diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py index 7cde6e05b24..dbc35097ddd 100644 --- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py +++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py @@ -24,9 +24,11 @@ import numpy as np from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables from tensorflow.python.platform import gfile from tensorflow.python.platform import test @@ -34,14 +36,29 @@ from tensorflow.python.training import saver as saver_lib from tensorflow.python.util import nest +def remove_variants(get_next_op): + # TODO(b/72408568): Remove this once session.run can get + # variant tensors. + """Remove variants from a nest structure, so sess.run will execute.""" + + def _remove_variant(x): + if isinstance(x, ops.Tensor) and x.dtype == dtypes.variant: + return () + else: + return x + + return nest.map_structure(_remove_variant, get_next_op) + + class DatasetSerializationTestBase(test.TestCase): """Base class for testing serializable datasets.""" def tearDown(self): self._delete_ckpt() - # TODO(b/70988345): Support native `tf.SparseTensor` objects and get rid of - # `sparse_tensors` argument. 
+ # TODO(b/72657739): Remove sparse_tensor argument, which is to test the + # (deprecated) saveable `SparseTensorSliceDataset`, once the API + # `from_sparse_tensor_slices()`and related tests are deleted. def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False): """Runs the core tests. @@ -233,10 +250,10 @@ class DatasetSerializationTestBase(test.TestCase): saver = self._import_meta_graph() init_op, get_next_op = self._get_iterator_ops_from_collection( ds_fn, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: self._restore(saver, sess) - sess.run(variables.global_variables_initializer()) - sess.run(init_op) + self._initialize(init_op, sess) for _ in range(num_outputs): actual.append(sess.run(get_next_op)) if verify_exhausted: @@ -296,6 +313,7 @@ class DatasetSerializationTestBase(test.TestCase): with ops.Graph().as_default() as g: _, get_next_op, saver = self._build_graph( ds_fn2, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: self._restore(saver, sess) for _ in range(num_outputs - break_point): @@ -356,6 +374,7 @@ class DatasetSerializationTestBase(test.TestCase): with ops.Graph().as_default() as g: get_next_op, saver = self._build_empty_graph( ds_fn, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: self._restore(saver, sess) for _ in range(num_outputs - break_point): @@ -389,9 +408,9 @@ class DatasetSerializationTestBase(test.TestCase): with ops.Graph().as_default() as g: init_op, get_next_op, saver = self._build_graph( ds_fn, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: - sess.run(variables.global_variables_initializer()) - sess.run(init_op) + self._initialize(init_op, sess) for _ in range(break_point): sess.run(get_next_op) with self.assertRaises(error): @@ -485,20 +504,20 @@ class DatasetSerializationTestBase(test.TestCase): else: init_op, get_next_op, saver = self._build_graph( ds_fn, sparse_tensors=sparse_tensors) + get_next_op = remove_variants(get_next_op) return init_op, get_next_op, saver for i in range(len(break_points) + 1): with ops.Graph().as_default() as g: init_op, get_next_op, saver = get_ops() + get_next_op = remove_variants(get_next_op) with self.test_session(graph=g) as sess: if ckpt_saved: if init_before_restore: - sess.run(variables.global_variables_initializer()) - sess.run(init_op) + self._initialize(init_op, sess) self._restore(saver, sess) else: - sess.run(variables.global_variables_initializer()) - sess.run(init_op) + self._initialize(init_op, sess) start = break_points[i - 1] if i > 0 else 0 end = break_points[i] if i < len(break_points) else num_outputs num_iters = end - start @@ -562,13 +581,16 @@ class DatasetSerializationTestBase(test.TestCase): get_next = sparse_tensor.SparseTensor(*iterator.get_next()) else: get_next = iterator.get_next() - self._add_iterator_ops_to_collection(init_op, get_next, sparse_tensors) + self._add_iterator_ops_to_collection(init_op, get_next, ds_fn, + sparse_tensors) saver = saver_lib.Saver(allow_empty=True) return init_op, get_next, saver def _build_empty_graph(self, ds_fn, sparse_tensors=False): iterator = iterator_ops.Iterator.from_structure( - self._get_output_types(ds_fn), self._get_output_shapes(ds_fn)) + self._get_output_types(ds_fn), + output_shapes=self._get_output_shapes(ds_fn), + output_classes=self._get_output_classes(ds_fn)) 
saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) if sparse_tensors: @@ -581,12 +603,19 @@ class DatasetSerializationTestBase(test.TestCase): def _add_iterator_ops_to_collection(self, init_op, get_next, + ds_fn, sparse_tensors=False): ops.add_to_collection("iterator_ops", init_op) # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections # do not support tuples we flatten the tensors and restore the shape in # `_get_iterator_ops_from_collection`. - if sparse_tensors: + + # TODO(shivaniagrwal): `output_classes` is a nested structure of classes, + # this base class is specific to current test cases. Update when tests are + # added with `output_classes` as a nested structure with at least one of the + # component being `tf.SparseTensor`. + if (sparse_tensors or + self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor): ops.add_to_collection("iterator_ops", get_next.indices) ops.add_to_collection("iterator_ops", get_next.values) ops.add_to_collection("iterator_ops", get_next.dense_shape) @@ -596,7 +625,8 @@ class DatasetSerializationTestBase(test.TestCase): def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False): all_ops = ops.get_collection("iterator_ops") - if sparse_tensors: + if (sparse_tensors or + self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor): init_op, indices, values, dense_shape = all_ops return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape) else: @@ -611,6 +641,10 @@ class DatasetSerializationTestBase(test.TestCase): with ops.Graph().as_default(): return ds_fn().output_shapes + def _get_output_classes(self, ds_fn): + with ops.Graph().as_default(): + return ds_fn().output_classes + def _ckpt_path(self): return os.path.join(self.get_temp_dir(), "iterator") @@ -621,8 +655,14 @@ class DatasetSerializationTestBase(test.TestCase): saver.save(sess, self._ckpt_path()) def _restore(self, saver, sess): + sess.run(lookup_ops.tables_initializer()) saver.restore(sess, self._latest_ckpt()) + def _initialize(self, init_op, sess): + sess.run(variables.global_variables_initializer()) + sess.run(lookup_ops.tables_initializer()) + sess.run(init_op) + def _import_meta_graph(self): meta_file_path = self._ckpt_path() + ".meta" return saver_lib.import_meta_graph(meta_file_path) diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py index 5921be2ae89..06883934d04 100644 --- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py @@ -194,6 +194,10 @@ class FilterDatasetSerializationTest( return dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map( lambda x, i: x) + def testSparseCore(self): + num_outputs = 5 + self.run_core_tests(self._build_sparse_filter, None, num_outputs) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py index d4fbaa5cdcd..86d69495ef4 100644 --- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py @@ -225,6 +225,21 @@ class FlatMapDatasetSerializationTest( self.verify_error_on_save(build_ds, 500, errors.InvalidArgumentError) + def testSparseCore(self): + + def _map_fn(i): + return 
sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2]) + + def _flat_map_fn(x): + return dataset_ops.Dataset.from_tensor_slices( + sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values)) + + def _build_ds(): + return dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn) + + self.run_core_tests(_build_ds, None, 20) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py index b1937c08f34..db8429512bf 100644 --- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py @@ -252,6 +252,22 @@ class InterleaveDatasetSeriazationTest( None, num_outputs) # pylint: enable=g-long-lambda + def testSparseCore(self): + + def _map_fn(i): + return sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2]) + + def _interleave_fn(x): + return dataset_ops.Dataset.from_tensor_slices( + sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values)) + + def _build_dataset(): + return dataset_ops.Dataset.range(10).map(_map_fn).interleave( + _interleave_fn, cycle_length=1) + + self.run_core_tests(_build_dataset, None, 20) + class ParallelInterleaveDatasetTest(test.TestCase): diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py index dd8247bfd47..d3ce89298be 100644 --- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py @@ -805,6 +805,21 @@ class MapDatasetSerializationTest( self.run_core_tests(_build_ds, None, num_outputs) + def testSparseCore(self): + + def _sparse(i): + return sparse_tensor.SparseTensorValue( + indices=np.array([[0, 0]]), + values=(i * np.array([1])), + dense_shape=np.array([1, 1])) + + def _build_ds(num_outputs): + return contrib_dataset_ops.Dataset.range(num_outputs).map(_sparse) + + num_outputs = 10 + self.run_core_tests(lambda: _build_ds(num_outputs), + lambda: _build_ds(int(num_outputs / 2)), num_outputs) + class ParallelMapDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): @@ -851,7 +866,8 @@ class ParallelMapDatasetSerializationTest( return random_ops.random_uniform( (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x) - return contrib_dataset_ops.Dataset.range(100).map(_map_fn) + return contrib_dataset_ops.Dataset.range(100).map( + _map_fn, num_parallel_calls=2).prefetch(2) self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError) @@ -861,7 +877,8 @@ class ParallelMapDatasetSerializationTest( counter_var = variable_scope.get_variable( "counter", (), dtypes.int32, use_resource=True) return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map( - lambda _: counter_var.assign_add(1))) + lambda _: counter_var.assign_add(1), + num_parallel_calls=2).prefetch(2)) self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError) @@ -870,7 +887,7 @@ class ParallelMapDatasetSerializationTest( def _build_ds(): constant_var = constant_op.constant(5) return (contrib_dataset_ops.Dataset.from_tensors(0).repeat(10).map( - lambda x: x + constant_var)) + lambda x: x + constant_var, num_parallel_calls=2).prefetch(2)) self.run_core_tests(_build_ds, None, 10) @@ -883,7 +900,8 @@ class 
ParallelMapDatasetSerializationTest( def defun_fn(x): return constant_op.constant(1000) + math_ops.to_int32(x) - return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn) + return contrib_dataset_ops.Dataset.range(num_outputs).map( + defun_fn, num_parallel_calls=2).prefetch(2) self.run_core_tests(_build_ds, None, num_outputs) @@ -901,7 +919,8 @@ class ParallelMapDatasetSerializationTest( return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x)) - return contrib_dataset_ops.Dataset.range(num_outputs).map(defun_fn) + return contrib_dataset_ops.Dataset.range(num_outputs).map( + defun_fn, num_parallel_calls=2).prefetch(2) self.run_core_tests(_build_ds, None, num_outputs) diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 76c07b2c999..6eb512dec67 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -403,7 +403,7 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1): num_parallel_batches: A `tf.int64` scalar `tf.Tensor`, representing the number of batches to create in parallel. On one hand, higher values can help mitigate the effect of stragglers. On the other hand, higher values - can increasing contention if CPU is scarce. + can increase contention if CPU is scarce. Returns: A `Dataset` transformation function, which can be passed to diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py index 1dd0729513c..9cd1701c397 100644 --- a/tensorflow/contrib/data/python/ops/stats_ops.py +++ b/tensorflow/contrib/data/python/ops/stats_ops.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import nest +from tensorflow.python.data.util import sparse from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import gen_dataset_ops @@ -161,8 +162,10 @@ class _StatsDataset(dataset_ops.Dataset): return self._op_function( self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access self._tag, - output_shapes=nest.flatten(self.output_shapes), - output_types=nest.flatten(self.output_types)) + output_types=nest.flatten( + sparse.as_dense_types(self.output_types, self.output_classes)), + output_shapes=nest.flatten( + sparse.as_dense_shapes(self.output_shapes, self.output_classes))) @property def output_shapes(self): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py index a255d4fc890..31d24aa9ea0 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py @@ -23,10 +23,15 @@ import itertools import numpy as np from tensorflow.contrib.distributions.python.ops import distribution_util +from tensorflow.contrib.distributions.python.ops import mixture +from tensorflow.contrib.distributions.python.ops import mixture_same_family +from tensorflow.contrib.distributions.python.ops import mvn_diag from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops +from tensorflow.python.ops.distributions import categorical +from 
tensorflow.python.ops.distributions import normal from tensorflow.python.ops.linalg import linear_operator_diag import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import test @@ -395,6 +400,41 @@ class MixtureStddevTest(test.TestCase): self.assertAllClose(actual_devs, expected_devs) +class PadMixtureDimensionsTest(test.TestCase): + + def test_pad_mixture_dimensions_mixture(self): + with self.test_session() as sess: + gm = mixture.Mixture( + cat=categorical.Categorical(probs=[[0.3, 0.7]]), + components=[ + normal.Normal(loc=[-1.0], scale=[1.0]), + normal.Normal(loc=[1.0], scale=[0.5]) + ]) + + x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]]) + x_pad = distribution_util.pad_mixture_dimensions( + x, gm, gm.cat, gm.event_shape.ndims) + x_out, x_pad_out = sess.run([x, x_pad]) + + self.assertAllEqual(x_pad_out.shape, [2, 2]) + self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1])) + + def test_pad_mixture_dimensions_mixture_same_family(self): + with self.test_session() as sess: + gm = mixture_same_family.MixtureSameFamily( + mixture_distribution=categorical.Categorical(probs=[0.3, 0.7]), + components_distribution=mvn_diag.MultivariateNormalDiag( + loc=[[-1., 1], [1, -1]], scale_identity_multiplier=[1.0, 0.5])) + + x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]]) + x_pad = distribution_util.pad_mixture_dimensions( + x, gm, gm.mixture_distribution, gm.event_shape.ndims) + x_out, x_pad_out = sess.run([x, x_pad]) + + self.assertAllEqual(x_pad_out.shape, [2, 2, 1]) + self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1])) + + class _PadTest(object): def testNegAxisCorrectness(self): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py new file mode 100644 index 00000000000..ea3c86b5c0f --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py @@ -0,0 +1,388 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import importlib + +import numpy as np + +from tensorflow.contrib.distributions.python.ops import kumaraswamy as kumaraswamy_lib +from tensorflow.python.client import session +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import random_seed +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +special = try_import("scipy.special") +stats = try_import("scipy.stats") + + +def _kumaraswamy_mode(a, b): + a = np.asarray(a) + b = np.asarray(b) + return ((a - 1) / (a * b - 1))**(1 / a) + + +def _kumaraswamy_moment(a, b, n): + a = np.asarray(a) + b = np.asarray(b) + return b * special.beta(1.0 + n / a, b) + + +def _harmonic_number(b): + b = np.asarray(b) + return special.psi(b + 1) - special.psi(1) + + +def _kumaraswamy_cdf(a, b, x): + a = np.asarray(a) + b = np.asarray(b) + x = np.asarray(x) + return 1 - (1 - x**a)**b + + +def _kumaraswamy_pdf(a, b, x): + a = np.asarray(a) + b = np.asarray(b) + x = np.asarray(x) + return a * b * x ** (a - 1) * (1 - x ** a) ** (b - 1) + + +class KumaraswamyTest(test.TestCase): + + def testSimpleShapes(self): + with self.test_session(): + a = np.random.rand(3) + b = np.random.rand(3) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertAllEqual([], dist.event_shape_tensor().eval()) + self.assertAllEqual([3], dist.batch_shape_tensor().eval()) + self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape) + self.assertEqual(tensor_shape.TensorShape([3]), dist.batch_shape) + + def testComplexShapes(self): + with self.test_session(): + a = np.random.rand(3, 2, 2) + b = np.random.rand(3, 2, 2) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertAllEqual([], dist.event_shape_tensor().eval()) + self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval()) + self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape) + self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape) + + def testComplexShapesBroadcast(self): + with self.test_session(): + a = np.random.rand(3, 2, 2) + b = np.random.rand(2, 2) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertAllEqual([], dist.event_shape_tensor().eval()) + self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval()) + self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape) + self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape) + + def testAProperty(self): + a = [[1., 2, 3]] + b = [[2., 4, 3]] + with self.test_session(): + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual([1, 3], dist.concentration1.get_shape()) + self.assertAllClose(a, dist.concentration1.eval()) + + def testBProperty(self): + a = [[1., 2, 3]] + b = [[2., 4, 3]] + with self.test_session(): + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual([1, 3], dist.concentration0.get_shape()) + self.assertAllClose(b, dist.concentration0.eval()) + + def testPdfXProper(self): + a = [[1., 2, 3]] + b = [[2., 4, 3]] + with self.test_session(): + dist = kumaraswamy_lib.Kumaraswamy(a, b, 
validate_args=True) + dist.prob([.1, .3, .6]).eval() + dist.prob([.2, .3, .5]).eval() + # Either condition can trigger. + with self.assertRaisesOpError("sample must be positive"): + dist.prob([-1., 0.1, 0.5]).eval() + with self.assertRaisesOpError("sample must be positive"): + dist.prob([0., 0.1, 0.5]).eval() + with self.assertRaisesOpError("sample must be no larger than `1`"): + dist.prob([.1, .2, 1.2]).eval() + + def testPdfTwoBatches(self): + with self.test_session(): + a = [1., 2] + b = [1., 2] + x = [.5, .5] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + pdf = dist.prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2,), pdf.get_shape()) + + def testPdfTwoBatchesNontrivialX(self): + with self.test_session(): + a = [1., 2] + b = [1., 2] + x = [.3, .7] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + pdf = dist.prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2,), pdf.get_shape()) + + def testPdfUniformZeroBatch(self): + with self.test_session(): + # This is equivalent to a uniform distribution + a = 1. + b = 1. + x = np.array([.1, .2, .3, .5, .8], dtype=np.float32) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + pdf = dist.prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((5,), pdf.get_shape()) + + def testPdfAStretchedInBroadcastWhenSameRank(self): + with self.test_session(): + a = [[1., 2]] + b = [[1., 2]] + x = [[.5, .5], [.3, .7]] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + pdf = dist.prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2, 2), pdf.get_shape()) + + def testPdfAStretchedInBroadcastWhenLowerRank(self): + with self.test_session(): + a = [1., 2] + b = [1., 2] + x = [[.5, .5], [.2, .8]] + pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2, 2), pdf.get_shape()) + + def testPdfXStretchedInBroadcastWhenSameRank(self): + with self.test_session(): + a = [[1., 2], [2., 3]] + b = [[1., 2], [2., 3]] + x = [[.5, .5]] + pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2, 2), pdf.get_shape()) + + def testPdfXStretchedInBroadcastWhenLowerRank(self): + with self.test_session(): + a = [[1., 2], [2., 3]] + b = [[1., 2], [2., 3]] + x = [.5, .5] + pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x) + expected_pdf = _kumaraswamy_pdf(a, b, x) + self.assertAllClose(expected_pdf, pdf.eval()) + self.assertEqual((2, 2), pdf.get_shape()) + + def testKumaraswamyMean(self): + with session.Session(): + a = [1., 2, 3] + b = [2., 4, 1.2] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual(dist.mean().get_shape(), (3,)) + if not stats: + return + expected_mean = _kumaraswamy_moment(a, b, 1) + self.assertAllClose(expected_mean, dist.mean().eval()) + + def testKumaraswamyVariance(self): + with session.Session(): + a = [1., 2, 3] + b = [2., 4, 1.2] + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual(dist.variance().get_shape(), (3,)) + if not stats: + return + expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment( + a, b, 1)**2 + self.assertAllClose(expected_variance, dist.variance().eval()) + + def testKumaraswamyMode(self): + with session.Session(): + a = np.array([1.1, 2, 3]) + b = 
np.array([2., 4, 1.2]) + expected_mode = _kumaraswamy_mode(a, b) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual(dist.mode().get_shape(), (3,)) + self.assertAllClose(expected_mode, dist.mode().eval()) + + def testKumaraswamyModeInvalid(self): + with session.Session(): + a = np.array([1., 2, 3]) + b = np.array([2., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False) + with self.assertRaisesOpError("Condition x < y.*"): + dist.mode().eval() + + a = np.array([2., 2, 3]) + b = np.array([1., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False) + with self.assertRaisesOpError("Condition x < y.*"): + dist.mode().eval() + + def testKumaraswamyModeEnableAllowNanStats(self): + with session.Session(): + a = np.array([1., 2, 3]) + b = np.array([2., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True) + + expected_mode = _kumaraswamy_mode(a, b) + expected_mode[0] = np.nan + self.assertEqual((3,), dist.mode().get_shape()) + self.assertAllClose(expected_mode, dist.mode().eval()) + + a = np.array([2., 2, 3]) + b = np.array([1., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True) + + expected_mode = _kumaraswamy_mode(a, b) + expected_mode[0] = np.nan + self.assertEqual((3,), dist.mode().get_shape()) + self.assertAllClose(expected_mode, dist.mode().eval()) + + def testKumaraswamyEntropy(self): + with session.Session(): + a = np.array([1., 2, 3]) + b = np.array([2., 4, 1.2]) + dist = kumaraswamy_lib.Kumaraswamy(a, b) + self.assertEqual(dist.entropy().get_shape(), (3,)) + if not stats: + return + expected_entropy = (1 - 1. / a) + ( + 1 - 1. / b) * _harmonic_number(b) + np.log(a * b) + self.assertAllClose(expected_entropy, dist.entropy().eval()) + + def testKumaraswamySample(self): + with self.test_session(): + a = 1. + b = 2. + kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b) + n = constant_op.constant(100000) + samples = kumaraswamy.sample(n) + sample_values = samples.eval() + self.assertEqual(sample_values.shape, (100000,)) + self.assertFalse(np.any(sample_values < 0.0)) + if not stats: + return + self.assertLess( + stats.kstest( + # Kumaraswamy is a univariate distribution. + sample_values, + lambda x: _kumaraswamy_cdf(1., 2., x))[0], + 0.01) + # The standard error of the sample mean is 1 / (sqrt(18 * n)) + expected_mean = _kumaraswamy_moment(a, b, 1) + self.assertAllClose(sample_values.mean(axis=0), expected_mean, atol=1e-2) + expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment( + a, b, 1)**2 + self.assertAllClose( + np.cov(sample_values, rowvar=0), expected_variance, atol=1e-1) + + # Test that sampling with the same seed twice gives the same results. + def testKumaraswamySampleMultipleTimes(self): + with self.test_session(): + a_val = 1. + b_val = 2. 
+ n_val = 100 + + random_seed.set_random_seed(654321) + kumaraswamy1 = kumaraswamy_lib.Kumaraswamy( + concentration1=a_val, concentration0=b_val, name="kumaraswamy1") + samples1 = kumaraswamy1.sample(n_val, seed=123456).eval() + + random_seed.set_random_seed(654321) + kumaraswamy2 = kumaraswamy_lib.Kumaraswamy( + concentration1=a_val, concentration0=b_val, name="kumaraswamy2") + samples2 = kumaraswamy2.sample(n_val, seed=123456).eval() + + self.assertAllClose(samples1, samples2) + + def testKumaraswamySampleMultidimensional(self): + with self.test_session(): + a = np.random.rand(3, 2, 2).astype(np.float32) + b = np.random.rand(3, 2, 2).astype(np.float32) + kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b) + n = constant_op.constant(100000) + samples = kumaraswamy.sample(n) + sample_values = samples.eval() + self.assertEqual(sample_values.shape, (100000, 3, 2, 2)) + self.assertFalse(np.any(sample_values < 0.0)) + if not stats: + return + self.assertAllClose( + sample_values[:, 1, :].mean(axis=0), + _kumaraswamy_moment(a, b, 1)[1, :], + atol=1e-1) + + def testKumaraswamyCdf(self): + with self.test_session(): + shape = (30, 40, 50) + for dt in (np.float32, np.float64): + a = 10. * np.random.random(shape).astype(dt) + b = 10. * np.random.random(shape).astype(dt) + x = np.random.random(shape).astype(dt) + actual = kumaraswamy_lib.Kumaraswamy(a, b).cdf(x).eval() + self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x) + self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x) + if not stats: + return + self.assertAllClose( + _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0) + + def testKumaraswamyLogCdf(self): + with self.test_session(): + shape = (30, 40, 50) + for dt in (np.float32, np.float64): + a = 10. * np.random.random(shape).astype(dt) + b = 10. * np.random.random(shape).astype(dt) + x = np.random.random(shape).astype(dt) + actual = math_ops.exp(kumaraswamy_lib.Kumaraswamy(a, + b).log_cdf(x)).eval() + self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x) + self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. 
>= x) + if not stats: + return + self.assertAllClose( + _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py index 1e514fe0ff2..02064891758 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py @@ -107,7 +107,7 @@ def _test_capture_normal_sample_outputs(): ds.Normal._call_sample_n = true_normal_call_sample_n -def make_univariate_mixture(batch_shape, num_components): +def make_univariate_mixture(batch_shape, num_components, use_static_graph): batch_shape = ops.convert_to_tensor(batch_shape, dtypes.int32) logits = random_ops.random_uniform( array_ops.concat((batch_shape, [num_components]), axis=0), @@ -119,11 +119,11 @@ def make_univariate_mixture(batch_shape, num_components): for _ in range(num_components) ] cat = ds.Categorical(logits, dtype=dtypes.int32) - return ds.Mixture(cat, components) + return ds.Mixture(cat, components, use_static_graph=use_static_graph) def make_multivariate_mixture(batch_shape, num_components, event_shape, - batch_shape_tensor=None): + use_static_graph, batch_shape_tensor=None): if batch_shape_tensor is None: batch_shape_tensor = batch_shape batch_shape_tensor = ops.convert_to_tensor(batch_shape_tensor, dtypes.int32) @@ -145,15 +145,17 @@ def make_multivariate_mixture(batch_shape, num_components, event_shape, loc=loc, scale_diag=scale_diag) components = [create_component() for _ in range(num_components)] cat = ds.Categorical(logits, dtype=dtypes.int32) - return ds.Mixture(cat, components) + return ds.Mixture(cat, components, use_static_graph=use_static_graph) class MixtureTest(test.TestCase): + use_static_graph = False def testShapes(self): with self.test_session(): for batch_shape in ([], [1], [2, 3, 4]): - dist = make_univariate_mixture(batch_shape, num_components=10) + dist = make_univariate_mixture(batch_shape, num_components=10, + use_static_graph=self.use_static_graph) self.assertAllEqual(batch_shape, dist.batch_shape) self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval()) self.assertAllEqual([], dist.event_shape) @@ -161,7 +163,8 @@ class MixtureTest(test.TestCase): for event_shape in ([1], [2]): dist = make_multivariate_mixture( - batch_shape, num_components=10, event_shape=event_shape) + batch_shape, num_components=10, event_shape=event_shape, + use_static_graph=self.use_static_graph) self.assertAllEqual(batch_shape, dist.batch_shape) self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval()) self.assertAllEqual(event_shape, dist.event_shape) @@ -172,7 +175,8 @@ class MixtureTest(test.TestCase): r"cat.num_classes != len"): ds.Mixture( ds.Categorical([0.1, 0.5]), # 2 classes - [ds.Normal(loc=1.0, scale=2.0)]) + [ds.Normal(loc=1.0, scale=2.0)], + use_static_graph=self.use_static_graph) with self.assertRaisesWithPredicateMatch( ValueError, r"\(\) and \(2,\) are not compatible"): # The value error is raised because the batch shapes of the @@ -185,13 +189,15 @@ class MixtureTest(test.TestCase): loc=1.0, scale=2.0), # scalar dist ds.Normal( loc=[1.0, 1.0], scale=[2.0, 2.0]) - ]) + ], + use_static_graph=self.use_static_graph) with self.assertRaisesWithPredicateMatch(ValueError, r"Could not infer"): cat_logits = array_ops.placeholder(shape=[1, None], dtype=dtypes.float32) ds.Mixture( ds.Categorical(cat_logits), [ds.Normal( - loc=[1.0], 
scale=[2.0])]) + loc=[1.0], scale=[2.0])], + use_static_graph=self.use_static_graph) def testBrokenShapesDynamic(self): with self.test_session(): @@ -203,29 +209,37 @@ class MixtureTest(test.TestCase): loc=d0_param, scale=d0_param), ds.Normal( loc=d1_param, scale=d1_param) ], - validate_args=True) - with self.assertRaisesOpError(r"batch shape must match"): + validate_args=True, + use_static_graph=self.use_static_graph) + + if self.use_static_graph: + error_string = r"Shapes of all inputs must match" + else: + error_string = r"batch shape must match" + + with self.assertRaisesOpError(error_string): d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: [1.0]}) - with self.assertRaisesOpError(r"batch shape must match"): + with self.assertRaisesOpError(error_string): d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: 1.0}) def testBrokenTypes(self): with self.assertRaisesWithPredicateMatch(TypeError, "Categorical"): - ds.Mixture(None, []) + ds.Mixture(None, [], use_static_graph=self.use_static_graph) cat = ds.Categorical([0.3, 0.2]) # components must be a list of distributions with self.assertRaisesWithPredicateMatch( TypeError, "all .* must be Distribution instances"): - ds.Mixture(cat, [None]) + ds.Mixture(cat, [None], use_static_graph=self.use_static_graph) with self.assertRaisesWithPredicateMatch(TypeError, "same dtype"): ds.Mixture( cat, [ ds.Normal(loc=[1.0], scale=[2.0]), ds.Normal(loc=[np.float16(1.0)], scale=[np.float16(2.0)]), - ]) + ], use_static_graph=self.use_static_graph) with self.assertRaisesWithPredicateMatch(ValueError, "non-empty list"): - ds.Mixture(ds.Categorical([0.3, 0.2]), None) + ds.Mixture(ds.Categorical([0.3, 0.2]), None, + use_static_graph=self.use_static_graph) # TODO(ebrevdo): once distribution Domains have been added, add a # test to ensure that the domains of the distributions in a @@ -235,7 +249,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: for batch_shape in ((), (2,), (2, 3)): dist = make_univariate_mixture( - batch_shape=batch_shape, num_components=2) + batch_shape=batch_shape, num_components=2, + use_static_graph=self.use_static_graph) mean = dist.mean() self.assertEqual(batch_shape, mean.get_shape()) @@ -256,7 +271,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: for batch_shape in ((), (2,), (2, 3)): dist = make_multivariate_mixture( - batch_shape=batch_shape, num_components=2, event_shape=(4,)) + batch_shape=batch_shape, num_components=2, event_shape=(4,), + use_static_graph=self.use_static_graph) mean = dist.mean() self.assertEqual(batch_shape + (4,), mean.get_shape()) @@ -283,7 +299,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: for batch_shape in ((), (2,), (2, 3)): dist = make_univariate_mixture( - batch_shape=batch_shape, num_components=num_components) + batch_shape=batch_shape, num_components=num_components, + use_static_graph=self.use_static_graph) dev = dist.stddev() self.assertEqual(batch_shape, dev.get_shape()) @@ -325,7 +342,8 @@ class MixtureTest(test.TestCase): dist = make_multivariate_mixture( batch_shape=batch_shape, num_components=num_components, - event_shape=(4,)) + event_shape=(4,), + use_static_graph=self.use_static_graph) dev = dist.stddev() self.assertEqual(batch_shape + (4,), dev.get_shape()) @@ -371,7 +389,8 @@ class MixtureTest(test.TestCase): scale=component_devs[0]), ds.Normal(loc=component_means[1], scale=component_devs[1]), - ]) + ], + use_static_graph=self.use_static_graph) mix_dev = mixture_dist.stddev() with self.test_session() as 
sess: actual_stddev = sess.run(mix_dev) @@ -379,7 +398,8 @@ class MixtureTest(test.TestCase): def testProbScalarUnivariate(self): with self.test_session() as sess: - dist = make_univariate_mixture(batch_shape=[], num_components=2) + dist = make_univariate_mixture(batch_shape=[], num_components=2, + use_static_graph=self.use_static_graph) for x in [ np.array( [1.0, 2.0], dtype=np.float32), np.array( @@ -405,7 +425,8 @@ class MixtureTest(test.TestCase): def testProbScalarMultivariate(self): with self.test_session() as sess: dist = make_multivariate_mixture( - batch_shape=[], num_components=2, event_shape=[3]) + batch_shape=[], num_components=2, event_shape=[3], + use_static_graph=self.use_static_graph) for x in [ np.array( [[-1.0, 0.0, 1.0], [0.5, 1.0, -0.3]], dtype=np.float32), np.array( @@ -432,7 +453,8 @@ class MixtureTest(test.TestCase): def testProbBatchUnivariate(self): with self.test_session() as sess: - dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2) + dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2, + use_static_graph=self.use_static_graph) for x in [ np.random.randn(2, 3).astype(np.float32), @@ -459,7 +481,8 @@ class MixtureTest(test.TestCase): def testProbBatchMultivariate(self): with self.test_session() as sess: dist = make_multivariate_mixture( - batch_shape=[2, 3], num_components=2, event_shape=[4]) + batch_shape=[2, 3], num_components=2, event_shape=[4], + use_static_graph=self.use_static_graph) for x in [ np.random.randn(2, 3, 4).astype(np.float32), @@ -487,7 +510,8 @@ class MixtureTest(test.TestCase): num_components = 3 batch_shape = [] dist = make_univariate_mixture( - batch_shape=batch_shape, num_components=num_components) + batch_shape=batch_shape, num_components=num_components, + use_static_graph=self.use_static_graph) n = 4 with _test_capture_normal_sample_outputs() as component_samples: samples = dist.sample(n, seed=123) @@ -502,7 +526,10 @@ class MixtureTest(test.TestCase): which_c = np.where(cat_sample_values == c)[0] size_c = which_c.size # Scalar Batch univariate case: batch_size == 1, rank 1 - which_dist_samples = dist_sample_values[c][:size_c] + if self.use_static_graph: + which_dist_samples = dist_sample_values[c][which_c] + else: + which_dist_samples = dist_sample_values[c][:size_c] self.assertAllClose(which_dist_samples, sample_values[which_c]) # Test that sampling with the same seed twice gives the same results. 
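The indexing switch in the sampling tests (`dist_sample_values[c][which_c]` versus `dist_sample_values[c][:size_c]`) reflects how the two code paths lay out per-component draws: with the default dynamic indexing only the draws routed to component `c` are actually sampled, so they sit contiguously at the front of that component's output, while with `use_static_graph=True` every component is sampled for every draw and a one-hot mask over the categorical sample picks the matching entry, so positions line up with `which_c`. A minimal NumPy sketch of that selection logic follows; the names (`component_draws`, `cat`) are illustrative only and not part of the TensorFlow API.

```python
import numpy as np

rng = np.random.RandomState(0)
n, num_components = 6, 3

# Static-graph path: every component is sampled for every one of the n draws.
component_draws = rng.randn(num_components, n)      # [k, n] hypothetical draws
cat = rng.randint(0, num_components, size=n)        # categorical index per draw

# A one-hot mask over components selects each draw's component via a sum,
# mirroring the reduce_sum(x * mask) used by the static sampling path.
mask = np.eye(num_components)[cat]                  # [n, k]
selected = (component_draws.T * mask).sum(axis=-1)  # [n]

# Positions line up with the categorical sample, hence indexing with which_c.
for c in range(num_components):
    which_c = np.where(cat == c)[0]
    np.testing.assert_allclose(selected[which_c], component_draws[c][which_c])
```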
@@ -522,7 +549,8 @@ class MixtureTest(test.TestCase): ] cat = ds.Categorical( logits, dtype=dtypes.int32, name="cat1") - dist1 = ds.Mixture(cat, components, name="mixture1") + dist1 = ds.Mixture(cat, components, name="mixture1", + use_static_graph=self.use_static_graph) samples1 = dist1.sample(n, seed=123456).eval() random_seed.set_random_seed(654321) @@ -532,7 +560,8 @@ class MixtureTest(test.TestCase): ] cat2 = ds.Categorical( logits, dtype=dtypes.int32, name="cat2") - dist2 = ds.Mixture(cat2, components2, name="mixture2") + dist2 = ds.Mixture(cat2, components2, name="mixture2", + use_static_graph=self.use_static_graph) samples2 = dist2.sample(n, seed=123456).eval() self.assertAllClose(samples1, samples2) @@ -541,7 +570,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: num_components = 3 dist = make_multivariate_mixture( - batch_shape=[], num_components=num_components, event_shape=[2]) + batch_shape=[], num_components=num_components, event_shape=[2], + use_static_graph=self.use_static_graph) n = 4 with _test_capture_mvndiag_sample_outputs() as component_samples: samples = dist.sample(n, seed=123) @@ -555,14 +585,18 @@ class MixtureTest(test.TestCase): which_c = np.where(cat_sample_values == c)[0] size_c = which_c.size # Scalar Batch multivariate case: batch_size == 1, rank 2 - which_dist_samples = dist_sample_values[c][:size_c, :] + if self.use_static_graph: + which_dist_samples = dist_sample_values[c][which_c, :] + else: + which_dist_samples = dist_sample_values[c][:size_c, :] self.assertAllClose(which_dist_samples, sample_values[which_c, :]) def testSampleBatchUnivariate(self): with self.test_session() as sess: num_components = 3 dist = make_univariate_mixture( - batch_shape=[2, 3], num_components=num_components) + batch_shape=[2, 3], num_components=num_components, + use_static_graph=self.use_static_graph) n = 4 with _test_capture_normal_sample_outputs() as component_samples: samples = dist.sample(n, seed=123) @@ -576,8 +610,12 @@ class MixtureTest(test.TestCase): which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c) size_c = which_c_s.size # Batch univariate case: batch_size == [2, 3], rank 3 - which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0, - which_c_b1] + if self.use_static_graph: + which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0, + which_c_b1] + else: + which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0, + which_c_b1] self.assertAllClose(which_dist_samples, sample_values[which_c_s, which_c_b0, which_c_b1]) @@ -594,7 +632,8 @@ class MixtureTest(test.TestCase): dist = make_multivariate_mixture( batch_shape=batch_shape, num_components=num_components, event_shape=[4], - batch_shape_tensor=batch_shape_tensor) + batch_shape_tensor=batch_shape_tensor, + use_static_graph=self.use_static_graph) n = 5 with _test_capture_mvndiag_sample_outputs() as component_samples: samples = dist.sample(n, seed=123) @@ -617,8 +656,12 @@ class MixtureTest(test.TestCase): which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c) size_c = which_c_s.size # Batch univariate case: batch_size == [2, 3], rank 4 (multivariate) - which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0, - which_c_b1, :] + if self.use_static_graph: + which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0, + which_c_b1, :] + else: + which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0, + which_c_b1, :] self.assertAllClose(which_dist_samples, sample_values[which_c_s, which_c_b0, which_c_b1, :]) 
@@ -632,7 +675,8 @@ class MixtureTest(test.TestCase): with self.test_session() as sess: for batch_shape in ((), (2,), (2, 3)): dist = make_multivariate_mixture( - batch_shape=batch_shape, num_components=2, event_shape=(4,)) + batch_shape=batch_shape, num_components=2, event_shape=(4,), + use_static_graph=self.use_static_graph) entropy_lower_bound = dist.entropy_lower_bound() self.assertEqual(batch_shape, entropy_lower_bound.get_shape()) @@ -673,7 +717,8 @@ class MixtureTest(test.TestCase): cat_tf = ds.Categorical(probs=mixture_weights) components_tf = [ds.Normal(loc=mu, scale=sigma) for (mu, sigma) in zip(means, sigmas)] - mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf) + mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf, + use_static_graph=self.use_static_graph) x_tensor = array_ops.placeholder(shape=(), dtype=dtypes.float32) @@ -721,7 +766,8 @@ class MixtureTest(test.TestCase): cat_tf = ds.Categorical(probs=mixture_weights) components_tf = [ds.Normal(loc=mu, scale=sigma) for (mu, sigma) in zip(means, sigmas)] - mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf) + mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf, + use_static_graph=self.use_static_graph) x_tensor = array_ops.placeholder(shape=psize, dtype=dtypes.float32) xs_to_check = [ @@ -760,12 +806,18 @@ class MixtureTest(test.TestCase): gm = ds.Mixture( cat=ds.Categorical(probs=[.3, .7]), components=[ds.Gamma(1., 2.), - ds.Gamma(2., 1.)]) + ds.Gamma(2., 1.)], + use_static_graph=self.use_static_graph) x_ = gm.sample().eval() self.assertAllEqual([], x_.shape) +class MixtureStaticSampleTest(MixtureTest): + use_static_graph = True + + class MixtureBenchmark(test.Benchmark): + use_static_graph = False def _runSamplingBenchmark(self, name, create_distribution, use_gpu, num_components, batch_size, num_features, @@ -811,7 +863,7 @@ class MixtureBenchmark(test.Benchmark): components = list( ds.MultivariateNormalDiag( loc=mu, scale_diag=sigma) for (mu, sigma) in zip(mus, sigmas)) - return ds.Mixture(cat, components) + return ds.Mixture(cat, components, use_static_graph=self.use_static_graph) for use_gpu in False, True: if use_gpu and not test.is_gpu_available(): @@ -853,7 +905,7 @@ class MixtureBenchmark(test.Benchmark): ds.MultivariateNormalTriL( loc=mu, scale_tril=linalg_ops.cholesky(sigma)) for (mu, sigma) in zip(mus, sigmas)) - return ds.Mixture(cat, components) + return ds.Mixture(cat, components, use_static_graph=self.use_static_graph) for use_gpu in False, True: if use_gpu and not test.is_gpu_available(): @@ -872,5 +924,9 @@ class MixtureBenchmark(test.Benchmark): sample_size=sample_size) +class MixtureStaticSampleBenchmark(MixtureBenchmark): + use_static_graph = True + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py index dc8ae1eed19..5251dbcb574 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py @@ -237,6 +237,11 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector): return y event_size = array_ops.shape(x)[-1] + # If the event size is available at graph construction time, we can inform + # the graph compiler of the maximum number of steps. If not, + # static_event_size will be None, and the maximum_iterations argument will + # have no effect. 
+ static_event_size = x.shape.with_rank_at_least(1)[-1].value y0 = array_ops.zeros_like(x, name="y0") # call the template once to ensure creation _ = self._shift_and_log_scale_fn(y0) @@ -258,7 +263,8 @@ class MaskedAutoregressiveFlow(bijector_lib.Bijector): _, y = control_flow_ops.while_loop( cond=lambda index, _: index < event_size, body=_loop_body, - loop_vars=[0, y0]) + loop_vars=(0, y0), + maximum_iterations=static_event_size) return y def _inverse(self, y): diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py index a4d249d41ec..289e1d50e11 100644 --- a/tensorflow/contrib/distributions/python/ops/distribution_util.py +++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib import linalg +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops @@ -442,6 +443,44 @@ def maybe_check_scalar_distribution( return assertions +def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution, + event_ndims): + """Pad dimensions of event tensors for mixture distributions. + + See `Mixture._sample_n` and `MixtureSameFamily._sample_n` for usage examples. + + Args: + x: event tensor to pad. + mixture_distribution: Base distribution of the mixture. + categorical_distribution: `Categorical` distribution that mixes the base + distribution. + event_ndims: Integer specifying the number of event dimensions in the event + tensor. + + Returns: + A padded version of `x` that can broadcast with `categorical_distribution`. + """ + with ops.name_scope("pad_mix_dims", values=[x]): + def _get_ndims(d): + if d.batch_shape.ndims is not None: + return d.batch_shape.ndims + return array_ops.shape(d.batch_shape_tensor())[0] + dist_batch_ndims = _get_ndims(mixture_distribution) + cat_batch_ndims = _get_ndims(categorical_distribution) + pad_ndims = array_ops.where( + categorical_distribution.is_scalar_batch(), + dist_batch_ndims, + dist_batch_ndims - cat_batch_ndims) + s = array_ops.shape(x) + x = array_ops.reshape(x, shape=array_ops.concat([ + s[:-1], + array_ops.ones([pad_ndims], dtype=dtypes.int32), + s[-1:], + array_ops.ones([event_ndims], dtype=dtypes.int32), + ], axis=0)) + return x + + def static_value(x): """Returns the static value of a `Tensor` or `None`.""" return tensor_util.constant_value(ops.convert_to_tensor(x)) diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py new file mode 100644 index 00000000000..74d5d8773cf --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py @@ -0,0 +1,258 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""The Kumaraswamy distribution class.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import special_math_ops +from tensorflow.python.ops.distributions import beta +from tensorflow.python.ops.distributions import distribution +from tensorflow.python.ops.distributions import util as distribution_util +from tensorflow.python.util.tf_export import tf_export + +__all__ = [ + "Kumaraswamy", +] + +_kumaraswamy_sample_note = """Note: `x` must have dtype `self.dtype` and be in +`[0, 1].` It must have a shape compatible with `self.batch_shape()`.""" + + +def _harmonic_number(x): + """Compute the harmonic number from its analytic continuation. + + Derivation from [1] and Euler's constant [2]. + [1] - + https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers + [2] - https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant + + + Args: + x: input float. + + Returns: + z: The analytic continuation of the harmonic number for the input. + + """ + one = array_ops.ones([], dtype=x.dtype) + return math_ops.digamma(x + one) - math_ops.digamma(one) + + +@tf_export("distributions.Kumaraswamy") +class Kumaraswamy(beta.Beta): + """Kumaraswamy distribution. + + The Kumaraswamy distribution is defined over the `(0, 1)` interval using + parameters + `concentration1` (aka "alpha") and `concentration0` (aka "beta"). It has a + shape similar to the Beta distribution, but is reparameterizeable. + + #### Mathematical Details + + The probability density function (pdf) is, + + ```none + pdf(x; alpha, beta) = alpha * beta * x**(alpha - 1) * (1 - x**alpha)**(beta - + 1) + ``` + + where: + + * `concentration1 = alpha`, + * `concentration0 = beta`, + + Distribution parameters are automatically broadcast in all functions; see + examples for details. + + #### Examples + + ```python + # Create a batch of three Kumaraswamy distributions. + alpha = [1, 2, 3] + beta = [1, 2, 3] + dist = Kumaraswamy(alpha, beta) + + dist.sample([4, 5]) # Shape [4, 5, 3] + + # `x` has three batch entries, each with two samples. + x = [[.1, .4, .5], + [.2, .3, .5]] + # Calculate the probability of each pair of samples under the corresponding + # distribution in `dist`. + dist.prob(x) # Shape [2, 3] + ``` + + ```python + # Create batch_shape=[2, 3] via parameter broadcast: + alpha = [[1.], [2]] # Shape [2, 1] + beta = [3., 4, 5] # Shape [3] + dist = Kumaraswamy(alpha, beta) + + # alpha broadcast as: [[1., 1, 1,], + # [2, 2, 2]] + # beta broadcast as: [[3., 4, 5], + # [3, 4, 5]] + # batch_Shape [2, 3] + dist.sample([4, 5]) # Shape [4, 5, 2, 3] + + x = [.2, .3, .5] + # x will be broadcast as [[.2, .3, .5], + # [.2, .3, .5]], + # thus matching batch_shape [2, 3]. + dist.prob(x) # Shape [2, 3] + ``` + + """ + + def __init__(self, + concentration1=None, + concentration0=None, + validate_args=False, + allow_nan_stats=True, + name="Kumaraswamy"): + """Initialize a batch of Kumaraswamy distributions. + + Args: + concentration1: Positive floating-point `Tensor` indicating mean + number of successes; aka "alpha". 
Implies `self.dtype` and + `self.batch_shape`, i.e., + `concentration1.shape = [N1, N2, ..., Nm] = self.batch_shape`. + concentration0: Positive floating-point `Tensor` indicating mean + number of failures; aka "beta". Otherwise has same semantics as + `concentration1`. + validate_args: Python `bool`, default `False`. When `True` distribution + parameters are checked for validity despite possibly degrading runtime + performance. When `False` invalid inputs may silently render incorrect + outputs. + allow_nan_stats: Python `bool`, default `True`. When `True`, statistics + (e.g., mean, mode, variance) use the value "`NaN`" to indicate the + result is undefined. When `False`, an exception is raised if one or + more of the statistic's batch members are undefined. + name: Python `str` name prefixed to Ops created by this class. + """ + super(Kumaraswamy, self).__init__( + concentration1=concentration1, + concentration0=concentration0, + validate_args=validate_args, + allow_nan_stats=allow_nan_stats, + name=name) + self._reparameterization_type = distribution.FULLY_REPARAMETERIZED + + def _sample_n(self, n, seed=None): + expanded_concentration1 = array_ops.ones_like( + self.total_concentration, dtype=self.dtype) * self.concentration1 + expanded_concentration0 = array_ops.ones_like( + self.total_concentration, dtype=self.dtype) * self.concentration0 + shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) + uniform_sample = random_ops.random_uniform( + shape=shape, minval=0.0, maxval=1.0, dtype=self.dtype, seed=seed) + + kumaraswamy_sample = (1 - uniform_sample**(1. / expanded_concentration0))**( + 1. / expanded_concentration1) + return kumaraswamy_sample + + @distribution_util.AppendDocstring(_kumaraswamy_sample_note) + def _log_cdf(self, x): + a = self.concentration1 + b = self.concentration0 + return math_ops.log1p(-(1 - x**a)**b) + + @distribution_util.AppendDocstring(_kumaraswamy_sample_note) + def _cdf(self, x): + a = self.concentration1 + b = self.concentration0 + return 1 - (1 - x**a)**b + + def _survival_function(self, x): + a = self.concentration1 + b = self.concentration0 + return (1 - x**a)**b + + def _log_survival_function(self, x): + a = self.concentration1 + b = self.concentration0 + return b * math_ops.log1p(-x**a) + + def _log_unnormalized_prob(self, x): + x = self._maybe_assert_valid_sample(x) + a = self.concentration1 + b = self.concentration0 + return (a - 1) * math_ops.log(x) + (b - 1) * math_ops.log1p(-x**a) + + def _log_normalization(self): + a = self.concentration1 + b = self.concentration0 + return -(math_ops.log(a) + math_ops.log(b)) + + def _entropy(self): + a = self.concentration1 + b = self.concentration0 + return (1 - 1. / a) + ( + 1 - 1. / b) * _harmonic_number(b) + math_ops.log(a) + math_ops.log(b) + + def _moment(self, n): + """Compute the n'th (uncentered) moment.""" + expanded_concentration1 = array_ops.ones_like( + self.total_concentration, dtype=self.dtype) * self.concentration1 + expanded_concentration0 = array_ops.ones_like( + self.total_concentration, dtype=self.dtype) * self.concentration0 + beta_arg0 = 1 + n / expanded_concentration1 + beta_arg = array_ops.stack([beta_arg0, expanded_concentration0], -1) + log_moment = math_ops.log(expanded_concentration0) + special_math_ops.lbeta( + beta_arg) + return math_ops.exp(log_moment) + + def _mean(self): + return self._moment(1) + + def _variance(self): + # TODO(b/72696533): Investigate a more numerically stable version. 
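+ # Computes Var(X) = E[X^2] - (E[X])^2 from the first two raw moments; subtracting nearly equal moments can lose floating-point precision.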
+ return self._moment(2) - math_ops.square(self._moment(1)) + + @distribution_util.AppendDocstring( + """Note: The mode is undefined when `concentration1 <= 1` or + `concentration0 <= 1`. If `self.allow_nan_stats` is `True`, `NaN` + is used for undefined modes. If `self.allow_nan_stats` is `False` an + exception is raised when one or more modes are undefined.""") + def _mode(self): + a = self.concentration1 + b = self.concentration0 + mode = ((a - 1) / (a * b - 1))**(1. / a) + if self.allow_nan_stats: + nan = array_ops.fill( + self.batch_shape_tensor(), + np.array(np.nan, dtype=self.dtype.as_numpy_dtype), + name="nan") + is_defined = (self.concentration1 > 1.) & (self.concentration0 > 1.) + return array_ops.where(is_defined, mode, nan) + return control_flow_ops.with_dependencies([ + check_ops.assert_less( + array_ops.ones([], dtype=self.dtype), + self.concentration1, + message="Mode undefined for concentration1 <= 1."), + check_ops.assert_less( + array_ops.ones([], dtype=self.dtype), + self.concentration0, + message="Mode undefined for concentration0 <= 1.") + ], mode) diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py index f2d492f5489..cef6a143fc6 100644 --- a/tensorflow/contrib/distributions/python/ops/mixture.py +++ b/tensorflow/contrib/distributions/python/ops/mixture.py @@ -71,6 +71,7 @@ class Mixture(distribution.Distribution): components, validate_args=False, allow_nan_stats=True, + use_static_graph=False, name="Mixture"): """Initialize a Mixture distribution. @@ -96,6 +97,11 @@ class Mixture(distribution.Distribution): exception if a statistic (e.g. mean/mode/etc...) is undefined for any batch member. If `True`, batch members with valid parameters leading to undefined statistics will return NaN for this statistic. + use_static_graph: Calls to `sample` will not rely on dynamic tensor + indexing, allowing for some static graph compilation optimizations, but + at the expense of sampling all underlying distributions in the mixture. + (Possibly useful when running on TPUs). + Default value: `False` (i.e., use dynamic indexing). name: A name for this distribution (optional). Raises: @@ -178,6 +184,10 @@ class Mixture(distribution.Distribution): self._static_event_shape = static_event_shape self._static_batch_shape = static_batch_shape + self._use_static_graph = use_static_graph + if use_static_graph and static_num_components is None: + raise ValueError("Number of categories must be known statically when " + "`static_sample=True`.") # We let the Mixture distribution access _graph_parents since its arguably # more like a baseclass. graph_parents = self._cat._graph_parents # pylint: disable=protected-access @@ -292,6 +302,31 @@ class Mixture(distribution.Distribution): return mixture_log_cdf def _sample_n(self, n, seed=None): + if self._use_static_graph: + # This sampling approach is almost the same as the approach used by + # `MixtureSameFamily`. The differences are due to having a list of + # `Distribution` objects rather than a single object, and maintaining + # random seed management that is consistent with the non-static code path. 
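+ # The static path below draws categorical indices for all n samples, samples every component, stacks the per-component draws along a new components axis, and then selects each draw's component with a padded one-hot mask and a reduce_sum.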
+ samples = [] + cat_samples = self.cat.sample(n, seed=seed) + for c in range(self.num_components): + seed = distribution_util.gen_new_seed(seed, "mixture") + samples.append(self.components[c].sample(n, seed=seed)) + x = array_ops.stack( + samples, -self._static_event_shape.ndims - 1) # [n, B, k, E] + npdt = x.dtype.as_numpy_dtype + mask = array_ops.one_hot( + indices=cat_samples, # [n, B] + depth=self._num_components, # == k + on_value=np.ones([], dtype=npdt), + off_value=np.zeros([], dtype=npdt)) # [n, B, k] + mask = distribution_utils.pad_mixture_dimensions( + mask, self, self._cat, + self._static_event_shape.ndims) # [n, B, k, [1]*e] + return math_ops.reduce_sum( + x * mask, + axis=-1 - self._static_event_shape.ndims) # [n, B, E] + with ops.control_dependencies(self._assertions): n = ops.convert_to_tensor(n, name="n") static_n = tensor_util.constant_value(n) diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py index 49afbea7f05..b93bdc5ab40 100644 --- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py +++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.python.framework import dtypes +from tensorflow.contrib.distributions.python.ops import distribution_util as distribution_utils from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -239,7 +239,9 @@ class MixtureSameFamily(distribution.Distribution): depth=self._num_components, # == k on_value=np.ones([], dtype=npdt), off_value=np.zeros([], dtype=npdt)) # [n, B, k] - mask = self._pad_mix_dims(mask) # [n, B, k, [1]*e] + mask = distribution_utils.pad_mixture_dimensions( + mask, self, self.mixture_distribution, + self._event_shape().ndims) # [n, B, k, [1]*e] return math_ops.reduce_sum( x * mask, axis=-1 - self._event_ndims) # [n, B, E] @@ -254,8 +256,9 @@ class MixtureSameFamily(distribution.Distribution): def _mean(self): with ops.control_dependencies(self._runtime_assertions): - probs = self._pad_mix_dims( - self.mixture_distribution.probs) # [B, k, [1]*e] + probs = distribution_utils.pad_mixture_dimensions( + self.mixture_distribution.probs, self, self.mixture_distribution, + self._event_shape().ndims) # [B, k, [1]*e] return math_ops.reduce_sum( probs * self.components_distribution.mean(), axis=-1 - self._event_ndims) # [B, E] @@ -271,8 +274,9 @@ class MixtureSameFamily(distribution.Distribution): def _variance(self): with ops.control_dependencies(self._runtime_assertions): # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X]) - probs = self._pad_mix_dims( - self.mixture_distribution.probs) # [B, k, [1]*e] + probs = distribution_utils.pad_mixture_dimensions( + self.mixture_distribution.probs, self, self.mixture_distribution, + self._event_shape().ndims) # [B, k, [1]*e] mean_cond_var = math_ops.reduce_sum( probs * self.components_distribution.variance(), axis=-1 - self._event_ndims) # [B, E] @@ -291,8 +295,12 @@ class MixtureSameFamily(distribution.Distribution): with ops.control_dependencies(self._runtime_assertions): # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X]) - probs = self._pad_mix_dims(self._pad_mix_dims( - self.mixture_distribution.probs)) # [B, k, 1, 1] + probs = distribution_utils.pad_mixture_dimensions( + distribution_utils.pad_mixture_dimensions( + self.mixture_distribution.probs, self, 
self.mixture_distribution, + self._event_shape().ndims), + self, self.mixture_distribution, + self._event_shape().ndims) # [B, k, 1, 1] mean_cond_var = math_ops.reduce_sum( probs * self.components_distribution.covariance(), axis=-3) # [B, e, e] @@ -312,27 +320,6 @@ class MixtureSameFamily(distribution.Distribution): shape[:d], [1], shape[d:]], axis=0)) return x - def _pad_mix_dims(self, x): - with ops.name_scope("pad_mix_dims", values=[x]): - def _get_ndims(d): - if d.batch_shape.ndims is not None: - return d.batch_shape.ndims - return array_ops.shape(d.batch_shape_tensor())[0] - dist_batch_ndims = _get_ndims(self) - cat_batch_ndims = _get_ndims(self.mixture_distribution) - pad_ndims = array_ops.where( - self.mixture_distribution.is_scalar_batch(), - dist_batch_ndims, - dist_batch_ndims - cat_batch_ndims) - s = array_ops.shape(x) - x = array_ops.reshape(x, shape=array_ops.concat([ - s[:-1], - array_ops.ones([pad_ndims], dtype=dtypes.int32), - s[-1:], - array_ops.ones([self._event_ndims], dtype=dtypes.int32), - ], axis=0)) - return x - def _outer_squared_difference(x, y): """Convenience function analogous to tf.squared_difference.""" diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md index 09242ee47dd..9d2ca07c3a2 100644 --- a/tensorflow/contrib/eager/README.md +++ b/tensorflow/contrib/eager/README.md @@ -41,28 +41,8 @@ support for distributed and multi-GPU training and CPU performance. ## Installation -Since eager execution is not yet part of a TensorFlow release, using it requires -either [building from source](https://www.tensorflow.org/install/install_sources) -or the latest nightly builds. The nightly builds are available as: - -- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and - -- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images. - -For example, to run the latest nightly docker image: - -```sh -# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker -nvidia-docker pull tensorflow/tensorflow:nightly-gpu -nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu - -# If you do not have a GPU, use the CPU-only image -docker pull tensorflow/tensorflow:nightly -docker run -it -p 8888:8888 tensorflow/tensorflow:nightly -``` - -And then visit http://localhost:8888 in your browser for a Jupyter notebook -environment. Try out the notebooks below. +Eager execution is included in TensorFlow versions 1.5 and above. 
+Installation instructions at https://www.tensorflow.org/install/ ## Documentation diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index 544a3eafc08..d177bfeab2d 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -112,7 +112,7 @@ class Iterator(object): remote_fn.add_to_graph(None) target = constant_op.constant("/device:CPU:0") with ops.device(self._device): - self._buffer_resource_handle = prefetching_ops.function_buffering_resource( + self._buffer_resource_handle = prefetching_ops.function_buffering_resource( # pylint: disable=line-too-long string_arg=iter_string_handle, f=remote_fn, target_device=target, @@ -120,8 +120,9 @@ class Iterator(object): thread_pool_size=1, container="", shared_name=_generate_shared_name("function_buffer_resource")) - self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( - handle=self._buffer_resource_handle, handle_device=self._device) + self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( # pylint: disable=line-too-long + handle=self._buffer_resource_handle, + handle_device=self._device) def __iter__(self): return self diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py index 2a7be95811f..ed7dbc89046 100644 --- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py +++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py @@ -95,8 +95,7 @@ class MNISTModel(tfe.Network): x = self.max_pool2d(x) x = tf.layers.flatten(x) x = self.fc1(x) - if training: - x = self.dropout(x) + x = self.dropout(x, training=training) x = self.fc2(x) return x diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py index 76e06269b6b..1f7beee6852 100644 --- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py +++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py @@ -22,6 +22,7 @@ import gc import tempfile import time +from six.moves import xrange import tensorflow as tf import tensorflow.contrib.eager as tfe diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py index 84e25cf81a2..19b0104c807 100644 --- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py +++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py @@ -26,6 +26,7 @@ import tempfile import time import numpy as np +from six.moves import xrange import tensorflow as tf # pylint: disable=g-bad-import-order diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md index 7eea93ce1f5..ffc1d0332ea 100644 --- a/tensorflow/contrib/eager/python/g3doc/guide.md +++ b/tensorflow/contrib/eager/python/g3doc/guide.md @@ -19,29 +19,34 @@ to models defined without using eager execution. ## Installation -Eager execution is **not** included in the latest release (version 1.4) of -TensorFlow. To use it, you will need to [build TensorFlow from -source](https://www.tensorflow.org/install/install_sources) or install the -nightly builds. +Eager execution is included in TensorFlow versions 1.5 and above. +Installation instructions at https://www.tensorflow.org/install/ -For example, the nightly builds can be installed using `pip`: +The contents of this guide are compatible with TensorFlow 1.5. 
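For readers checking such an install, the following is a minimal sketch of enabling eager execution and running one op immediately; it relies only on the `tfe.enable_eager_execution()` entry point documented in `tensorflow/contrib/eager/python/tfe.py`, and the matrix values are arbitrary.

```python
# Minimal sketch: verify eager execution in a TensorFlow 1.5+ install.
# enable_eager_execution() must be called once, at program startup,
# before any other TensorFlow operations are created.
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tfe.enable_eager_execution()

x = tf.constant([[2.0, 0.0], [0.0, 2.0]])
print(tf.matmul(x, x))  # Runs immediately and prints the concrete result.
```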
+However, if you run into bugs that are fixed in source but not the
+release, you may want to either [build from
+source](https://www.tensorflow.org/install/install_sources)
+or try the latest nightly builds. The nightly builds are available as:
 
-- `pip install tf-nightly` (for CPU-only TensorFlow)
-- `pip install tf-nightly-gpu` (for GPU-enabled TensorFlow)
+- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
 
-Or using `docker`, with [Jupyter Notebook](http://jupyter.org/) support:
+- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
+
+For example, to run the latest nightly docker image:
 
 ```sh
-# For CPU-only TensorFlow
+# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
+docker pull tensorflow/tensorflow:nightly-gpu
+docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
+
+# If you do not have a GPU, use the CPU-only image
 docker pull tensorflow/tensorflow:nightly
 docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-
-# For GPU-enabled TensorFlow:
-# (Requires https://github.com/NVIDIA/nvidia-docker)
-nvidia-docker pull tensorflow/tensorflow:nightly-gpu
-nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
 ```
 
+And then visit http://localhost:8888 in your browser for a Jupyter notebook
+environment.
+
 ## Getting Started
 
 With TensorFlow installed, eager execution is enabled via a single call:
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index bf029ca5f9d..ea8dbf2b46e 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -291,6 +291,9 @@ class Mean(Metric):
     Args:
       values: Tensor with the per-example value.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
     """
     if weights is None:
       self.denom.assign_add(
@@ -302,6 +305,9 @@ class Mean(Metric):
       self.denom.assign_add(math_ops.reduce_sum(weights))
       values = math_ops.cast(values, self.dtype) * weights
       self.numer.assign_add(math_ops.reduce_sum(values))
+    if weights is None:
+      return values
+    return values, weights
 
   def result(self):
     t = self.numer / self.denom
@@ -329,7 +335,13 @@ class Accuracy(Mean):
         per element of the Tensor.
       predictions: Tensor with the predicted label for each example.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
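To show what the new return value enables, here is a rough sketch of chaining one metric's update into another, mirroring the `testMetricsChain` case added below; the module path is taken from the files touched in this hunk, and the expected values in the comments are illustrative.

```python
# Sketch: because call() now returns its arguments, one metric's update can
# feed directly into another computation without re-plumbing the input.
import tensorflow.contrib.eager as tfe
from tensorflow.contrib.eager.python import metrics

tfe.enable_eager_execution()

batch_loss = metrics.Mean()
epoch_loss = metrics.Mean()

for loss in [0.3, 0.1]:
  # batch_loss(loss) updates the metric and returns `loss`, so the same value
  # can be passed straight through to a second metric.
  epoch_loss(batch_loss(loss))

print(batch_loss.result())  # expected ~0.2
print(epoch_loss.result())  # expected ~0.2
```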
""" matches = math_ops.equal(labels, predictions) matches = math_ops.cast(matches, dtypes.float64) super(Accuracy, self).call(matches, weights=weights) + if weights is None: + return labels, predictions + return labels, predictions, weights diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py index 9cf34fd9b2d..a9ecaa3f8bc 100644 --- a/tensorflow/contrib/eager/python/metrics_test.py +++ b/tensorflow/contrib/eager/python/metrics_test.py @@ -180,6 +180,19 @@ class MetricsTest(test.TestCase): m2 = metrics.Mean() m2(2) + def testMetricsChain(self): + with context.graph_mode(), self.test_session(): + m1 = metrics.Mean() + m2 = metrics.Mean(name="m2") + update_m2 = m2(3.0) + update_m2_2 = m2(m1(1.0)) + m1.init_variables().run() + m2.init_variables().run() + update_m2.eval() + update_m2_2.eval() + self.assertAllEqual(m2.result().eval(), 2.0) + self.assertAllEqual(m1.result().eval(), 1.0) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py index abc7e3690c7..1a7f7b85e68 100644 --- a/tensorflow/contrib/eager/python/saver_test.py +++ b/tensorflow/contrib/eager/python/saver_test.py @@ -73,16 +73,6 @@ class SaverTest(test.TestCase): with self.assertRaisesRegexp(ValueError, 'v1'): saver.save(ckpt_prefix) - def testDifferentGraphError(self): - with ops.device(self._dev()): - with ops.Graph().as_default(): - v1 = resource_variable_ops.ResourceVariable(1.0, name='v1') - with ops.Graph().as_default(): - saver = _saver.Saver([v1]) - ckpt_prefix = os.path.join(test.get_temp_dir(), 'ckpt') - with self.assertRaisesRegexp(ValueError, 'Graph'): - saver.save(ckpt_prefix) - def testSameObjectOK(self): with ops.device(self._dev()): v1 = resource_variable_ops.ResourceVariable(1.0, name='v1') diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py index 712d1cb94d2..d32bebf90c1 100644 --- a/tensorflow/contrib/eager/python/tfe.py +++ b/tensorflow/contrib/eager/python/tfe.py @@ -59,7 +59,6 @@ To use, at program startup, call `tfe.enable_eager_execution()`. @@in_eager_mode @@in_graph_mode -@@IsolateTest @@run_test_in_graph_and_eager_modes @@DEVICE_PLACEMENT_EXPLICIT @@ -101,7 +100,6 @@ from tensorflow.python.eager.execution_callbacks import nan_callback from tensorflow.python.eager.execution_callbacks import seterr from tensorflow.python.framework.ops import enable_eager_execution from tensorflow.python.framework.ops import eager_run as run -from tensorflow.python.framework.test_util import IsolateTest from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable from tensorflow.python.ops.variable_scope import EagerVariableStore diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py index 0dedb2fd7c0..b6659c2a179 100644 --- a/tensorflow/contrib/eager/python/tfe_test.py +++ b/tensorflow/contrib/eager/python/tfe_test.py @@ -102,10 +102,6 @@ class TFETest(test_util.TensorFlowTestCase): # Expect at least one device. 
self.assertTrue(tfe.list_devices()) - def testNumGPUs(self): - devices = tfe.list_devices() - self.assertEqual(len(devices) - 1, tfe.num_gpus()) - def testAddCheckNumericsOpsRaisesError(self): with self.assertRaisesRegexp( RuntimeError, diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py index caa9dd83233..c9153c93527 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py @@ -457,6 +457,13 @@ def _get_local_devices(device_type): def _split_batch(features, labels, number_of_shards, device): """Split input features and labes into batches.""" + def ensure_divisible_by_shards(sequence): + batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0] + if batch_size % number_of_shards != 0: + raise ValueError( + 'Batch size {} needs to be divisible by the number of GPUs, which ' + 'is {}.'.format(batch_size, number_of_shards)) + def split_dictionary(dictionary): """Split a dictionary into shards.""" shards = [{} for _ in range(number_of_shards)] @@ -467,6 +474,7 @@ def _split_batch(features, labels, number_of_shards, device): sp_input=tensor, num_split=number_of_shards, axis=0)): shards[i][name] = shard else: + ensure_divisible_by_shards(tensor) for i, shard in enumerate(array_ops.split(tensor, number_of_shards)): shards[i][name] = shard return shards @@ -476,6 +484,7 @@ def _split_batch(features, labels, number_of_shards, device): if isinstance(features, dict): feature_shards = split_dictionary(features) else: + ensure_divisible_by_shards(features) feature_shards = array_ops.split(features, number_of_shards) if labels is None: @@ -483,6 +492,7 @@ def _split_batch(features, labels, number_of_shards, device): elif isinstance(labels, dict): label_shards = split_dictionary(labels) else: + ensure_divisible_by_shards(labels) label_shards = array_ops.split(labels, number_of_shards) return feature_shards, label_shards diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py index 03d31226af6..6936f8a1312 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py @@ -37,6 +37,7 @@ from tensorflow.python.feature_column import feature_column from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops as ops_lib +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -433,6 +434,17 @@ class ReplicateModelTest(test_util.TensorFlowTestCase): 'probabilities': np.array([[0.1], [0.02]]) }, session.run(estimator_spec.predictions)) + def test_batch_size_that_is_not_divisible_by_the_number_of_gpus(self): + features = np.array([[1.0], [2.0], [3.0]]) + labels = np.array([[1.0], [2.0], [3.0]]) + + with self.assertRaisesRegexp( + ValueError, '.*Batch.+size.+needs.+to.+be.+divisible.+by.+GPUs.+'): + replicated_model_fn = replicate_model_fn.replicate_model_fn( + self.model_fn, devices=['/gpu:0', '/gpu:1']) + _ = replicated_model_fn( + features, labels, model_fn_lib.ModeKeys.TRAIN, self.params) + def test_unsupported_loss_reduction(self): with self.assertRaisesRegexp(ValueError, 
'.+none.+reduction.+is.+specified.+'): @@ -981,8 +993,13 @@ class SplitBatchTest(test_util.TensorFlowTestCase): return list(map(evaluate_items, first_list)), list( map(evaluate_items, second_list)) + def assertSparseValuesEqual(self, a, b): + self.assertAllEqual(a.indices, b.indices) + self.assertAllEqual(a.values, b.values) + self.assertAllEqual(a.dense_shape, b.dense_shape) + def test_simple_half_split(self): - with self.test_session() as session: # pylint: disable=unused-variable + with self.test_session(): features = [0.0, 1.0, 2.0, 3.0] labels = [10.0, 11.0, 12.0, 13.0] feature_shards, label_shards = replicate_model_fn._split_batch( @@ -995,7 +1012,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase): self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards) def test_to_each_their_own(self): - with self.test_session() as session: # pylint: disable=unused-variable + with self.test_session(): features = [0.0, 1.0, 2.0, 3.0] labels = [10.0, 11.0, 12.0, 13.0] feature_shards, label_shards = replicate_model_fn._split_batch( @@ -1008,7 +1025,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase): self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards) def test_one_batch(self): - with self.test_session() as session: # pylint: disable=unused-variable + with self.test_session(): features = [0.0, 1.0, 2.0, 3.0] labels = [10.0, 11.0, 12.0, 13.0] feature_shards, label_shards = replicate_model_fn._split_batch( @@ -1021,7 +1038,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase): self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards) def test_half_split_in_dictionary(self): - with self.test_session() as session: # pylint: disable=unused-variable + with self.test_session(): features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]} labels = [10.0, 11.0, 12.0, 13.0] @@ -1035,6 +1052,60 @@ class SplitBatchTest(test_util.TensorFlowTestCase): self.assertAllEqual([10.0, 11.0], label_shards[0].eval()) self.assertAllEqual([12.0, 13.0], label_shards[1].eval()) + def test_sparse_tensor_can_be_split_unevenly(self): + with self.test_session(): + features = { + 'x': + sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2], [2, 2]], + values=[1.0, 2.0, 3.0], + dense_shape=[3, 4]) + } + labels = np.array([[1.0], [2.0]]) + + feature_shards, label_shards = replicate_model_fn._split_batch( + features, labels, 2, device='/gpu:0') + + self.assertSparseValuesEqual( + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 2]], values=[1., 2.], dense_shape=[2, 4]), + feature_shards[0]['x'].eval()) + self.assertSparseValuesEqual( + sparse_tensor.SparseTensorValue( + indices=[[0, 2]], values=[3.], dense_shape=[1, 4]), + feature_shards[1]['x'].eval()) + self.assertAllEqual([[1.0]], label_shards[0].eval()) + self.assertAllEqual([[2.0]], label_shards[1].eval()) + + def test_sparse_tensor_can_be_split_unevenly_repeated_row(self): + with self.test_session(): + features = { + 'x': + sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 0], [1, 1]], + values=[1.0, 2.0, 3.0], + dense_shape=[3, 4]) + } + labels = np.array([[1.0], [2.0]]) + + feature_shards, label_shards = replicate_model_fn._split_batch( + features, labels, 2, device='/gpu:0') + + print(feature_shards[0]['x'].eval()) + print(feature_shards[1]['x'].eval()) + self.assertSparseValuesEqual( + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 0], [1, 1]], + values=[1., 2., 3.], + dense_shape=[2, 4]), feature_shards[0]['x'].eval()) + + second_batch = feature_shards[1]['x'].eval() + 
self.assertFalse(len(second_batch.indices)) + self.assertFalse(len(second_batch.values)) + self.assertAllEqual([1, 4], second_batch.dense_shape) + self.assertAllEqual([[1.0]], label_shards[0].eval()) + self.assertAllEqual([[2.0]], label_shards[1].eval()) + def test_one_batch_in_dictionary(self): with self.test_session() as session: # pylint: disable=unused-variable features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]} diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD index fe86a20ab1f..180f1b68f3b 100644 --- a/tensorflow/contrib/factorization/BUILD +++ b/tensorflow/contrib/factorization/BUILD @@ -221,6 +221,7 @@ py_test( name = "kmeans_test", size = "medium", srcs = ["python/ops/kmeans_test.py"], + shard_count = 4, srcs_version = "PY2AND3", tags = ["notsan"], # b/67512932 deps = [ diff --git a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc index 31d08bfb65e..a8c5d0763c2 100644 --- a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc +++ b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc @@ -57,11 +57,11 @@ typedef Eigen::Map< class MaskedMatmulOp : public OpKernel { public: - explicit MaskedMatmulOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->MatchSignature( - {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL}, - {DT_FLOAT})); + explicit MaskedMatmulOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK( + context, + context->MatchSignature( + {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL}, {DT_FLOAT})); } void Compute(OpKernelContext* context) override { @@ -110,12 +110,11 @@ class MaskedMatmulOp : public OpKernel { num_nonzero_elements, 2); Tensor* prod_values_tensor; - OP_REQUIRES_OK(context, - context->allocate_output( - 0, TensorShape({num_nonzero_elements}), - &prod_values_tensor)); - EigenMatFloatMap prod_values(prod_values_tensor->vec().data(), - 1, num_nonzero_elements); + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({num_nonzero_elements}), + &prod_values_tensor)); + EigenMatFloatMap prod_values(prod_values_tensor->vec().data(), 1, + num_nonzero_elements); auto get_a_index = [&indices_mat, &a_dim_0](int64 i) { int64 a_index = internal::SubtleMustCopy(indices_mat(i, 0)); @@ -182,8 +181,8 @@ class MaskedMatmulOp : public OpKernel { } }; // Shard the work. - worker_threads.workers->ParallelFor( - num_nonzero_elements, cost_per_unit, work); + worker_threads.workers->ParallelFor(num_nonzero_elements, cost_per_unit, + work); } }; REGISTER_KERNEL_BUILDER(Name("MaskedMatmul").Device(DEVICE_CPU), diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py index 4d0f9b24240..c861cfff544 100644 --- a/tensorflow/contrib/factorization/python/ops/kmeans.py +++ b/tensorflow/contrib/factorization/python/ops/kmeans.py @@ -143,7 +143,7 @@ class _ModelFn(object): def model_fn(self, features, mode, config): """Model function for the estimator. - Note that this does not take a `1abels` arg. This works, but `input_fn` must + Note that this does not take a `labels` arg. This works, but `input_fn` must return either `features` or, equivalently, `(features, None)`. 
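A small sketch of an `input_fn` that satisfies this contract follows; the `KMeansClustering` estimator and its `train()` call are assumed from the public `tf.contrib.factorization` API rather than shown in this hunk, and the random points are placeholder data.

```python
import numpy as np
import tensorflow as tf

def input_fn():
  # K-means is unsupervised, so there are no labels: return (features, None)
  # (returning just the features tensor would work equally well).
  points = np.random.rand(1000, 2).astype(np.float32)
  return tf.convert_to_tensor(points), None

kmeans = tf.contrib.factorization.KMeansClustering(num_clusters=5)
kmeans.train(input_fn, steps=10)
```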
Args: diff --git a/tensorflow/contrib/ffmpeg/decode_video_op.cc b/tensorflow/contrib/ffmpeg/decode_video_op.cc index d44032968d5..6f8ad486d10 100644 --- a/tensorflow/contrib/ffmpeg/decode_video_op.cc +++ b/tensorflow/contrib/ffmpeg/decode_video_op.cc @@ -102,16 +102,12 @@ REGISTER_OP("DecodeVideo") return Status::OK(); }) .Doc(R"doc( -Processes the contents of an audio file into a tensor using FFmpeg to decode +Processes the contents of an video file into a tensor using FFmpeg to decode the file. -One row of the tensor is created for each channel in the audio file. Each -channel contains audio samples starting at the beginning of the audio and -having `1/samples_per_second` time between them. If the `channel_count` is -different from the contents of the file, channels will be merged or created. - -contents: The binary audio file contents, as a string or rank-0 string - tensor. +contents: The binary contents of the video file to decode. This is a + scalar. +output: A rank-4 `Tensor` that has `[frames, height, width, 3]` RGB as output. )doc"); } // namespace ffmpeg diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc index c85b1837ab5..e61221a6b0d 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc @@ -47,20 +47,19 @@ std::vector FfmpegAudioCommandLine(const string& input_filename, int32 channel_count, const string& stream) { std::vector command({ - "-nostats", // No additional progress display. - "-nostdin", // No interactive commands accepted. - "-f", input_format_id, // eg: "mp3" - "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename, - "-loglevel", "error", // Print errors only. - "-hide_banner", // Skip printing build options, version, etc. - "-map_metadata", "-1", // Copy global metadata from input to output. - "-vn", // No video recording. - "-ac:a:0", StrCat(channel_count), "-ar:a:0", - StrCat(samples_per_second), - // Output set (in several ways) to signed 16-bit little-endian ints. - "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le", - "-sn", // No subtitle recording. - "-y" // Overwrite output file. + "-nostats", // No additional progress display. + "-nostdin", // No interactive commands accepted. + "-f", input_format_id, // eg: "mp3" + "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename, + "-loglevel", "error", // Print errors only. + "-hide_banner", // Skip printing build options, version, etc. + "-map_metadata", "-1", // Copy global metadata from input to output. + "-vn", // No video recording. + "-ac:a:0", StrCat(channel_count), "-ar:a:0", StrCat(samples_per_second), + // Output set (in several ways) to signed 16-bit little-endian ints. + "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le", + "-sn", // No subtitle recording. + "-y" // Overwrite output file. }); if (!stream.empty()) { command.emplace_back("-map"); @@ -75,21 +74,13 @@ std::vector FfmpegVideoCommandLine(const string& input_filename, const string& output_filename) { return {"-nostats", // No additional progress display. "-nostdin", // No interactive commands accepted. - "-i", - input_filename, - "-f", - "image2pipe", - "-probesize", - StrCat(kDefaultProbeSize), - "-loglevel", + "-i", input_filename, "-f", "image2pipe", "-probesize", + StrCat(kDefaultProbeSize), "-loglevel", // Info is needed to get the information about stream, etc. // It is generated to a separate file, not stdout/stderr. 
"info", "-hide_banner", // Skip printing build options, version, etc. - "-vcodec", - "rawvideo", - "-pix_fmt", - "rgb24", + "-vcodec", "rawvideo", "-pix_fmt", "rgb24", "-y", // Overwrite output file. StrCat(output_filename)}; } diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc index 85b61b26163..05728b3d375 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc @@ -32,10 +32,8 @@ namespace tensorflow { namespace ffmpeg { namespace { -const char kTestWavFilename[] = - "contrib/ffmpeg/testdata/mono_10khz.wav"; -const char kTestMp3Filename[] = - "contrib/ffmpeg/testdata/test_sound1.mp3"; +const char kTestWavFilename[] = "contrib/ffmpeg/testdata/mono_10khz.wav"; +const char kTestMp3Filename[] = "contrib/ffmpeg/testdata/test_sound1.mp3"; // Set to true via a command line flag iff the test is expected to have FFmpeg // installed. @@ -139,7 +137,7 @@ TEST(FfmpegLibTest, TestRoundTripWav) { } // namespace ffmpeg } // namespace tensorflow -int main(int argc, char **argv) { +int main(int argc, char** argv) { tensorflow::string usage = tensorflow::ffmpeg::ParseTestFlags(&argc, argv); testing::InitGoogleTest(&argc, argv); if (argc != 1) { diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc index 36fc71794b0..d6c885a3242 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc @@ -20,8 +20,6 @@ #include #include - -#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 673c5178422..503b868aaa6 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -53,6 +53,7 @@ See the @{$python/contrib.framework} guide. @@assign_from_values_fn @@create_global_step @@filter_variables +@@fuse_op @@get_global_step @@get_or_create_global_step @@get_local_variables diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc index 6677dca752f..5bf6b675295 100644 --- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc +++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc @@ -21,8 +21,8 @@ limitations under the License. 
#include "tensorflow/contrib/framework/kernels/zero_initializer_op.h" -#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" namespace tensorflow { @@ -81,8 +81,8 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); #define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #undef REGISTER_KERNELS -} // namespace tensorflow +} // namespace tensorflow diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.h b/tensorflow/contrib/framework/kernels/zero_initializer_op.h index 14c9268efa8..99389a5ab6a 100644 --- a/tensorflow/contrib/framework/kernels/zero_initializer_op.h +++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.h @@ -29,5 +29,5 @@ struct TensorSetZero { }; } // namespace functor -} // end namespace tensorflow -#endif // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_ +} // end namespace tensorflow +#endif // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_ diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc index 1ee8e1498cf..706134ba9a5 100644 --- a/tensorflow/contrib/framework/ops/variable_ops.cc +++ b/tensorflow/contrib/framework/ops/variable_ops.cc @@ -26,8 +26,8 @@ REGISTER_OP("ZeroInitializer") .Attr("T: realnumbertype") .SetAllowsUninitializedInput() .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); + c->set_output(0, c->input(0)); + return Status::OK(); }) .Doc(R"doc( Initialize 'ref' with all zeros. This op requires that the tensor is not diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py index b5e9f8df792..6f65fe771eb 100644 --- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py +++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py @@ -31,7 +31,6 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import googletest - class AccumulateNV2Test(test_util.TensorFlowTestCase): """Tests of the new, differentiable version of accumulate_n""" @@ -62,8 +61,9 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase): accum_n = av2.accumulate_n_v2(input_vars) sess.run(variables.global_variables_initializer()) accum_n_grad = gradients.gradients(accum_n, input_vars) - self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1 - [g.eval() for g in accum_n_grad]) + self.assertAllEqual( + np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1 + [g.eval() for g in accum_n_grad]) # The tests below used to be in a separate class under cwise_ops_test.py, # which did not run in the default test target. 
@@ -75,8 +75,8 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase): np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20) ] random_tensors = [ - ops.convert_to_tensor( - x, dtype=dtypes_lib.float32) for x in random_arrays + ops.convert_to_tensor(x, dtype=dtypes_lib.float32) + for x in random_arrays ] tf_val = av2.accumulate_n_v2(random_tensors) np_val = random_arrays[0] @@ -95,21 +95,21 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase): with self.assertRaises(ValueError): a = variables.Variable(0.2) b = variables.Variable(0.1) - tf_val = av2.accumulate_n_v2([a,b], shape=[2,2]) # Should be shape=[] + tf_val = av2.accumulate_n_v2([a, b], shape=[2, 2]) # Should be shape=[] def testIncompatibleShapes(self): with self.test_session(): with self.assertRaises(ValueError): - a = variables.Variable(np.array([0.1,0.2])) - b = variables.Variable(np.array([[0.3],[0.4]])) - tf_val = av2.accumulate_n_v2([a,b]) + a = variables.Variable(np.array([0.1, 0.2])) + b = variables.Variable(np.array([[0.3], [0.4]])) + tf_val = av2.accumulate_n_v2([a, b]) def testWrongType(self): with self.test_session(): with self.assertRaises(TypeError): a = variables.Variable(0.2, dtype=np.float32) b = variables.Variable(0.1, dtype=np.float32) - tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32) + tf_val = av2.accumulate_n_v2([a, b], tensor_dtype=np.int32) def testWrongTypeOneInput(self): # Scenario that used to trigger a bug, even when testWrongType() worked diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py index 2bce00fde24..409657fe1da 100644 --- a/tensorflow/contrib/framework/python/ops/arg_scope.py +++ b/tensorflow/contrib/framework/python/ops/arg_scope.py @@ -53,7 +53,8 @@ net = layers.conv2d(net, 256, [5, 5], scope='conv2') ``` - Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later: + Example of how to use tf.contrib.framework.add_arg_scope to enable your + function to be called within an arg_scope later: @tf.contrib.framework.add_arg_scope def conv2d(*args, **kwargs) @@ -65,11 +66,10 @@ from __future__ import print_function from tensorflow.python.util import tf_contextlib from tensorflow.python.util import tf_decorator -__all__ = ['arg_scope', - 'add_arg_scope', - 'current_arg_scope', - 'has_arg_scope', - 'arg_scoped_arguments'] +__all__ = [ + 'arg_scope', 'add_arg_scope', 'current_arg_scope', 'has_arg_scope', + 'arg_scoped_arguments' +] _ARGSTACK = [{}] @@ -172,6 +172,7 @@ def add_arg_scope(func): Returns: A tuple with the decorated function func_with_args(). 
""" + def func_with_args(*args, **kwargs): current_scope = current_arg_scope() current_args = kwargs @@ -180,6 +181,7 @@ def add_arg_scope(func): current_args = current_scope[key_func].copy() current_args.update(kwargs) return func(*args, **current_args) + _add_op(func) setattr(func_with_args, '_key_op', _key_op(func)) return tf_decorator.make_decorator(func, func_with_args) diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py index a65d4bc50ff..96cdd8b1ca4 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py @@ -116,7 +116,7 @@ def build_fused_conv_bias_relu_graph(device, input_shape, filter_shape, strides, for _ in range(1, num_iters): with ops.control_dependencies([fused_out]): # pylint: disable=g-line-too-long - fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation( + fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation( # pylint: disable=line-too-long inp, filt, bias, @@ -166,10 +166,10 @@ class FusedConv2DBiasActivationBenchmark(test.Benchmark): duration = (time.time() - start_time) / num_iters print("%s inputshape:%s filtershape:%s strides:%s padding:%s " - "%d iters: %.8f sec" % - (device, str(input_shape).replace(" ", ""), - str(filter_shape).replace(" ", ""), - str(strides).replace(" ", ""), padding, num_iters, duration)) + "%d iters: %.8f sec" % (device, str(input_shape).replace(" ", ""), + str(filter_shape).replace(" ", ""), + str(strides).replace(" ", ""), padding, + num_iters, duration)) name_template = ( "conv2d_{device}_input_shape_{inputshape}_filter_shape_{filtershape}_" "strides_{strides}_padding_{padding}") diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py index 0d51c282a89..082c42eba18 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py @@ -59,7 +59,11 @@ _summary_type_map = { class GANEstimator(estimator.Estimator): """An estimator for Generative Adversarial Networks (GANs). - This Estimator is backed by TFGAN. + This Estimator is backed by TFGAN. The network functions follow the TFGAN API + except for one exception: if either `generator_fn` or `discriminator_fn` have + an argument called `mode`, then the tf.Estimator mode is passed in for that + argument. This helps with operations like batch normalization, which have + different train and evaluation behavior. Example: @@ -233,9 +237,11 @@ def _gan_model_fn( def _make_gan_model(generator_fn, discriminator_fn, real_data, generator_inputs, generator_scope, add_summaries, mode): """Make a `GANModel`, and optionally pass in `mode`.""" - # If `generator_fn` has an argument `mode`, pass mode to it. + # If network functions have an argument `mode`, pass mode to it. 
if 'mode' in inspect.getargspec(generator_fn).args: generator_fn = functools.partial(generator_fn, mode=mode) + if 'mode' in inspect.getargspec(discriminator_fn).args: + discriminator_fn = functools.partial(discriminator_fn, mode=mode) gan_model = tfgan_train.gan_model( generator_fn, discriminator_fn, diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py index e752f0bcccd..387a62bd741 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py @@ -54,7 +54,8 @@ def generator_fn(noise_dict, mode): return layers.fully_connected(noise, noise.shape[1].value) -def discriminator_fn(data, _): +def discriminator_fn(data, unused_conditioning, mode): + del unused_conditioning, mode return layers.fully_connected(data, 1) @@ -99,7 +100,6 @@ def mock_head(testcase, expected_generator_inputs, expected_real_data, else: testcase.assertEqual(discriminator_scope_name, gan_model.discriminator_scope.name) - testcase.assertEqual(_or_none(discriminator_fn), gan_model.discriminator_fn) with ops.control_dependencies(assertions): if mode == model_fn_lib.ModeKeys.TRAIN: diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py index 986a5ff6dcb..d9b07e62f89 100644 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py +++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py @@ -28,6 +28,7 @@ from __future__ import division from __future__ import print_function import functools +import os import sys import tarfile @@ -189,20 +190,31 @@ def get_graph_def_from_resource(filename): return graph_pb2.GraphDef.FromString(resource_loader.load_resource(filename)) -def get_graph_def_from_url_tarball(url, filename): - """Get a GraphDef proto from a tarball on the web.""" - def _progress(count, block_size, total_size): - sys.stdout.write('\r>> Downloading %s %.1f%%' % ( - url, float(count * block_size) / float(total_size) * 100.0)) - sys.stdout.flush() - tar_filename, _ = urllib.request.urlretrieve(url, reporthook=_progress) +def get_graph_def_from_url_tarball(url, filename, tar_filename=None): + """Get a GraphDef proto from a tarball on the web. + + Args: + url: Web address of tarball + filename: Filename of graph definition within tarball + tar_filename: Temporary download filename (None = always download) + + Returns: + A GraphDef loaded from a file in the downloaded tarball. 
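A hedged usage sketch of the new `tar_filename` argument: the import path mirrors the file being patched, the constants are the module-level Inception names used by `_default_graph_def_fn`, and network access is assumed on the first run only.

```python
import os

from tensorflow.contrib.gan.python.eval.python import classifier_metrics_impl as cm

# Reuse a previously downloaded tarball if it is already on disk; this mirrors
# what the module's default graph_def_fn now does.
graph_def = cm.get_graph_def_from_url_tarball(
    cm.INCEPTION_URL,
    cm.INCEPTION_FROZEN_GRAPH,
    tar_filename=os.path.basename(cm.INCEPTION_URL))
```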
+ """ + if not (tar_filename and os.path.exists(tar_filename)): + def _progress(count, block_size, total_size): + sys.stdout.write('\r>> Downloading %s %.1f%%' % ( + url, float(count * block_size) / float(total_size) * 100.0)) + sys.stdout.flush() + tar_filename, _ = urllib.request.urlretrieve(url, tar_filename, _progress) with tarfile.open(tar_filename, 'r:gz') as tar: proto_str = tar.extractfile(filename).read() return graph_pb2.GraphDef.FromString(proto_str) def _default_graph_def_fn(): - return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH) + return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH, + os.path.basename(INCEPTION_URL)) def run_inception(images, diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py index b960af28eaa..871f1ad54e2 100644 --- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py +++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py @@ -84,11 +84,11 @@ class ClassifierMetricsTest(test.TestCase): self.assertAllClose( np.array([0.014, 0.014], 'f'), np.array([x[0] for x in wscores], 'f'), - rtol=0.1) + rtol=0.15) self.assertAllClose( np.array([0.014, 0.020], 'f'), np.array([x[1] for x in wscores], 'f'), - rtol=0.1) + rtol=0.15) def test_sliced_wasserstein_distance_svd(self): """Test the distance.""" diff --git a/tensorflow/contrib/gdr/README.md b/tensorflow/contrib/gdr/README.md index 34ce60b3608..8242d93f129 100644 --- a/tensorflow/contrib/gdr/README.md +++ b/tensorflow/contrib/gdr/README.md @@ -119,4 +119,4 @@ In the original design (as in the reference), tensor buffers are only registered Reference === -Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. https://doi.org/10.1145/3123878.3123907 +Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. 
https://doi.org/10.1145/3123878.3131975 diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc index 5c7ac744289..81e70ae30a4 100644 --- a/tensorflow/contrib/gdr/gdr_memory_manager.cc +++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc @@ -86,8 +86,9 @@ int TryToReadNumaNode(ibv_device* device) { if (strings::safe_strto32(content, &value)) { if (value < 0) { LOG(INFO) << "Successful NUMA node read from SysFS had negative value (" - << value << "), but there must be at least one NUMA node" - ", so returning NUMA node zero"; + << value + << "), but there must be at least one NUMA node" + ", so returning NUMA node zero"; return 0; } LOG(INFO) << "NUMA node for device: " << device->name << " is " << value; @@ -290,8 +291,8 @@ Status GdrMemoryManager::Init() { // Host memory allocators for (Allocator* allocator : allocators) { auto* visitable_allocator = dynamic_cast(allocator); - CHECK(visitable_allocator) << "is not visitable for instrumentation" - << allocator->Name(); + CHECK(visitable_allocator) + << "is not visitable for instrumentation" << allocator->Name(); // Make sure we don't instrument the same allocator twice if (instrumented_.find(allocator) == std::end(instrumented_)) { visitable_allocator->AddAllocVisitor(alloc_visitor); @@ -635,8 +636,8 @@ void GdrMemoryManager::TensorFromTransportOptions( } else { checksum = GPUUtil::Checksum(*tensor); } - CHECK(checksum == remote_mr.checksum()) << "Checksum mismatch: " << checksum - << "!=" << remote_mr.checksum(); + CHECK(checksum == remote_mr.checksum()) + << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum(); #endif } done(Status::OK()); diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc index 6adf837ca0a..c2e32da133b 100644 --- a/tensorflow/contrib/image/kernels/image_ops.cc +++ b/tensorflow/contrib/image/kernels/image_ops.cc @@ -43,9 +43,9 @@ template struct FillProjectiveTransform; typedef Eigen::ThreadPoolDevice CPUDevice; using functor::FillProjectiveTransform; +using generator::Interpolation; using generator::INTERPOLATION_BILINEAR; using generator::INTERPOLATION_NEAREST; -using generator::Interpolation; using generator::ProjectiveGenerator; template @@ -72,11 +72,12 @@ class ImageProjectiveTransform : public OpKernel { const Tensor& transform_t = ctx->input(1); OP_REQUIRES(ctx, images_t.shape().dims() == 4, errors::InvalidArgument("Input images must have rank 4")); - OP_REQUIRES(ctx, (TensorShapeUtils::IsMatrix(transform_t.shape()) && - (transform_t.dim_size(0) == images_t.dim_size(0) || - transform_t.dim_size(0) == 1) && - transform_t.dim_size(1) == - ProjectiveGenerator::kNumParameters), + OP_REQUIRES(ctx, + (TensorShapeUtils::IsMatrix(transform_t.shape()) && + (transform_t.dim_size(0) == images_t.dim_size(0) || + transform_t.dim_size(0) == 1) && + transform_t.dim_size(1) == + ProjectiveGenerator::kNumParameters), errors::InvalidArgument( "Input transform should be num_images x 8 or 1 x 8")); auto images = images_t.tensor(); diff --git a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc index 9f0bf37aed3..8f9a5c28039 100755 --- a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc +++ b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc @@ -143,8 +143,8 @@ class SingleImageRandomDotStereogramsOp : public OpKernel { } data_box_left = deltaX_border_image / 2; // 
Center DATA in X dimension - data_box_width = data_Xwindow; // width of scan line - data_box_height = data_Ywindow; // hight of image + data_box_width = data_Xwindow; // width of scan line + data_box_height = data_Ywindow; // hight of image const T* inputZ = input_tensor.flat().data(); // Flatten input Z buffer diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc index 1f41f243f2e..8139d4272d6 100755 --- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc +++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc @@ -58,7 +58,9 @@ REGISTER_OP("SingleImageRandomDotStereograms") int colors; TF_RETURN_IF_ERROR(c->GetAttr("number_colors", &colors)); - c->set_output(0, c->MakeShape({y_dim, x_dim, colors > 256? c->MakeDim(3) : c->MakeDim(1)})); + c->set_output( + 0, c->MakeShape( + {y_dim, x_dim, colors > 256 ? c->MakeDim(3) : c->MakeDim(1)})); return Status::OK(); }) .Doc(R"doc( diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py index bb766e59d2c..d4a6a5bcbb5 100755 --- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py +++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py @@ -26,18 +26,20 @@ _sirds_ops = loader.load_op_library( resource_loader.get_path_to_datafile( "_single_image_random_dot_stereograms.so")) -def single_image_random_dot_stereograms( - depth_values, - hidden_surface_removal=None, - convergence_dots_size=None, - dots_per_inch=None, - eye_separation=None, mu=None, - normalize=None, normalize_max=None, - normalize_min=None, - border_level=None, - number_colors=None, - output_image_shape=None, - output_data_window=None): + +def single_image_random_dot_stereograms(depth_values, + hidden_surface_removal=None, + convergence_dots_size=None, + dots_per_inch=None, + eye_separation=None, + mu=None, + normalize=None, + normalize_max=None, + normalize_min=None, + border_level=None, + number_colors=None, + output_image_shape=None, + output_data_window=None): """Output a RandomDotStereogram Tensor for export via encode_PNG/JPG OP. Given the 2-D tensor 'depth_values' with encoded Z values, this operation @@ -45,7 +47,8 @@ def single_image_random_dot_stereograms( for the encode_PNG/JPG ops. Be careful with image compression as this may corrupt the encode 3-D data witin the image. - Based upon [this paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper). + Based upon [this + paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper). 
This outputs a SIRDS image as picture_out.png: @@ -113,7 +116,8 @@ def single_image_random_dot_stereograms( hidden_surface_removal=hidden_surface_removal, convergence_dots_size=convergence_dots_size, dots_per_inch=dots_per_inch, - eye_separation=eye_separation, mu=mu, + eye_separation=eye_separation, + mu=mu, normalize=normalize, normalize_max=normalize_max, normalize_min=normalize_min, @@ -123,4 +127,5 @@ def single_image_random_dot_stereograms( output_data_window=output_data_window) return result + ops.NotDifferentiable("SingleImageRandomDotStereograms") diff --git a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc index ca288c1f737..886f6798150 100644 --- a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc +++ b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc @@ -34,9 +34,8 @@ class ObtainNextOp : public OpKernel { // Allocate output. Tensor* output_tensor = nullptr; - OP_REQUIRES_OK( - ctx, - ctx->allocate_output("out_element", TensorShape({}), &output_tensor)); + OP_REQUIRES_OK(ctx, ctx->allocate_output("out_element", TensorShape({}), + &output_tensor)); // Obtain mutex for the "counter" tensor. mutex* mu; diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD new file mode 100644 index 00000000000..f7593aa462c --- /dev/null +++ b/tensorflow/contrib/kafka/BUILD @@ -0,0 +1,104 @@ +package( + default_visibility = ["//visibility:private"], +) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") +load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") +load("//tensorflow:tensorflow.bzl", "tf_kernel_library") +load("//tensorflow:tensorflow.bzl", "tf_py_test") + +tf_kernel_library( + name = "kafka_kernels", + srcs = ["kernels/kafka_dataset_ops.cc"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/kernels:bounds_check_lib", + "//tensorflow/core/kernels:dataset", + "//third_party/eigen3", + "@kafka//:kafka", + ], +) + +tf_gen_op_libs( + op_lib_names = ["kafka_ops"], + deps = [ + "//tensorflow/core:lib", + ], +) + +tf_gen_op_wrapper_py( + name = "gen_kafka_ops", + out = "python/ops/gen_kafka_ops.py", + require_shape_functions = True, + deps = [":kafka_ops_op_lib"], +) + +py_library( + name = "kafka", + srcs = [ + "__init__.py", + "python/ops/kafka_dataset_ops.py", + ], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + ":gen_kafka_ops", + "//tensorflow/contrib/util:util_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform", + "//tensorflow/python:state_ops", + "//tensorflow/python:training", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/data/ops:iterator_ops", + "//tensorflow/python/data/ops:readers", + ], +) + +# The Kafka server has to be setup before running the test. +# The Kafka server is setup through Docker so the Docker engine +# has to be installed. 
+#
+# Once the Docker engine is ready:
+# To set up the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh start kafka
+#
+# After the test is complete:
+# To tear down the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh stop kafka
+tf_py_test(
+    name = "kafka_test",
+    srcs = ["python/kernel_tests/kafka_test.py"],
+    additional_deps = [
+        ":kafka",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = [
+        "manual",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/kafka/__init__.py b/tensorflow/contrib/kafka/__init__.py
new file mode 100644
index 00000000000..4d755c40568
--- /dev/null
+++ b/tensorflow/contrib/kafka/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset.
+
+@@KafkaDataset
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops.kafka_dataset_ops import KafkaDataset
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "KafkaDataset",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
new file mode 100644
index 00000000000..88ef5f35711
--- /dev/null
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -0,0 +1,321 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/core/kernels/dataset.h" + +#include "tensorflow/core/framework/tensor.h" + +#include "src-cpp/rdkafkacpp.h" + +namespace tensorflow { + +class KafkaDatasetOp : public DatasetOpKernel { + public: + using DatasetOpKernel::DatasetOpKernel; + + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { + const Tensor* topics_tensor; + OP_REQUIRES_OK(ctx, ctx->input("topics", &topics_tensor)); + OP_REQUIRES( + ctx, topics_tensor->dims() <= 1, + errors::InvalidArgument("`topics` must be a scalar or a vector.")); + + std::vector topics; + topics.reserve(topics_tensor->NumElements()); + for (int i = 0; i < topics_tensor->NumElements(); ++i) { + topics.push_back(topics_tensor->flat()(i)); + } + + std::string servers = ""; + OP_REQUIRES_OK(ctx, + ParseScalarArgument(ctx, "servers", &servers)); + std::string group = ""; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "group", &group)); + bool eof = false; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "eof", &eof)); + int64 timeout = -1; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "timeout", &timeout)); + OP_REQUIRES(ctx, (timeout > 0), + errors::InvalidArgument( + "Timeout value should be large than 0, got ", timeout)); + *output = new Dataset(ctx, std::move(topics), servers, group, eof, timeout); + } + + private: + class Dataset : public GraphDatasetBase { + public: + Dataset(OpKernelContext* ctx, std::vector topics, + const string& servers, const string& group, const bool eof, + const int64 timeout) + : GraphDatasetBase(ctx), + topics_(std::move(topics)), + servers_(servers), + group_(group), + eof_(eof), + timeout_(timeout) {} + + std::unique_ptr MakeIterator( + const string& prefix) const override { + return std::unique_ptr( + new Iterator({this, strings::StrCat(prefix, "::Kafka")})); + } + + const DataTypeVector& output_dtypes() const override { + static DataTypeVector* dtypes = new DataTypeVector({DT_STRING}); + return *dtypes; + } + + const std::vector& output_shapes() const override { + static std::vector* shapes = + new std::vector({{}}); + return *shapes; + } + + string DebugString() override { return "KafkaDatasetOp::Dataset"; } + + protected: + Status AsGraphDefInternal(DatasetGraphDefBuilder* b, + Node** output) const override { + Node* topics = nullptr; + TF_RETURN_IF_ERROR(b->AddVector(topics_, &topics)); + Node* servers = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(servers_, &servers)); + Node* group = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(group_, &group)); + Node* eof = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(eof_, &eof)); + Node* timeout = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(timeout_, &timeout)); + TF_RETURN_IF_ERROR( + b->AddDataset(this, {topics, servers, group, eof, timeout}, output)); + return Status::OK(); + } + + private: + class Iterator : public DatasetIterator { + public: + explicit Iterator(const Params& params) + : DatasetIterator(params) {} + + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + mutex_lock l(mu_); + do { + // We are currently processing a topic, so try to read the next line. + if (consumer_.get()) { + while (true) { + if (limit_ >= 0 && + (topic_partition_->offset() >= limit_ || offset_ >= limit_)) { + // EOF current topic + break; + } + std::unique_ptr message( + consumer_->consume(dataset()->timeout_)); + if (message->err() == RdKafka::ERR_NO_ERROR) { + // Produce the line as output. 
+ Tensor line_tensor(cpu_allocator(), DT_STRING, {}); + line_tensor.scalar()() = + std::string(static_cast(message->payload()), + message->len()); + out_tensors->emplace_back(std::move(line_tensor)); + *end_of_sequence = false; + // Sync offset + offset_ = message->offset(); + return Status::OK(); + } + + if (message->err() == RdKafka::ERR__PARTITION_EOF && + dataset()->eof_) { + // EOF current topic + break; + } + if (message->err() != RdKafka::ERR__TIMED_OUT) { + return errors::Internal("Failed to consume:", + message->errstr()); + } + message.reset(nullptr); + consumer_->poll(0); + } + + // We have reached the end of the current topic, so maybe + // move on to next topic. + ResetStreamsLocked(); + ++current_topic_index_; + } + + // Iteration ends when there are no more topic to process. + if (current_topic_index_ == dataset()->topics_.size()) { + *end_of_sequence = true; + return Status::OK(); + } + + TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env())); + } while (true); + } + + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_topic_index"), + current_topic_index_)); + + // `consumer_` is empty if + // 1. GetNext has not been called even once. + // 2. All topics have been read and iterator has been exhausted. + if (consumer_.get()) { + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name("current_pos"), offset_)); + } + return Status::OK(); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + mutex_lock l(mu_); + ResetStreamsLocked(); + int64 current_topic_index; + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_topic_index"), + ¤t_topic_index)); + current_topic_index_ = size_t(current_topic_index); + // The key "current_pos" is written only if the iterator was saved + // with an open topic. + if (reader->Contains(full_name("current_pos"))) { + int64 current_pos; + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name("current_pos"), ¤t_pos)); + + TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env())); + topic_partition_->set_offset(current_pos); + if (topic_partition_->offset() != current_pos) { + return errors::Internal("Failed to restore to offset ", + current_pos); + } + offset_ = current_pos; + } + return Status::OK(); + } + + private: + // Sets up Kafka streams to read from the topic at + // `current_topic_index_`. + Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (current_topic_index_ >= dataset()->topics_.size()) { + return errors::InvalidArgument( + "current_topic_index_:", current_topic_index_, + " >= topics_.size():", dataset()->topics_.size()); + } + + // Actually move on to next topic. 
+        string entry = dataset()->topics_[current_topic_index_];
+
+        std::vector<string> parts = str_util::Split(entry, ":");
+        if (parts.size() < 1) {
+          return errors::InvalidArgument("Invalid parameters: ", entry);
+        }
+        string topic = parts[0];
+        int32 partition = 0;
+        if (parts.size() > 1) {
+          if (!strings::safe_strto32(parts[1], &partition)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+        int64 offset = 0;
+        if (parts.size() > 2) {
+          if (!strings::safe_strto64(parts[2], &offset)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        topic_partition_.reset(
+            RdKafka::TopicPartition::create(topic, partition, offset));
+
+        offset_ = topic_partition_->offset();
+        limit_ = -1;
+        if (parts.size() > 3) {
+          if (!strings::safe_strto64(parts[3], &limit_)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        std::unique_ptr<RdKafka::Conf> conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL));
+        std::unique_ptr<RdKafka::Conf> topic_conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_TOPIC));
+
+        std::string errstr;
+
+        RdKafka::Conf::ConfResult result =
+            conf->set("default_topic_conf", topic_conf.get(), errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set default_topic_conf:", errstr);
+        }
+
+        result = conf->set("bootstrap.servers", dataset()->servers_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set bootstrap.servers ",
+                                  dataset()->servers_, ":", errstr);
+        }
+        result = conf->set("group.id", dataset()->group_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set group.id ", dataset()->group_,
+                                  ":", errstr);
+        }
+
+        consumer_.reset(RdKafka::KafkaConsumer::create(conf.get(), errstr));
+        if (!consumer_.get()) {
+          return errors::Internal("Failed to create consumer:", errstr);
+        }
+
+        std::vector<RdKafka::TopicPartition*> partitions;
+        partitions.emplace_back(topic_partition_.get());
+        RdKafka::ErrorCode err = consumer_->assign(partitions);
+        if (err != RdKafka::ERR_NO_ERROR) {
+          return errors::Internal(
+              "Failed to assign partition [", topic_partition_->topic(), ", ",
+              topic_partition_->partition(), ", ", topic_partition_->offset(),
+              "]:", RdKafka::err2str(err));
+        }
+
+        return Status::OK();
+      }
+
+      // Resets all Kafka streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        consumer_->unassign();
+        consumer_->close();
+        consumer_.reset(nullptr);
+      }
+
+      mutex mu_;
+      size_t current_topic_index_ GUARDED_BY(mu_) = 0;
+      int64 offset_ GUARDED_BY(mu_) = 0;
+      int64 limit_ GUARDED_BY(mu_) = -1;
+      std::unique_ptr<RdKafka::TopicPartition> topic_partition_ GUARDED_BY(mu_);
+      std::unique_ptr<RdKafka::KafkaConsumer> consumer_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> topics_;
+    const std::string servers_;
+    const std::string group_;
+    const bool eof_;
+    const int64 timeout_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("KafkaDataset").Device(DEVICE_CPU),
+                        KafkaDatasetOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/ops/kafka_ops.cc b/tensorflow/contrib/kafka/ops/kafka_ops.cc
new file mode 100644
index 00000000000..8cdf16103ba
--- /dev/null
+++ b/tensorflow/contrib/kafka/ops/kafka_ops.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +REGISTER_OP("KafkaDataset") + .Input("topics: string") + .Input("servers: string") + .Input("group: string") + .Input("eof: bool") + .Input("timeout: int64") + .Output("handle: variant") + .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) + .Doc(R"doc( +Creates a dataset that emits the messages of one or more Kafka topics. + +topics: A `tf.string` tensor containing one or more subscriptions, + in the format of [topic:partition:offset:length], + by default length is -1 for unlimited. +servers: A list of bootstrap servers. +group: The consumer group id. +eof: If True, the kafka reader will stop on EOF. +timeout: The timeout value for the Kafka Consumer to wait + (in millisecond). +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py new file mode 100644 index 00000000000..94cf6b5ace6 --- /dev/null +++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py @@ -0,0 +1,117 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# ============================================================================== +"""Tests for KafkaDataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os + +from tensorflow.contrib.kafka.python.ops import kafka_dataset_ops +from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.lib.io import python_io +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import io_ops +from tensorflow.python.platform import test +from tensorflow.python.util import compat + +class KafkaDatasetTest(test.TestCase): + + def setUp(self): + # The Kafka server has to be setup before the test + # and tear down after the test manually. + # The docker engine has to be installed. 
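+    #
+    # Once the broker is up, a minimal use of the dataset looks roughly like
+    # the sketch below (the broker address and topic are the ones created by
+    # kafka_test.sh; `tf` stands for the public TensorFlow API and is only
+    # illustrative):
+    #
+    #   dataset = kafka_dataset_ops.KafkaDataset(
+    #       topics=["test:0:0:-1"], servers="localhost:9092",
+    #       group="test", eof=True)
+    #   next_element = dataset.make_one_shot_iterator().get_next()
+    #   with tf.Session() as sess:
+    #     print(sess.run(next_element))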
+    #
+    # To set up the Kafka server:
+    # $ bash kafka_test.sh start kafka
+    #
+    # To tear down the Kafka server:
+    # $ bash kafka_test.sh stop kafka
+    pass
+
+  def testKafkaDataset(self):
+    topics = array_ops.placeholder(dtypes.string, shape=[None])
+    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = kafka_dataset_ops.KafkaDataset(
+        topics, group="test", eof=True).repeat(num_epochs)
+    batch_dataset = repeat_dataset.batch(batch_size)
+
+    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    init_op = iterator.make_initializer(repeat_dataset)
+    init_batch_op = iterator.make_initializer(batch_dataset)
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Basic test: read from topic 0.
+      sess.run(
+          init_op, feed_dict={topics: ["test:0:0:4"],
+                              num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D"+str(i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from topic 1.
+      sess.run(
+          init_op, feed_dict={topics: ["test:0:5:-1"],
+                              num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D"+str(i + 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from both topics.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                                   num_epochs: 1})
+      for j in range(2):
+        for i in range(5):
+          self.assertEqual("D"+str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test repeated iteration through both files.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                                   num_epochs: 10})
+      for _ in range(10):
+        for j in range(2):
+          for i in range(5):
+            self.assertEqual("D"+str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test batched and repeated iteration through both files.
+      sess.run(
+          init_batch_op,
+          feed_dict={topics: ["test:0:0:4", "test:0:5:-1"],
+                     num_epochs: 10,
+                     batch_size: 5})
+      for _ in range(10):
+        self.assertAllEqual(["D"+str(i) for i in range(5)],
+                            sess.run(get_next))
+        self.assertAllEqual(["D"+str(i + 5) for i in range(5)],
+                            sess.run(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
new file mode 100644
index 00000000000..7997c127311
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+set -e
+set -o pipefail
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 start|stop <container_name>" >&2
+  exit 1
+fi
+
+container=$2
+if [ "$1" == "start" ]; then
+  docker run -d --rm --net=host --name=$container spotify/kafka
+  echo Wait 5 secs until kafka is up and running
+  sleep 5
+  echo Create test topic
+  docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test'
+  echo Create test message
+  docker exec $container bash -c 'echo -e "D0\nD1\nD2\nD3\nD4\nD5\nD6\nD7\nD8\nD9" > /test'
+  echo Produce test message
+  docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-console-producer.sh --topic test --broker-list 127.0.0.1:9092 < /test'
+
+  echo Container $container started successfully
+elif [ "$1" == "stop" ]; then
+  docker rm -f $container
+
+  echo Container $container stopped successfully
+else
+  echo "Usage: $0 start|stop <container_name>" >&2
+  exit 1
+fi
+
+
+
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
new file mode 100644
index 00000000000..e561f595a40
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -0,0 +1,73 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops import gen_kafka_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.data.ops.readers import Dataset
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import resource_loader
+
+class KafkaDataset(Dataset):
+  """A Kafka Dataset that consumes messages.
+  """
+
+  def __init__(
+      self, topics, servers="localhost", group="", eof=False, timeout=1000):
+    """Create a KafkaDataset.
+
+    Args:
+      topics: A `tf.string` tensor containing one or more subscriptions,
+        in the format of [topic:partition:offset:length],
+        by default length is -1 for unlimited.
+ servers: A list of bootstrap servers. + group: The consumer group id. + eof: If True, the kafka reader will stop on EOF. + timeout: The timeout value for the Kafka Consumer to wait + (in millisecond). + """ + super(KafkaDataset, self).__init__() + self._topics = ops.convert_to_tensor( + topics, dtype=dtypes.string, name="topics") + self._servers = ops.convert_to_tensor( + servers, dtype=dtypes.string, name="servers") + self._group = ops.convert_to_tensor( + group, dtype=dtypes.string, name="group") + self._eof = ops.convert_to_tensor( + eof, dtype=dtypes.bool, name="eof") + self._timeout = ops.convert_to_tensor( + timeout, dtype=dtypes.int64, name="timeout") + + def _as_variant_tensor(self): + return gen_kafka_ops.kafka_dataset( + self._topics, self._servers, self._group, self._eof, self._timeout) + + @property + def output_classes(self): + return ops.Tensor + + @property + def output_shapes(self): + return tensor_shape.scalar() + + @property + def output_types(self): + return dtypes.string diff --git a/tensorflow/contrib/kernel_methods/python/losses_test.py b/tensorflow/contrib/kernel_methods/python/losses_test.py index d38d8041ce1..72507539f81 100644 --- a/tensorflow/contrib/kernel_methods/python/losses_test.py +++ b/tensorflow/contrib/kernel_methods/python/losses_test.py @@ -119,19 +119,20 @@ class SparseMulticlassHingeLossTest(test.TestCase): def testUnknownShape(self): """Result keeps same with `testZeroLossInt32Labels`""" - logits_np = np.array([[1.2, -1.4, -1.0], - [1.4, 1.8, 4.0], - [0.5, 1.8, -1.0]]) + logits_np = np.array([[1.2, -1.4, -1.0], [1.4, 1.8, 4.0], [0.5, 1.8, -1.0]]) labels_np = np.array([0, 2, 1], dtype=np.int32) - logits_shapes = [[3, 3], # batch_size, num_classes - [None, 3], - [3, None], - [None, None]] + logits_shapes = [ + [3, 3], # batch_size, num_classes + [None, 3], + [3, None], + [None, None] + ] for batch_size, num_classes in logits_shapes: with self.test_session(): - logits = array_ops.placeholder(dtypes.float32, shape=(batch_size, num_classes)) + logits = array_ops.placeholder( + dtypes.float32, shape=(batch_size, num_classes)) labels = array_ops.placeholder(dtypes.int32, shape=(batch_size,)) loss = losses.sparse_multiclass_hinge_loss(labels, logits) result = loss.eval(feed_dict={logits: logits_np, labels: labels_np}) diff --git a/tensorflow/contrib/kfac/examples/mlp.py b/tensorflow/contrib/kfac/examples/mlp.py index 0f0dbb53f45..87eed03888c 100644 --- a/tensorflow/contrib/kfac/examples/mlp.py +++ b/tensorflow/contrib/kfac/examples/mlp.py @@ -317,7 +317,10 @@ def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False): return tf.estimator.EstimatorSpec( mode=mode, loss=loss, train_op=train_op, training_hooks=hooks) + run_config = tf.estimator.RunConfig( + model_dir="/tmp/mnist", save_checkpoints_steps=1, keep_checkpoint_max=100) + # Train until input_fn() is empty with Estimator. This is a prerequisite for # TPU compatibility. 
- estimator = tf.estimator.Estimator(model_fn=model_fn) + estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) estimator.train(input_fn=input_fn) diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py index 9436caf9618..0d2fa706f58 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py @@ -457,7 +457,9 @@ class ConvDiagonalFB(FisherBlock): self._num_locations = ( inputs_shape[1] * inputs_shape[2] // (self._strides[1] * self._strides[2])) - self._damping = normalize_damping(damping, self._num_locations) + + self._damping = (self._num_locations + * normalize_damping(damping, self._num_locations)) self._factor = self._layer_collection.make_or_get_factor( fisher_factors.ConvDiagonalFactor, diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py index f59168cbc05..bcba18ae147 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py @@ -31,6 +31,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn from tensorflow.python.ops import special_math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables @@ -111,6 +112,54 @@ def diagonal_covariance_initializer(shape, dtype, partition_info): # pylint: di return array_ops.ones(shape, dtype) +def extract_image_patches(image, ksizes, strides, padding, name=None): + """Extracts image patches for an N-dimensional convolution. + + This function is a compatibility wrapper over tf.extract_image_patches(), as + ExtractImagePatches isn't yet implemented in XLA. + + Args: + image: Tensor of shape [batch, in_x, in_y, ..., in_channels]. Input images. + All dimensions except 'batch' must be defined. + ksizes: [filter_x, filter_y, ...]. Spatial shape of filter in each + dimension. + strides: [stride_x, stride_y, ...]. Spatial stride for filter in each + dimension. + padding: str. "VALID" or "SAME". + name: str or None. name of Op. + + Returns: + result: [batch, out_x, out_y, ..., filter_x, filter_y, ..., in_channels]. + Contains image patches to which conv kernel would be applied for each + output location. [out_x, out_y, ...] depends on padding. + """ + if not utils.on_tpu(): + return array_ops.extract_image_patches( + image, + ksizes=([1] + list(ksizes) + [1]), + strides=([1] + list(strides) + [1]), + rates=[1, 1, 1, 1], + padding=padding, + name=name) + + with tf_ops.name_scope(name, "extract_image_patches", + [image, ksizes, strides, padding]): + batch = image.shape.as_list()[0] + in_channels = image.shape.as_list()[-1] + + # Map each input feature to a location in the output. + out_channels = np.prod(ksizes) * in_channels + filters = linalg_ops.eye(out_channels), + filters = array_ops.reshape(filters, ksizes + [in_channels, out_channels]) + + result = nn.convolution(image, filters, padding, strides=strides) + out_spatial = result.shape.as_list()[1:-1] + result = array_ops.reshape( + result, [batch or -1] + out_spatial + ksizes + [in_channels]) + + return result + + def compute_cov(tensor, tensor_right=None, normalizer=None): """Compute the empirical second moment of the rows of a 2D Tensor. 
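For context, the TPU branch of the extract_image_patches() wrapper above builds patches by convolving the input with a reshaped identity matrix, so that each output channel copies exactly one (row offset, column offset, input channel) tap of the patch. A minimal sketch of that equivalence against tf.extract_image_patches, assuming the public TF 1.x API and a toy NHWC input (all sizes and names below are illustrative, not taken from the patch):

# Check: identity-filter convolution reproduces tf.extract_image_patches.
import numpy as np
import tensorflow as tf

ksizes, strides, in_channels = [2, 2], [1, 1], 3
out_channels = int(np.prod(ksizes)) * in_channels  # 2 * 2 * 3 = 12

image = tf.constant(np.random.rand(2, 5, 5, in_channels), dtype=tf.float32)

# Each output channel of the identity filter picks out one patch element.
filters = tf.reshape(tf.eye(out_channels),
                     ksizes + [in_channels, out_channels])
via_conv = tf.nn.convolution(image, filters, "VALID", strides=strides)

reference = tf.extract_image_patches(
    image,
    ksizes=[1] + ksizes + [1],
    strides=[1] + strides + [1],
    rates=[1, 1, 1, 1],
    padding="VALID")

with tf.Session() as sess:
  a, b = sess.run([via_conv, reference])
  print(np.allclose(a, b))  # Expected: True; both have shape [2, 4, 4, 12].

Both paths flatten the patch with the input channel varying fastest, which is why the reshaped identity matrix lines up with the reference op's channel ordering.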
@@ -668,11 +717,10 @@ class ConvDiagonalFactor(DiagonalFactor): # TODO(b/64144716): there is potential here for a big savings in terms # of memory use. - patches = array_ops.extract_image_patches( + patches = extract_image_patches( self._inputs, - ksizes=[1, filter_height, filter_width, 1], - strides=self._strides, - rates=[1, 1, 1, 1], + ksizes=[filter_height, filter_width], + strides=self._strides[1:-1], padding=self._padding) if self._has_bias: @@ -816,11 +864,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): # TODO(b/64144716): there is potential here for a big savings in terms of # memory use. - patches = array_ops.extract_image_patches( + patches = extract_image_patches( self._inputs, - ksizes=[1, filter_height, filter_width, 1], - strides=self._strides, - rates=[1, 1, 1, 1], + ksizes=[filter_height, filter_width], + strides=self._strides[1:-1], padding=self._padding) flatten_size = (filter_height * filter_width * in_channels) diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py index cc48e3c69f2..fe8e39c212c 100644 --- a/tensorflow/contrib/kfac/python/ops/utils_lib.py +++ b/tensorflow/contrib/kfac/python/ops/utils_lib.py @@ -24,6 +24,7 @@ from tensorflow.python.util.all_util import remove_undocumented # pylint: enable=unused-import,line-too-long,wildcard-import _allowed_symbols = [ + "set_global_constants", "SequenceDict", "tensors_to_column", "column_to_tensors", diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc index 932c5ab9924..01893d60615 100644 --- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc +++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc @@ -423,8 +423,9 @@ class SparseFeatureCrossOp : public OpKernel { "Input values should be a std::vector but received shape ", values_list_in[i].shape().DebugString(), " at position ", i)); OP_REQUIRES( - context, indices_list_in[i].shape().dim_size(0) == - values_list_in[i].shape().dim_size(0), + context, + indices_list_in[i].shape().dim_size(0) == + values_list_in[i].shape().dim_size(0), errors::InvalidArgument( "Expected size of values to be ", indices_list_in[i].shape().dim_size(0), " got ", diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index c8e3307ee8b..fb7b2e315ef 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -60,12 +60,12 @@ __all__ = [ 'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution', 'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose', 'convolution3d', 'convolution3d_transpose', 'dense_to_sparse', - 'dropout', 'elu', 'flatten', - 'fully_connected', 'GDN', 'gdn', 'layer_norm', 'linear', 'pool', - 'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat', - 'scale_gradient', 'separable_conv2d', 'separable_convolution2d', 'softmax', - 'spatial_softmax', 'stack', 'unit_norm', 'legacy_fully_connected', - 'legacy_linear', 'legacy_relu', 'maxout' + 'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'layer_norm', + 'linear', 'pool', 'max_pool2d', 'max_pool3d', 'one_hot_encoding', 'relu', + 'relu6', 'repeat', 'scale_gradient', 'separable_conv2d', + 'separable_convolution2d', 'softmax', 'spatial_softmax', 'stack', + 'unit_norm', 'legacy_fully_connected', 'legacy_linear', 'legacy_relu', + 'maxout' ] DATA_FORMAT_NCHW = 'NCHW' @@ 
-1418,7 +1418,9 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None): with variable_scope.variable_scope( scope, 'dense_to_sparse', [tensor]) as sc: tensor = ops.convert_to_tensor(tensor) - indices = array_ops.where(math_ops.not_equal(tensor, constant_op.constant(eos_token, tensor.dtype))) + indices = array_ops.where( + math_ops.not_equal( + tensor, constant_op.constant(eos_token, tensor.dtype))) values = array_ops.gather_nd(tensor, indices) shape = array_ops.shape(tensor, out_type=dtypes.int64) outputs = sparse_tensor.SparseTensor(indices, values, shape) diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index c5790c76221..8945690db8e 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -127,8 +127,8 @@ class AvgPool3DTest(test.TestCase): def testInvalidDataFormat(self): depth, height, width = 3, 6, 9 images = np.random.uniform(size=(5, depth, height, width, 3)) - with self.assertRaisesRegexp(ValueError, - 'data_format has to be either NCDHW or NDHWC.'): + with self.assertRaisesRegexp( + ValueError, 'data_format has to be either NCDHW or NDHWC.'): _layers.avg_pool3d(images, [3, 3, 3], data_format='CDHWN') def testCreateAvgPool(self): @@ -148,7 +148,8 @@ class AvgPool3DTest(test.TestCase): def testCollectOutputs(self): depth, height, width = 3, 6, 9 images = random_ops.random_uniform((5, depth, height, width, 3), seed=1) - output = _layers.avg_pool3d(images, [3, 3, 3], outputs_collections='outputs') + output = _layers.avg_pool3d( + images, [3, 3, 3], outputs_collections='outputs') output_collected = ops.get_collection('outputs')[0] self.assertEqual(output_collected.aliases, ['AvgPool3D']) self.assertEqual(output_collected, output) @@ -183,7 +184,8 @@ class AvgPool3DTest(test.TestCase): depth, height, width = 3, 6, 9 images = random_ops.random_uniform((5, depth, height, width, 3), seed=1) output = _layers.avg_pool3d(images, [3, 3, 3], stride=1, padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3]) + self.assertListEqual(output.get_shape().as_list(), + [5, depth, height, width, 3]) def testGlobalAvgPool(self): depth, height, width = 3, 6, 9 @@ -515,7 +517,9 @@ class ConvolutionTest(test.TestCase): with arg_scope( [layers_lib.convolution2d], normalizer_fn=_layers.batch_norm, - normalizer_params={'decay': 0.9}): + normalizer_params={ + 'decay': 0.9 + }): net = layers_lib.convolution2d(images, 32, [3, 3]) net = layers_lib.convolution2d(net, 32, [3, 3]) self.assertEqual(len(variables.get_variables()), 8) @@ -529,7 +533,9 @@ class ConvolutionTest(test.TestCase): with arg_scope( [layers_lib.convolution2d], normalizer_fn=_layers.batch_norm, - normalizer_params={'decay': 0.9}): + normalizer_params={ + 'decay': 0.9 + }): net = layers_lib.convolution2d(images, 32, [3, 3], scope='Conv') net = layers_lib.convolution2d( net, 32, [3, 3], scope='Conv', reuse=True) @@ -1031,7 +1037,8 @@ class Convolution2dTransposeTests(test.TestCase): for _ in range(10): num_filters = 1 input_size = [ - 1, np.random.randint(1, max_image_size), + 1, + np.random.randint(1, max_image_size), np.random.randint(1, max_image_size), 1 ] filter_size = [ @@ -1185,8 +1192,10 @@ class ConvolutionInPlaneTest(test.TestCase): with self.test_session() as sess: sess.run(init_op) - result = sess.run(horz_gradients, - feed_dict={image: np.ones((1, 10, 10, 1))}) + result = sess.run( + horz_gradients, feed_dict={ + 
image: np.ones((1, 10, 10, 1)) + }) expected = np.zeros((1, 10, 9, 1)) self.assertAllEqual(result, expected) @@ -1299,7 +1308,8 @@ class DenseToSparseTest(test.TestCase): expected_constant = np.reshape(np.arange(24, dtype=np.int64), (3, 4, 2)) tensor = constant_op.constant(expected_constant) sparse = _layers.dense_to_sparse(tensor) - dense = sparse_ops.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values) + dense = sparse_ops.sparse_to_dense( + sparse.indices, sparse.dense_shape, sparse.values) with self.test_session() as sess: constant = sess.run(dense) self.assertAllEqual(expected_constant, constant) @@ -1418,8 +1428,7 @@ class FlattenTest(test.TestCase): with ops.Graph().as_default() as g, self.test_session(g): inputs = array_ops.placeholder(dtype=dtypes.float32) inputs.set_shape(tensor_shape.TensorShape((5,))) - with self.assertRaisesRegexp(ValueError, - 'incompatible with the layer'): + with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'): _layers.flatten(inputs) def testUnknownLastDim(self): @@ -1729,7 +1738,9 @@ class FCTest(test.TestCase): with arg_scope( [_layers.fully_connected], normalizer_fn=_layers.batch_norm, - normalizer_params={'decay': 0.9}): + normalizer_params={ + 'decay': 0.9 + }): net = _layers.fully_connected(images, 27) net = _layers.fully_connected(net, 27) self.assertEqual(len(variables.get_variables()), 8) @@ -1745,7 +1756,9 @@ class FCTest(test.TestCase): with arg_scope( [_layers.fully_connected], normalizer_fn=_layers.batch_norm, - normalizer_params={'decay': 0.9}): + normalizer_params={ + 'decay': 0.9 + }): net = _layers.fully_connected(images, 27, scope='fc1') net = _layers.fully_connected(net, 27, scope='fc1', reuse=True) self.assertEqual(len(variables.get_variables()), 4) @@ -1762,8 +1775,8 @@ class BatchNormTest(test.TestCase): def testBatchNormCenterFalse(self): a = array_ops.placeholder(dtype=dtypes.float32, shape=(10, 10, 10, 10)) # Test that center=False builds a valid graph. 
- _layers.batch_norm(a, center=False, data_format='NCHW', - zero_debias_moving_mean=True) + _layers.batch_norm( + a, center=False, data_format='NCHW', zero_debias_moving_mean=True) def testUnknownShape(self): with ops.Graph().as_default() as g, self.test_session(g): @@ -1800,8 +1813,8 @@ class BatchNormTest(test.TestCase): images = np.random.uniform(size=(5, height, width, 3)).astype( dtype.as_numpy_dtype) output = _layers.batch_norm(images, fused=fused) - expected_name = ('BatchNorm/FusedBatchNorm' if fused else - 'BatchNorm/batchnorm') + expected_name = ('BatchNorm/FusedBatchNorm' + if fused else 'BatchNorm/batchnorm') self.assertTrue(output.op.name.startswith(expected_name)) self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) self.assertEqual( @@ -2020,8 +2033,8 @@ class BatchNormTest(test.TestCase): expected_var = np.var(image_values, axis=axis) if fused: # Add Bessel's correction - expected_var, _ = self._addBesselsCorrection(batch_size * height * - width, expected_var) + expected_var, _ = self._addBesselsCorrection( + batch_size * height * width, expected_var) images = constant_op.constant( image_values, shape=image_shape, dtype=dtypes.float32) output = _layers.batch_norm( @@ -2540,8 +2553,8 @@ class BatchNormTest(test.TestCase): expected_var = np.var(image_values, axis=axis) if fused: # Add Bessel's correction - expected_var, _ = self._addBesselsCorrection(batch_size * height * - width, expected_var) + expected_var, _ = self._addBesselsCorrection( + batch_size * height * width, expected_var) images = constant_op.constant( image_values, shape=image_shape, dtype=dtypes.float32) output = _layers.batch_norm( @@ -2571,8 +2584,9 @@ class BatchNormTest(test.TestCase): np_output, new_images_gradients = sess.run([output, images_gradients]) # The outputs should be close to 0.0 mean and 1.0 variance self.assertAllClose( - np.mean( - np_output, axis=axis), [0] * channels, rtol=0.001, atol=0.001) + np.mean(np_output, axis=axis), [0] * channels, + rtol=0.001, + atol=0.001) self.assertAllClose( np.var(np_output, axis=axis), [1] * channels, rtol=0.01, atol=0.01) # The gradients should change slowly while updating moving_mean. 
@@ -2600,14 +2614,14 @@ class BatchNormTest(test.TestCase): channels = 3 with self.test_session() as sess: images = (np.ones((5, height, width, channels)) * 9.0).astype('f') - beta = init_ops.constant_initializer((np.ones(channels) * 5.0).astype( - 'f')) - gamma = init_ops.constant_initializer((np.ones(channels) * 2.0).astype( - 'f')) - mean = init_ops.constant_initializer((np.ones(channels) * 5.0).astype( - 'f')) - variance = init_ops.constant_initializer((np.ones(channels) * 4.0).astype( - 'f')) + beta = init_ops.constant_initializer( + (np.ones(channels) * 5.0).astype('f')) + gamma = init_ops.constant_initializer( + (np.ones(channels) * 2.0).astype('f')) + mean = init_ops.constant_initializer( + (np.ones(channels) * 5.0).astype('f')) + variance = init_ops.constant_initializer( + (np.ones(channels) * 4.0).astype('f')) output = _layers.batch_norm( images, is_training=False, @@ -2628,21 +2642,18 @@ class BatchNormTest(test.TestCase): with self.test_session(use_gpu=True) as sess: images = np.arange(np.product(shape), dtype=np.float32).reshape(shape) beta = init_ops.constant_initializer( - np.arange( - 2, channels + 2, dtype=np.float32)) + np.arange(2, channels + 2, dtype=np.float32)) gamma = init_ops.constant_initializer( - np.arange( - 10, channels + 10, dtype=np.float32) * 2.0) + np.arange(10, channels + 10, dtype=np.float32) * 2.0) mean = init_ops.constant_initializer( - np.arange( - 3, channels + 3, dtype=np.float32) * 5.0) + np.arange(3, channels + 3, dtype=np.float32) * 5.0) variance = init_ops.constant_initializer( - np.arange( - 1, channels + 1, dtype=np.float32) * 4.0) + np.arange(1, channels + 1, dtype=np.float32) * 4.0) if data_format == 'NCHW': # Reshape inputs from NHWC to NCHW format. images = array_ops.transpose( - images, [0, len(shape) - 1] + list(range(1, len(shape) - 1))) + images, [0, len(shape) - 1] + list(range(1, + len(shape) - 1))) output = _layers.batch_norm( images, is_training=is_training, @@ -2745,16 +2756,16 @@ class BatchNormTest(test.TestCase): # Tests that the adjustment is appropriately passed to and used by the core # BN layer. all_adjustments = [] + def _create_adjustment(shape): adjustments = [array_ops.ones(shape[-1:]), array_ops.zeros(shape[-1:])] all_adjustments.extend(adjustments) return adjustments + depth = 8 images = array_ops.zeros([10, 5, 5, depth]) output = _layers.batch_norm( - images, - is_training=True, - adjustment=_create_adjustment) + images, is_training=True, adjustment=_create_adjustment) self.assertListEqual(output.shape.as_list(), images.shape.as_list()) self.assertEqual(len(all_adjustments), 2) self.assertListEqual(all_adjustments[0].shape.as_list(), [depth]) @@ -2819,7 +2830,10 @@ class LayerNormTest(test.TestCase): # output_train and output_eval should be the same. 
self.assertAllClose(sess.run([output_train]), sess.run([output_eval])) - def doOutputTest(self, input_shape, tol=1e-5, begin_norm_axis=1, + def doOutputTest(self, + input_shape, + tol=1e-5, + begin_norm_axis=1, dtype=dtypes.float64): expected_mean = np.zeros(input_shape[:begin_norm_axis]) expected_var = np.ones(input_shape[:begin_norm_axis]) @@ -2850,13 +2864,10 @@ class LayerNormTest(test.TestCase): # Layer-norm implemented in numpy eps = 1e-12 expected_out = ( - (gamma * ( - input_values - - np.mean(input_values, axis=moments_axis, keepdims=True)) - / np.sqrt( - eps - + np.var(input_values, axis=moments_axis, keepdims=True))) - + beta) + (gamma * (input_values - np.mean( + input_values, axis=moments_axis, keepdims=True)) / + np.sqrt(eps + np.var( + input_values, axis=moments_axis, keepdims=True))) + beta) self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol) self.assertAllClose(expected_var, var, atol=tol) # The full computation gets a bigger tolerance @@ -2874,10 +2885,10 @@ class LayerNormTest(test.TestCase): def testOutput4DInputNormOnInnermostAxis(self): # Equivalent tests - self.doOutputTest((100, 10, 10, 3), begin_norm_axis=3, tol=1e-4, - dtype=dtypes.float64) - self.doOutputTest((100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4, - dtype=dtypes.float64) + self.doOutputTest( + (100, 10, 10, 3), begin_norm_axis=3, tol=1e-4, dtype=dtypes.float64) + self.doOutputTest( + (100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4, dtype=dtypes.float64) def testOutputSmallInput(self): self.doOutputTest((10, 10, 10, 30)) @@ -2914,7 +2925,7 @@ class GDNTest(test.TestCase): x = np.random.uniform(size=(1, 2, 3, 4)[:ndim]) y = self._runGDN(x, x.shape, False, 'channels_last') self.assertEqual(x.shape, y.shape) - self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6) def testChannelsFirst(self): # `bias_add` doesn't support NCHW on CPU. 
@@ -2923,8 +2934,7 @@ class GDNTest(test.TestCase): x = np.random.uniform(size=(4, 3, 2, 1)[:ndim]) y = self._runGDN(x, x.shape, False, 'channels_first') self.assertEqual(x.shape, y.shape) - self.assertAllClose( - y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6) def testWrongDims(self): for ndim in [1, 2, 6]: @@ -2936,7 +2946,7 @@ class GDNTest(test.TestCase): x = np.random.uniform(size=(1, 2, 3, 4)) y = self._runGDN(x, x.shape, True, 'channels_last') self.assertEqual(x.shape, y.shape) - self.assertAllClose(y, x * np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + self.assertAllClose(y, x * np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6) class MaxPool2DTest(test.TestCase): @@ -3013,20 +3023,22 @@ class MaxPool3DTest(test.TestCase): def testInvalidDataFormat(self): depth, height, width = 3, 6, 9 images = np.random.uniform(size=(5, depth, height, width, 3)) - with self.assertRaisesRegexp(ValueError, - 'data_format has to be either NCDHW or NDHWC.'): + with self.assertRaisesRegexp( + ValueError, 'data_format has to be either NCDHW or NDHWC.'): _layers.max_pool3d(images, [3, 3, 3], data_format='CDHWN') def testCreateMaxPool(self): depth, height, width = 3, 6, 9 - images = np.random.uniform(size=(5, depth, height, width, 3)).astype(np.float32) + images = np.random.uniform(size=(5, depth, height, width, 3)).astype( + np.float32) output = _layers.max_pool3d(images, [3, 3, 3]) self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D') self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3]) def testCreateMaxPoolNCDHW(self): depth, height, width = 3, 6, 9 - images = np.random.uniform(size=(5, 3, depth, height, width)).astype(np.float32) + images = np.random.uniform(size=(5, 3, depth, height, width)).astype( + np.float32) output = _layers.max_pool3d(images, [3, 3, 3], data_format='NCDHW') self.assertEquals(output.op.name, 'MaxPool3D/transpose_1') self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2, 4]) @@ -3034,7 +3046,8 @@ class MaxPool3DTest(test.TestCase): def testCollectOutputs(self): depth, height, width = 3, 6, 9 images = random_ops.random_uniform((5, depth, height, width, 3), seed=1) - output = _layers.max_pool3d(images, [3, 3, 3], outputs_collections='outputs') + output = _layers.max_pool3d( + images, [3, 3, 3], outputs_collections='outputs') output_collected = ops.get_collection('outputs')[0] self.assertEqual(output_collected.aliases, ['MaxPool3D']) self.assertEqual(output_collected, output) @@ -3069,7 +3082,8 @@ class MaxPool3DTest(test.TestCase): depth, height, width = 3, 6, 9 images = random_ops.random_uniform((5, depth, height, width, 3), seed=1) output = _layers.max_pool3d(images, [3, 3, 3], stride=1, padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3]) + self.assertListEqual(output.get_shape().as_list(), + [5, depth, height, width, 3]) def testGlobalMaxPool(self): depth, height, width = 3, 6, 9 @@ -3481,8 +3495,7 @@ class SpatialSoftmaxTests(test.TestCase): sess.run(variables_lib.global_variables_initializer()) feed_dict = {features: np_features} keypoints = sess.run(spatial_softmax, feed_dict) - self.assertAllEqual(keypoints.shape, - (batch_shape[0], batch_shape[3] * 2)) + self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[3] * 2)) def testSpatialSoftmaxShapeNCHW(self): batch_shape = (2, 2, 35, 35) @@ -3493,8 +3506,7 @@ class SpatialSoftmaxTests(test.TestCase): sess.run(variables_lib.global_variables_initializer()) feed_dict = 
{features: np_features} keypoints = sess.run(spatial_softmax, feed_dict) - self.assertAllEqual(keypoints.shape, - (batch_shape[0], batch_shape[1] * 2)) + self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[1] * 2)) def testTwoMaxActivationsSameChannel(self): batch_size, height, width, nchannels = (2, 35, 35, 1) @@ -3513,8 +3525,8 @@ class SpatialSoftmaxTests(test.TestCase): x_loc = [avg_x] y_loc = [avg_y] - np_keypoints = self._SpatialSoftmax( - x_loc, y_loc, height, width, batch_size, nchannels) + np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size, + nchannels) # Make sure expected location keypoints matches actual location keypoints. with self.test_session() as sess: @@ -3532,13 +3544,13 @@ class SpatialSoftmaxTests(test.TestCase): spatial_softmax = _layers.spatial_softmax(features) np_features = np.zeros(batch_shape, dtype=np.float32) - edges = [(0, 0), (0, width-1), (height-1, 0), (height-1, width-1)] + edges = [(0, 0), (0, width - 1), (height - 1, 0), (height - 1, width - 1)] x_loc, y_loc = zip(*edges) for c in range(nchannels): np_features[:, x_loc[c], y_loc[c], c] = 100. - np_keypoints = self._SpatialSoftmax( - x_loc, y_loc, height, width, batch_size, nchannels) + np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size, + nchannels) # Make sure expected location keypoints matches actual location keypoints. with self.test_session() as sess: @@ -3567,10 +3579,10 @@ class SpatialSoftmaxTests(test.TestCase): np_features1[:, x_loc[c], y_loc[c], c] = 100. np_features2[:, x_loc[c], y_loc[c], c] = 100. - np_keypoints1 = self._SpatialSoftmax( - x_loc, y_loc, height1, width1, batch_size, nchannels) - np_keypoints2 = self._SpatialSoftmax( - x_loc, y_loc, height2, width2, batch_size, nchannels) + np_keypoints1 = self._SpatialSoftmax(x_loc, y_loc, height1, width1, + batch_size, nchannels) + np_keypoints2 = self._SpatialSoftmax(x_loc, y_loc, height2, width2, + batch_size, nchannels) # Make sure expected location keypoints matches actual location keypoints. with self.test_session() as sess: @@ -3596,8 +3608,8 @@ class SpatialSoftmaxTests(test.TestCase): for c in range(nchannels): np_features[:, x_loc[c], y_loc[c], c] = 100. - np_keypoints = self._SpatialSoftmax( - x_loc, y_loc, height, width, batch_size, nchannels) + np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size, + nchannels) # Make sure expected location keypoints matches actual location keypoints. with self.test_session() as sess: @@ -3619,8 +3631,8 @@ class SpatialSoftmaxTests(test.TestCase): for c in range(nchannels): np_features[:, c, x_loc[c], y_loc[c]] = 100. - np_keypoints = self._SpatialSoftmax( - x_loc, y_loc, height, width, batch_size, nchannels) + np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size, + nchannels) # Make sure expected location keypoints matches actual location keypoints. 
with self.test_session() as sess: @@ -3715,8 +3727,7 @@ class UnitNormTests(test.TestCase): image = random_ops.random_uniform((height, width, 3)) output = _layers.unit_norm(image, dim=dim, epsilon=1e-6) norms = math_ops.sqrt( - math_ops.reduce_sum( - math_ops.square(output), reduction_indices=dim)) + math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim)) shape = [height, width, 3] del shape[dim] @@ -3752,8 +3763,7 @@ class UnitNormTests(test.TestCase): image = array_ops.placeholder(dtypes.float32, (None, None, 3)) output = _layers.unit_norm(image, dim=dim, epsilon=1e-6) norms = math_ops.sqrt( - math_ops.reduce_sum( - math_ops.square(output), reduction_indices=dim)) + math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim)) with self.test_session(): actual = norms.eval({image: placeholder_value}) @@ -3817,8 +3827,8 @@ class PoincareNormalizeTest(test.TestCase): with self.test_session(): x_tf = constant_op.constant(x_np, name='x') y_tf = _layers.poincare_normalize(x_tf, dim) - err = gradient_checker.compute_gradient_error(x_tf, x_shape, - y_tf, x_shape) + err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf, + x_shape) print('PoinCareNormalize gradient err = %g ' % err) self.assertLess(err, 1e-4) @@ -3830,14 +3840,9 @@ class LegacyFullyConnectedTest(test.TestCase): test.TestCase.setUp(self) random_seed.set_random_seed(1234) self.input = constant_op.constant([[1., 2., 3.], [-4., 15., -6.]]) - self.input_3_dim_arr = [[[1., 1.1, 1.2], - [2., 2.1, 2.2], - [3., 3.1, 3.2], - [4., 4.1, 4.2]], - [[5., 5.1, 5.2], - [6., 6.1, 6.2], - [7., 7.1, 7.2], - [8., 8.1, 8.2]]] + self.input_3_dim_arr = [[[1., 1.1, 1.2], [2., 2.1, 2.2], [3., 3.1, 3.2], + [4., 4.1, 4.2]], [[5., 5.1, 5.2], [6., 6.1, 6.2], + [7., 7.1, 7.2], [8., 8.1, 8.2]]] self.input_3_dim = constant_op.constant(self.input_3_dim_arr) assert not ops.get_collection(ops.GraphKeys.SUMMARIES) @@ -3932,15 +3937,10 @@ class LegacyFullyConnectedTest(test.TestCase): self._custom_initializers(self.input, 2, [[13.0, 13.0], [11.0, 11.0]]) def test_custom_initializers_multi_dim(self): - self._custom_initializers(self.input_3_dim, 2, - [[[7.6, 7.6], - [13.6, 13.6], - [19.6, 19.6], - [25.6, 25.6]], - [[31.6, 31.6], - [37.6, 37.6], - [43.6, 43.6], - [49.6, 49.6]]]) + self._custom_initializers( + self.input_3_dim, 2, + [[[7.6, 7.6], [13.6, 13.6], [19.6, 19.6], [25.6, 25.6]], + [[31.6, 31.6], [37.6, 37.6], [43.6, 43.6], [49.6, 49.6]]]) def test_custom_collections(self): layers_lib.legacy_relu( @@ -4050,12 +4050,16 @@ class LegacyFullyConnectedTest(test.TestCase): with self.test_session() as sess: variables_lib.global_variables_initializer().run() # we can feed in input with first dimension 2 - shape_value = sess.run(array_ops.shape(y), - feed_dict={x: self.input_3_dim_arr}) + shape_value = sess.run( + array_ops.shape(y), feed_dict={ + x: self.input_3_dim_arr + }) self.assertAllClose(shape_value, [2, 4, 1]) # we can feed in input with first dimension 1 - shape_value = sess.run(array_ops.shape(y), - feed_dict={x: [self.input_3_dim_arr[0]]}) + shape_value = sess.run( + array_ops.shape(y), feed_dict={ + x: [self.input_3_dim_arr[0]] + }) self.assertAllClose(shape_value, [1, 4, 1]) # we cannot feed in input with inconsistent dimensions with self.assertRaises(ValueError): diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py index 71978d43944..18bf16e246b 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/base.py +++ 
b/tensorflow/contrib/learn/python/learn/datasets/base.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Base utilities for loading datasets.""" from __future__ import absolute_import @@ -100,9 +99,7 @@ def load_iris(data_path=None): module_path = path.dirname(__file__) data_path = path.join(module_path, 'data', 'iris.csv') return load_csv_with_header( - data_path, - target_dtype=np.int, - features_dtype=np.float) + data_path, target_dtype=np.int, features_dtype=np.float) def load_boston(data_path=None): @@ -118,16 +115,10 @@ def load_boston(data_path=None): module_path = path.dirname(__file__) data_path = path.join(module_path, 'data', 'boston_house_prices.csv') return load_csv_with_header( - data_path, - target_dtype=np.float, - features_dtype=np.float) + data_path, target_dtype=np.float, features_dtype=np.float) -def retry(initial_delay, - max_delay, - factor=2.0, - jitter=0.25, - is_retriable=None): +def retry(initial_delay, max_delay, factor=2.0, jitter=0.25, is_retriable=None): """Simple decorator for wrapping retriable functions. Args: @@ -152,7 +143,7 @@ def retry(initial_delay, def delays(): delay = initial_delay while delay <= max_delay: - yield delay * random.uniform(1 - jitter, 1 + jitter) + yield delay * random.uniform(1 - jitter, 1 + jitter) delay *= factor def wrap(fn): @@ -172,7 +163,9 @@ def retry(initial_delay, else: raise return fn(*args, **kwargs) + return wrapped_fn + return wrap diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py index 1f3295747e1..37f9175015a 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py +++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Functions for downloading and reading MNIST data.""" from __future__ import absolute_import @@ -123,8 +122,8 @@ class DataSet(object): numpy.random.seed(seed1 if seed is None else seed2) dtype = dtypes.as_dtype(dtype).base_dtype if dtype not in (dtypes.uint8, dtypes.float32): - raise TypeError('Invalid image dtype %r, expected uint8 or float32' % - dtype) + raise TypeError( + 'Invalid image dtype %r, expected uint8 or float32' % dtype) if fake_data: self._num_examples = 10000 self.one_hot = one_hot @@ -202,7 +201,9 @@ class DataSet(object): end = self._index_in_epoch images_new_part = self._images[start:end] labels_new_part = self._labels[start:end] - return numpy.concatenate((images_rest_part, images_new_part), axis=0) , numpy.concatenate((labels_rest_part, labels_new_part), axis=0) + return numpy.concatenate( + (images_rest_part, images_new_part), axis=0), numpy.concatenate( + (labels_rest_part, labels_new_part), axis=0) else: self._index_in_epoch += batch_size end = self._index_in_epoch @@ -257,16 +258,14 @@ def read_data_sets(train_dir, test_labels = extract_labels(f, one_hot=one_hot) if not 0 <= validation_size <= len(train_images): - raise ValueError( - 'Validation size should be between 0 and {}. Received: {}.' - .format(len(train_images), validation_size)) + raise ValueError('Validation size should be between 0 and {}. Received: {}.' 
+ .format(len(train_images), validation_size)) validation_images = train_images[:validation_size] validation_labels = train_labels[:validation_size] train_images = train_images[validation_size:] train_labels = train_labels[validation_size:] - options = dict(dtype=dtype, reshape=reshape, seed=seed) train = DataSet(train_images, train_labels, **options) diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py index 649996c49cc..9a843168c27 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py +++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py @@ -151,7 +151,7 @@ def spirals(n_samples=100, # Add more points if n_samples is not divisible by n_classes (unbalanced!) extras = n_samples % n_classes if extras > 0: - x_exrta, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args, + x_extra, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args, **kwargs) spir_x = np.append(spir_x, x_extra) spir_y = np.append(spir_y, y_extra) diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py index 5340afab46e..5809995c8c7 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py +++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py @@ -24,12 +24,14 @@ from tensorflow.python.platform import test from tensorflow.contrib.learn.python.learn import datasets from tensorflow.contrib.learn.python.learn.datasets import synthetic + class SyntheticTest(test.TestCase): """Test synthetic dataset generation""" def test_make_dataset(self): """Test if the synthetic routine wrapper complains about the name""" - self.assertRaises(ValueError, datasets.make_dataset, name='_non_existing_name') + self.assertRaises( + ValueError, datasets.make_dataset, name='_non_existing_name') def test_all_datasets_callable(self): """Test if all methods inside the `SYNTHETIC` are callable""" @@ -52,9 +54,10 @@ class SyntheticTest(test.TestCase): """ n_samples = 100 n_classes = 2 - circ = synthetic.circles(n_samples = n_samples, noise = None, n_classes = n_classes) + circ = synthetic.circles( + n_samples=n_samples, noise=None, n_classes=n_classes) self.assertIsInstance(circ, datasets.base.Dataset) - self.assertTupleEqual(circ.data.shape, (n_samples,2)) + self.assertTupleEqual(circ.data.shape, (n_samples, 2)) self.assertTupleEqual(circ.target.shape, (n_samples,)) self.assertSetEqual(set(circ.target), set(range(n_classes))) @@ -67,17 +70,24 @@ class SyntheticTest(test.TestCase): """ seed = 42 noise = 0.1 - circ0 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed) - circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed) + circ0 = synthetic.circles( + n_samples=100, noise=noise, n_classes=2, seed=seed) + circ1 = synthetic.circles( + n_samples=100, noise=noise, n_classes=2, seed=seed) np.testing.assert_array_equal(circ0.data, circ1.data) np.testing.assert_array_equal(circ0.target, circ1.target) - circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed+1) - self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data) - self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.target, circ1.target) + circ1 = synthetic.circles( + n_samples=100, noise=noise, n_classes=2, seed=seed + 1) + self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, + circ1.data) 
+ self.assertRaises(AssertionError, np.testing.assert_array_equal, + circ0.target, circ1.target) - circ1 = synthetic.circles(n_samples = 100, noise = noise/2., n_classes = 2, seed = seed) - self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data) + circ1 = synthetic.circles( + n_samples=100, noise=noise / 2., n_classes=2, seed=seed) + self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, + circ1.data) def test_spirals(self): """Test if the circles are generated correctly @@ -89,13 +99,14 @@ class SyntheticTest(test.TestCase): - returned `target` shape is (n_samples,) - set of unique classes range is [0, n_classes) """ - self.assertRaises(ValueError, synthetic.spirals, mode='_unknown_mode_spiral_') + self.assertRaises( + ValueError, synthetic.spirals, mode='_unknown_mode_spiral_') n_samples = 100 modes = ('archimedes', 'bernoulli', 'fermat') for mode in modes: - spir = synthetic.spirals(n_samples = n_samples, noise = None, mode = mode) + spir = synthetic.spirals(n_samples=n_samples, noise=None, mode=mode) self.assertIsInstance(spir, datasets.base.Dataset) - self.assertTupleEqual(spir.data.shape, (n_samples,2)) + self.assertTupleEqual(spir.data.shape, (n_samples, 2)) self.assertTupleEqual(spir.target.shape, (n_samples,)) self.assertSetEqual(set(spir.target), set(range(2))) @@ -110,18 +121,24 @@ class SyntheticTest(test.TestCase): noise = 0.1 modes = ('archimedes', 'bernoulli', 'fermat') for mode in modes: - spir0 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed) - spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed) + spir0 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed) + spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed) np.testing.assert_array_equal(spir0.data, spir1.data) np.testing.assert_array_equal(spir0.target, spir1.target) - spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed+1) - self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data) - self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.target, spir1.target) + spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed + 1) + self.assertRaises(AssertionError, np.testing.assert_array_equal, + spir0.data, spir1.data) + self.assertRaises(AssertionError, np.testing.assert_array_equal, + spir0.target, spir1.target) - spir1 = synthetic.spirals(n_samples = 1000, noise = noise/2., seed = seed) - self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data) + spir1 = synthetic.spirals(n_samples=1000, noise=noise / 2., seed=seed) + self.assertRaises(AssertionError, np.testing.assert_array_equal, + spir0.data, spir1.data) + + def test_spirals_synthetic(self): + synthetic.spirals(3) -if __name__ == "__main__": +if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py index 6b125534a42..b968aeed1b7 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py @@ -44,7 +44,6 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test from tensorflow.python.training import input as input_lib - NUM_EXAMPLES = 100 N_CLASSES = 5 # Cardinality of multiclass labels. LABEL_DIMENSION = 3 # Dimensionality of regression labels. @@ -52,8 +51,10 @@ LABEL_DIMENSION = 3 # Dimensionality of regression labels. 
def _train_test_split(features_and_labels): features, labels = features_and_labels - train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)]) - test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):]) + train_set = (features[:int(len(features) / 2)], + labels[:int(len(features) / 2)]) + test_set = (features[int(len(features) / 2):], + labels[int(len(features) / 2):]) return train_set, test_set @@ -86,17 +87,17 @@ class DebugClassifierTest(test.TestCase): (train_features, train_labels), (test_features, test_labels) = _train_test_split( [self.features, self.labels]) - majority_class, _ = max(collections.Counter(train_labels).items(), - key=operator.itemgetter(1)) + majority_class, _ = max( + collections.Counter(train_labels).items(), key=operator.itemgetter(1)) expected_prediction = np.vstack( [[majority_class] for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=N_CLASSES) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) - pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features, - None)) + pred = classifier.predict_classes( + input_fn=_input_fn_builder(test_features, None)) self.assertAllEqual(expected_prediction, np.vstack(pred)) def testPredictBinary(self): @@ -105,34 +106,34 @@ class DebugClassifierTest(test.TestCase): test_labels) = _train_test_split( [self.features, self.binary_labels]) - majority_class, _ = max(collections.Counter(train_labels).items(), - key=operator.itemgetter(1)) + majority_class, _ = max( + collections.Counter(train_labels).items(), key=operator.itemgetter(1)) expected_prediction = np.vstack( [[majority_class] for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=2) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) - pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features, - None)) + pred = classifier.predict_classes( + input_fn=_input_fn_builder(test_features, None)) self.assertAllEqual(expected_prediction, np.vstack(pred)) - (train_features, train_labels), ( - test_features, test_labels) = _train_test_split( - [self.features, self.binary_float_labels]) + (train_features, + train_labels), (test_features, test_labels) = _train_test_split( + [self.features, self.binary_float_labels]) - majority_class, _ = max(collections.Counter(train_labels).items(), - key=operator.itemgetter(1)) + majority_class, _ = max( + collections.Counter(train_labels).items(), key=operator.itemgetter(1)) expected_prediction = np.vstack( [[majority_class] for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=2) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) - pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features, - None)) + pred = classifier.predict_classes( + input_fn=_input_fn_builder(test_features, None)) self.assertAllEqual(expected_prediction, np.vstack(pred)) def testPredictProba(self): @@ -150,8 +151,8 @@ class DebugClassifierTest(test.TestCase): [class_distribution for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=N_CLASSES) - classifier.fit(input_fn=_input_fn_builder(train_features, 
train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) pred = classifier.predict_proba( input_fn=_input_fn_builder(test_features, None)) @@ -173,17 +174,17 @@ class DebugClassifierTest(test.TestCase): [class_distribution for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=2) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) pred = classifier.predict_proba( input_fn=_input_fn_builder(test_features, None)) self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1) - (train_features, train_labels), ( - test_features, test_labels) = _train_test_split( - [self.features, self.binary_float_labels]) + (train_features, + train_labels), (test_features, test_labels) = _train_test_split( + [self.features, self.binary_float_labels]) class_distribution = np.zeros((1, 2)) for label in train_labels: @@ -194,8 +195,8 @@ class DebugClassifierTest(test.TestCase): [class_distribution for _ in range(test_labels.shape[0])]) classifier = debug.DebugClassifier(n_classes=2) - classifier.fit(input_fn=_input_fn_builder(train_features, train_labels), - steps=50) + classifier.fit( + input_fn=_input_fn_builder(train_features, train_labels), steps=50) pred = classifier.predict_proba( input_fn=_input_fn_builder(test_features, None)) @@ -232,13 +233,12 @@ class DebugClassifierTest(test.TestCase): def _input_fn(): iris = test_data.prepare_iris_data_for_logistic_regression() return { - 'feature': constant_op.constant( - iris.data, dtype=dtypes.float32) + 'feature': constant_op.constant(iris.data, dtype=dtypes.float32) }, constant_op.constant( iris.target, shape=[100], dtype=dtypes.int32) - classifier = debug.DebugClassifier(config=run_config.RunConfig( - tf_random_seed=1)) + classifier = debug.DebugClassifier( + config=run_config.RunConfig(tf_random_seed=1)) classifier.fit(input_fn=_input_fn, steps=5) scores = classifier.evaluate(input_fn=_input_fn, steps=1) self.assertIn('loss', scores) @@ -342,8 +342,7 @@ class DebugClassifierTest(test.TestCase): def _input_fn(): iris = base.load_iris() return { - 'feature': constant_op.constant( - iris.data, dtype=dtypes.float32) + 'feature': constant_op.constant(iris.data, dtype=dtypes.float32) }, constant_op.constant( iris.target, shape=[150], dtype=dtypes.int32) @@ -387,7 +386,9 @@ class DebugClassifierTest(test.TestCase): # Create 4 rows, one of them (y = x), three of them (y=Not(x)) # The logistic prediction should be (y = 0.25). labels = constant_op.constant([[1], [0], [0], [0]]) - features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),} + features = { + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), + } return features, labels classifier = debug.DebugClassifier(n_classes=2) @@ -404,8 +405,7 @@ class DebugClassifierTest(test.TestCase): # The logistic prediction should be (y = 0.25). labels = constant_op.constant([[1.], [0.], [0.], [0.]]) features = { - 'x': array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), 'w': constant_op.constant([[1.], [1.], [1.], [1.]]) } return features, labels @@ -414,8 +414,7 @@ class DebugClassifierTest(test.TestCase): # 4 rows, with different weights. 
labels = constant_op.constant([[1.], [0.], [0.], [0.]]) features = { - 'x': array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), 'w': constant_op.constant([[7.], [1.], [1.], [1.]]) } return features, labels @@ -438,8 +437,7 @@ class DebugClassifierTest(test.TestCase): # than (y=Not(x)) due to the relative higher weight of the first row. labels = constant_op.constant([[1], [0], [0], [0]]) features = { - 'x': array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), 'w': constant_op.constant([[100.], [3.], [2.], [2.]]) } return features, labels @@ -448,8 +446,7 @@ class DebugClassifierTest(test.TestCase): # Create 4 rows (y = x) labels = constant_op.constant([[1], [1], [1], [1]]) features = { - 'x': array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), 'w': constant_op.constant([[1.], [1.], [1.], [1.]]) } return features, labels @@ -469,8 +466,7 @@ class DebugClassifierTest(test.TestCase): features = { 'x': input_lib.limit_epochs( - array_ops.ones( - shape=[4, 1], dtype=dtypes.float32), + array_ops.ones(shape=[4, 1], dtype=dtypes.float32), num_epochs=num_epochs), } return features, labels @@ -578,12 +574,11 @@ class DebugClassifierTest(test.TestCase): language = feature_column.sparse_column_with_hash_bucket('language', 100) feature_columns = [ feature_column.real_valued_column('age'), - feature_column.embedding_column( - language, dimension=1) + feature_column.embedding_column(language, dimension=1) ] - classifier = debug.DebugClassifier(config=run_config.RunConfig( - tf_random_seed=1)) + classifier = debug.DebugClassifier( + config=run_config.RunConfig(tf_random_seed=1)) classifier.fit(input_fn=input_fn, steps=5) def default_input_fn(unused_estimator, examples): @@ -614,8 +609,8 @@ class DebugRegressorTest(test.TestCase): classifier.fit( input_fn=_input_fn_builder(train_features, train_labels), steps=50) - pred = classifier.predict_scores(input_fn=_input_fn_builder(test_features, - None)) + pred = classifier.predict_scores( + input_fn=_input_fn_builder(test_features, None)) self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1) def testExperimentIntegration(self): @@ -698,7 +693,9 @@ class DebugRegressorTest(test.TestCase): # Create 4 rows, one of them (y = x), three of them (y=Not(x)) # The algorithm should learn (y = 0.25). 
labels = constant_op.constant([[1.], [0.], [0.], [0.]]) - features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),} + features = { + 'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32), + } return features, labels regressor = debug.DebugRegressor( @@ -853,5 +850,6 @@ class DebugRegressorTest(test.TestCase): predictions2 = list(regressor2.predict_scores(input_fn=predict_input_fn)) self.assertAllClose(predictions, predictions2) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py index 12f9bba531a..2bd57597c2e 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py @@ -1224,7 +1224,7 @@ class DNNRegressorTest(test.TestCase): self, predictions, expected_shape): predictions_nparray = np.array(predictions) self.assertAllEqual(expected_shape, predictions_nparray.shape) - self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.float)) + self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.floating)) def testPredict_AsIterableFalse(self): """Tests predict method with as_iterable=False.""" diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 8d59fe66d98..63d0f1e1d45 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -600,7 +600,8 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, input_fn=None, batch_size=None, outputs=None, - as_iterable=True): + as_iterable=True, + iterate_batches=False): """Returns predictions for given features. Args: @@ -616,6 +617,9 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, for each example until inputs are exhausted. Note: The inputs must terminate if you want the iterable to terminate (e.g. be sure to pass num_epochs=1 if you are using something like read_batch_features). + iterate_batches: If True, yield the whole batch at once instead of + decomposing the batch into individual samples. Only relevant when + as_iterable is True. Returns: A numpy array of predicted classes or regression values if the @@ -635,7 +639,8 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, input_fn=input_fn, feed_fn=feed_fn, outputs=outputs, - as_iterable=as_iterable) + as_iterable=as_iterable, + iterate_batches=iterate_batches) def get_variable_value(self, name): """Returns value of the variable given by name. 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py index 9d7c1a099aa..d4a46b41d0c 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py @@ -41,7 +41,6 @@ from tensorflow.python.platform import test from tensorflow.python.training import input as input_lib from tensorflow.python.training import queue_runner_impl - _BOSTON_INPUT_DIM = 13 _IRIS_INPUT_DIM = 4 @@ -93,8 +92,8 @@ def boston_eval_fn(): constant_op.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM]) labels = array_ops.reshape( constant_op.constant(boston.target), [n_examples, 1]) - return array_ops.concat([features, features], 0), array_ops.concat( - [labels, labels], 0) + return array_ops.concat([features, features], + 0), array_ops.concat([labels, labels], 0) def extract(data, key): @@ -129,7 +128,10 @@ def linear_model_fn(features, labels, mode): (_, features), = features.items() prediction, loss = (models.linear_regression_zero_init(features, labels)) train_op = optimizers.optimize_loss( - loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1) + loss, + training_util.get_global_step(), + optimizer='Adagrad', + learning_rate=0.1) return prediction, loss, train_op @@ -139,7 +141,10 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode): model_fn.ModeKeys.INFER) prediction, loss = (models.linear_regression_zero_init(features, labels)) train_op = optimizers.optimize_loss( - loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1) + loss, + training_util.get_global_step(), + optimizer='Adagrad', + learning_rate=0.1) return model_fn.ModelFnOps( mode=mode, predictions=prediction, loss=loss, train_op=train_op) @@ -150,7 +155,10 @@ def logistic_model_no_mode_fn(features, labels): labels = array_ops.one_hot(labels, 3, 1, 0) prediction, loss = (models.logistic_regression_zero_init(features, labels)) train_op = optimizers.optimize_loss( - loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1) + loss, + training_util.get_global_step(), + optimizer='Adagrad', + learning_rate=0.1) return { 'class': math_ops.argmax(prediction, 1), 'prob': prediction @@ -173,7 +181,9 @@ class EstimatorInputTest(test.TestCase): scores = est.evaluate( x=boston_input, y=float64_target, - metrics={'MSE': metric_ops.streaming_mean_squared_error}) + metrics={ + 'MSE': metric_ops.streaming_mean_squared_error + }) del est # Create another estimator object with the same output dir. 
est2 = estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir) @@ -182,7 +192,9 @@ class EstimatorInputTest(test.TestCase): scores2 = est2.evaluate( x=boston_input, y=float64_target, - metrics={'MSE': metric_ops.streaming_mean_squared_error}) + metrics={ + 'MSE': metric_ops.streaming_mean_squared_error + }) self.assertAllClose(scores2['MSE'], scores['MSE']) predictions = np.array(list(est2.predict(x=boston_input))) other_score = _sklearn.mean_squared_error(predictions, @@ -197,7 +209,9 @@ class EstimatorInputTest(test.TestCase): scores = est.score( x=boston.data, y=float64_labels, - metrics={'MSE': metric_ops.streaming_mean_squared_error}) + metrics={ + 'MSE': metric_ops.streaming_mean_squared_error + }) predictions = np.array(list(est.predict(x=boston.data))) other_score = _sklearn.mean_squared_error(predictions, boston.target) self.assertAllClose(scores['MSE'], other_score) @@ -213,7 +227,9 @@ class EstimatorInputTest(test.TestCase): scores = est.evaluate( x=boston_input, y=float64_target, - metrics={'MSE': metric_ops.streaming_mean_squared_error}) + metrics={ + 'MSE': metric_ops.streaming_mean_squared_error + }) predictions = np.array(list(est.predict(x=boston_input))) other_score = _sklearn.mean_squared_error(predictions, boston.target) self.assertAllClose(other_score, scores['MSE']) @@ -228,14 +244,15 @@ class EstimatorInputTest(test.TestCase): scores = est.score( x=iris.data, y=iris.target, - metrics={('accuracy', 'class'): metric_ops.streaming_accuracy}) + metrics={ + ('accuracy', 'class'): metric_ops.streaming_accuracy + }) predictions = est.predict(x=iris.data) predictions_class = est.predict(x=iris.data, outputs=['class'])['class'] self.assertEqual(predictions['prob'].shape[0], iris.target.shape[0]) self.assertAllClose(predictions['class'], predictions_class) - self.assertAllClose( - predictions['class'], np.argmax( - predictions['prob'], axis=1)) + self.assertAllClose(predictions['class'], + np.argmax(predictions['prob'], axis=1)) other_score = _sklearn.accuracy_score(iris.target, predictions['class']) self.assertAllClose(scores['accuracy'], other_score) self.assertTrue('global_step' in scores) @@ -250,17 +267,18 @@ class EstimatorInputTest(test.TestCase): scores = est.evaluate( x=iris_data, y=iris_target, - metrics={('accuracy', 'class'): metric_ops.streaming_accuracy}) + metrics={ + ('accuracy', 'class'): metric_ops.streaming_accuracy + }) predictions = list(est.predict(x=iris_data)) predictions_class = list(est.predict(x=iris_data, outputs=['class'])) self.assertEqual(len(predictions), iris.target.shape[0]) classes_batch = np.array([p['class'] for p in predictions]) self.assertAllClose(classes_batch, np.array([p['class'] for p in predictions_class])) - self.assertAllClose( - classes_batch, - np.argmax( - np.array([p['prob'] for p in predictions]), axis=1)) + self.assertAllClose(classes_batch, + np.argmax( + np.array([p['prob'] for p in predictions]), axis=1)) other_score = _sklearn.accuracy_score(iris.target, classes_batch) self.assertAllClose(other_score, scores['accuracy']) self.assertTrue('global_step' in scores) diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py index 656d68b7688..ac2d10011e2 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py @@ -57,7 +57,10 @@ def _logistic_regression_model_fn(features, labels, mode): 
predictions = math_ops.sigmoid(logits) loss = losses.sigmoid_cross_entropy(labels, logits) train_op = optimizers.optimize_loss( - loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1) + loss, + training_util.get_global_step(), + optimizer='Adagrad', + learning_rate=0.1) return predictions, loss, train_op diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py index 66e15265171..8f6cd39864b 100644 --- a/tensorflow/contrib/learn/python/learn/evaluable.py +++ b/tensorflow/contrib/learn/python/learn/evaluable.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """`Evaluable` interface.""" from __future__ import absolute_import @@ -59,9 +58,12 @@ class Evaluable(object): for which this evaluation was performed. Args: - x: Matrix of shape [n_samples, n_features...] or dictionary of many matrices - containing the input samples for fitting the model. Can be iterator that returns - arrays of features or dictionary of array of features. If set, `input_fn` must + x: Matrix of shape [n_samples, n_features...] or dictionary of many + matrices + containing the input samples for fitting the model. Can be iterator that + returns + arrays of features or dictionary of array of features. If set, + `input_fn` must be `None`. y: Vector or matrix [n_samples] or [n_samples, n_outputs] containing the label values (class labels in classification, real numbers in diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py index 9576ff21c24..bec976afd27 100644 --- a/tensorflow/contrib/learn/python/learn/experiment.py +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """Experiment class collecting information needed for a single training run.""" from __future__ import absolute_import @@ -43,7 +42,6 @@ from tensorflow.python.training import saver from tensorflow.python.training import server_lib from tensorflow.python.util import compat - __all__ = ["Experiment"] @@ -278,8 +276,7 @@ class Experiment(object): self._train_steps_per_iteration = train_steps_per_iteration if (self._train_steps_per_iteration is not None and not isinstance(self._train_steps_per_iteration, int)): - raise ValueError( - "`train_steps_per_iteration` must be an integer.") + raise ValueError("`train_steps_per_iteration` must be an integer.") @property def estimator(self): @@ -359,9 +356,10 @@ class Experiment(object): config.cluster_spec and config.master): self._start_server() elif config.cluster_spec and config.master: - raise ValueError('For distributed runtime, Experiment class only works with' - 'tf.contrib.learn.RunConfig for now, but provided {}' - .format(type(config))) + raise ValueError( + "For distributed runtime, Experiment class only works with" + "tf.contrib.learn.RunConfig for now, but provided {}".format( + type(config))) extra_hooks = [] if delay_secs is None: @@ -414,11 +412,12 @@ class Experiment(object): logging.info("Waiting %d secs before starting eval.", delay_secs) time.sleep(delay_secs) - return self._call_evaluate(input_fn=self._eval_input_fn, - steps=self._eval_steps, - metrics=self._eval_metrics, - name=(name or "one_pass"), - hooks=self._eval_hooks) + return self._call_evaluate( + input_fn=self._eval_input_fn, + steps=self._eval_steps, + metrics=self._eval_metrics, + name=(name or "one_pass"), + hooks=self._eval_hooks) @deprecated( "2016-10-23", @@ -499,15 +498,12 @@ class Experiment(object): previous_path = None eval_result = None last_warning_time = 0 - while (not predicate_fn or - predicate_fn( - eval_result, - checkpoint_path=previous_path if eval_result else None)): + while (not predicate_fn or predicate_fn( + eval_result, checkpoint_path=previous_path if eval_result else None)): # Exit if we have already reached number of steps to train. if self._has_training_stopped(eval_result): logging.info("Exiting continuous eval, global_step=%s >= " - "train_step=%s", - eval_result[ops.GraphKeys.GLOBAL_STEP], + "train_step=%s", eval_result[ops.GraphKeys.GLOBAL_STEP], self._train_steps) return @@ -528,12 +524,13 @@ class Experiment(object): logging.warning(error_msg) last_warning_time = time.time() else: - eval_result = self._call_evaluate(input_fn=input_fn, - steps=self._eval_steps, - metrics=self._eval_metrics, - name=name, - checkpoint_path=latest_path, - hooks=self._eval_hooks) + eval_result = self._call_evaluate( + input_fn=input_fn, + steps=self._eval_steps, + metrics=self._eval_metrics, + name=name, + checkpoint_path=latest_path, + hooks=self._eval_hooks) # Ensure eval result is not None for next round of evaluation. 
if not eval_result: eval_result = {} @@ -558,8 +555,8 @@ class Experiment(object): return False global_step = eval_result.get(ops.GraphKeys.GLOBAL_STEP) - return global_step and self._train_steps and ( - global_step >= self._train_steps) + return global_step and self._train_steps and (global_step >= + self._train_steps) def continuous_eval(self, delay_secs=None, @@ -678,8 +675,7 @@ class Experiment(object): return eval_result, export_results @experimental - def continuous_train_and_eval(self, - continuous_eval_predicate_fn=None): + def continuous_train_and_eval(self, continuous_eval_predicate_fn=None): """Interleaves training and evaluation. The frequency of evaluation is controlled by the `train_steps_per_iteration` @@ -752,10 +748,9 @@ class Experiment(object): elif self._train_steps is not None: train_steps_per_iteration = int(self._train_steps / 10) - while (not predicate_fn or - predicate_fn( - eval_result, - checkpoint_path=latest_checkpoint if eval_result else None)): + while (not predicate_fn or predicate_fn( + eval_result, checkpoint_path=latest_checkpoint + if eval_result else None)): if self._has_training_stopped(eval_result): # Exits once max steps of training is satisfied. @@ -785,8 +780,7 @@ class Experiment(object): def _maybe_export(self, eval_result, checkpoint_path=None): """Export the Estimator using export_fn, if defined.""" export_dir_base = os.path.join( - compat.as_bytes(self._estimator.model_dir), - compat.as_bytes("export")) + compat.as_bytes(self._estimator.model_dir), compat.as_bytes("export")) export_results = [] for strategy in self._export_strategies: @@ -824,10 +818,11 @@ class Experiment(object): hooks=self._train_monitors, saving_listeners=self._saving_listeners) - eval_result = self._call_evaluate(input_fn=self._eval_input_fn, - steps=1, - metrics=self._eval_metrics, - name="one_pass") + eval_result = self._call_evaluate( + input_fn=self._eval_input_fn, + steps=1, + metrics=self._eval_metrics, + name="one_pass") _ = self._maybe_export(eval_result) return eval_result @@ -849,9 +844,14 @@ class Experiment(object): server.start() return server - def _call_train(self, _sentinel=None, # pylint: disable=invalid-name, - input_fn=None, steps=None, hooks=None, max_steps=None, - saving_listeners=None): + def _call_train( + self, + _sentinel=None, # pylint: disable=invalid-name, + input_fn=None, + steps=None, + hooks=None, + max_steps=None, + saving_listeners=None): if _sentinel is not None: raise ValueError("_call_train should be called with keyword args only") @@ -867,14 +867,18 @@ class Experiment(object): hooks=hooks, saving_listeners=saving_listeners) else: - return self._estimator.fit(input_fn=input_fn, - steps=steps, - max_steps=max_steps, - monitors=hooks) + return self._estimator.fit( + input_fn=input_fn, steps=steps, max_steps=max_steps, monitors=hooks) - def _call_evaluate(self, _sentinel=None, # pylint: disable=invalid-name, - input_fn=None, steps=None, metrics=None, name=None, - checkpoint_path=None, hooks=None): + def _call_evaluate( + self, + _sentinel=None, # pylint: disable=invalid-name, + input_fn=None, + steps=None, + metrics=None, + name=None, + checkpoint_path=None, + hooks=None): if _sentinel is not None: raise ValueError("_call_evaluate should be called with keyword args only") @@ -882,18 +886,20 @@ class Experiment(object): if metrics is not None: raise ValueError( "`eval_metrics` must be `None` with `tf.estimator.Estimator`") - return self._estimator.evaluate(input_fn=input_fn, - steps=steps, - name=name, - checkpoint_path=checkpoint_path, - 
hooks=hooks) + return self._estimator.evaluate( + input_fn=input_fn, + steps=steps, + name=name, + checkpoint_path=checkpoint_path, + hooks=hooks) else: - return self._estimator.evaluate(input_fn=input_fn, - steps=steps, - metrics=metrics, - name=name, - checkpoint_path=checkpoint_path, - hooks=hooks) + return self._estimator.evaluate( + input_fn=input_fn, + steps=steps, + metrics=metrics, + name=name, + checkpoint_path=checkpoint_path, + hooks=hooks) @contextlib.contextmanager diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py index f36a778b529..96be8b1bc40 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py @@ -35,6 +35,7 @@ from tensorflow.python.platform import tf_logging as logging # pylint: disable=g-multiple-import,g-bad-import-order from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels + # pylint: enable=g-multiple-import,g-bad-import-order @@ -74,11 +75,11 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None): if not y_is_dict: output_shape = out_el_shape(y_shape, n_classes) else: - output_shape = dict([ - (k, out_el_shape(v, n_classes[k] - if n_classes is not None and k in n_classes else None)) - for k, v in list(y_shape.items()) - ]) + output_shape = dict([(k, + out_el_shape(v, n_classes[k] + if n_classes is not None and + k in n_classes else None)) + for k, v in list(y_shape.items())]) return input_shape, output_shape, batch_size @@ -314,23 +315,23 @@ class DataFeeder(object): input_dtype: DType of input (or dictionary of shapes). output_dtype: DType of output (or dictionary of shapes. """ - x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance( - y, dict) + x_is_dict, y_is_dict = isinstance( + x, dict), y is not None and isinstance(y, dict) if isinstance(y, list): y = np.array(y) self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items()) ]) if x_is_dict else check_array(x, x.dtype) - self._y = None if y is None else ( - dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())]) - if y_is_dict else check_array(y, y.dtype)) + self._y = None if y is None else (dict( + [(k, check_array(v, v.dtype)) for k, v in list(y.items())]) + if y_is_dict else check_array(y, y.dtype)) # self.n_classes is not None means we're converting raw target indices # to one-hot. if n_classes is not None: if not y_is_dict: - y_dtype = (np.int64 - if n_classes is not None and n_classes > 1 else np.float32) + y_dtype = ( + np.int64 if n_classes is not None and n_classes > 1 else np.float32) self._y = (None if y is None else check_array(y, dtype=y_dtype)) self.n_classes = n_classes @@ -352,8 +353,8 @@ class DataFeeder(object): # self._output_dtype == np.float32 when y is None self._output_dtype = ( dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())]) - if y_is_dict else ( - _check_dtype(self._y.dtype) if y is not None else np.float32)) + if y_is_dict else (_check_dtype(self._y.dtype) + if y is not None else np.float32)) # self.n_classes is None means we're passing in raw target indices if n_classes is not None and y_is_dict: @@ -478,8 +479,8 @@ class DataFeeder(object): # Assign input features from random indices. 
def extract(data, indices): - return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) if - len(data.shape) == 1 else _access(data, indices)) + return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) + if len(data.shape) == 1 else _access(data, indices)) # assign labels from random indices def assign_label(data, shape, dtype, n_classes, indices): @@ -511,16 +512,18 @@ class DataFeeder(object): feed_dict[self._epoch_placeholder.name] = [self.epoch] # Take next batch of indices. - x_len = list(self._x.values())[0].shape[ - 0] if x_is_dict else self._x.shape[0] + x_len = list( + self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0] end = min(x_len, self.offset + self._batch_size) batch_indices = self.indices[self.offset:end] # adding input placeholder feed_dict.update( dict([(self._input_placeholder[k].name, extract(v, batch_indices)) - for k, v in list(self._x.items())]) if x_is_dict else - {self._input_placeholder.name: extract(self._x, batch_indices)}) + for k, v in list(self._x.items())]) if x_is_dict else { + self._input_placeholder.name: + extract(self._x, batch_indices) + }) # move offset and reset it if necessary self.offset += self._batch_size @@ -545,7 +548,8 @@ class DataFeeder(object): assign_label(v, shape, dtype, n_classes, batch_indices) }) else: - shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes + shape, dtype, n_classes = (self.output_shape, self._output_dtype, + self.n_classes) feed_dict.update({ self._output_placeholder.name: assign_label(self._y, shape, dtype, n_classes, batch_indices) @@ -621,8 +625,9 @@ class StreamingDataFeeder(DataFeeder): elif y is None: y_first_el_shape = None else: - y_first_el_shape = ([1] + list(y_first_el[0].shape if isinstance( - y_first_el, list) else y_first_el.shape)) + y_first_el_shape = ( + [1] + list(y_first_el[0].shape + if isinstance(y_first_el, list) else y_first_el.shape)) self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( x_first_el_shape, y_first_el_shape, n_classes, batch_size) @@ -683,8 +688,8 @@ class StreamingDataFeeder(DataFeeder): if shape is None: return None elif isinstance(shape, dict): - return dict([(k, np.zeros(shape[k], dtype[k])) - for k in list(shape.keys())]) + return dict( + [(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())]) else: return np.zeros(shape, dtype=dtype) diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py index 0948dee7e2f..51381a7427c 100644 --- a/tensorflow/contrib/learn/python/learn/monitors.py +++ b/tensorflow/contrib/learn/python/learn/monitors.py @@ -879,7 +879,7 @@ class GraphDump(BaseMonitor): this_output = self.data[step] if step in self.data else {} other_output = other_dump.data[step] if step in other_dump.data else {} for key in this_output: - if not isinstance(key, str) and not isinstance(key, unicode): + if not isinstance(key, six.string_types): continue if key not in other_output: raise ValueError("%s missing at step %s.", (key, step)) diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py index 972fec026f2..429b6040be2 100644 --- a/tensorflow/contrib/learn/python/learn/trainable.py +++ b/tensorflow/contrib/learn/python/learn/trainable.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """`Trainable` interface.""" from __future__ import absolute_import @@ -28,18 +27,31 @@ class Trainable(object): __metaclass__ = abc.ABCMeta @abc.abstractmethod - def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, - monitors=None, max_steps=None): + def fit(self, + x=None, + y=None, + input_fn=None, + steps=None, + batch_size=None, + monitors=None, + max_steps=None): """Trains a model given training data `x` predictions and `y` labels. Args: - x: Matrix of shape [n_samples, n_features...] or the dictionary of Matrices. - Can be iterator that returns arrays of features or dictionary of arrays of features. - The training input samples for fitting the model. If set, `input_fn` must be `None`. - y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the dictionary of same. - Can be iterator that returns array of labels or dictionary of array of labels. - The training label values (class labels in classification, real numbers in regression). - If set, `input_fn` must be `None`. Note: For classification, label values must + x: Matrix of shape [n_samples, n_features...] or the dictionary of + Matrices. + Can be iterator that returns arrays of features or dictionary of arrays + of features. + The training input samples for fitting the model. If set, `input_fn` + must be `None`. + y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the + dictionary of same. + Can be iterator that returns array of labels or dictionary of array of + labels. + The training label values (class labels in classification, real numbers + in regression). + If set, `input_fn` must be `None`. Note: For classification, label + values must be integers representing the class index (i.e. values from 0 to n_classes-1). input_fn: Input function returning a tuple of: diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD index fe2f183ac97..cea3627ed56 100644 --- a/tensorflow/contrib/linear_optimizer/BUILD +++ b/tensorflow/contrib/linear_optimizer/BUILD @@ -126,6 +126,7 @@ py_library( py_test( name = "sdca_estimator_test", srcs = ["python/sdca_estimator_test.py"], + shard_count = 4, srcs_version = "PY2AND3", deps = [ ":sdca_estimator_py", diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD index 13350c5a438..cc0e20f75ee 100644 --- a/tensorflow/contrib/lite/BUILD +++ b/tensorflow/contrib/lite/BUILD @@ -53,6 +53,8 @@ cc_test( srcs = ["arena_planner_test.cc"], deps = [ ":arena_planner", + "//tensorflow/contrib/lite/testing:util", + "//tensorflow/core:lib", "@com_google_googletest//:gtest", ], ) @@ -167,6 +169,7 @@ cc_test( deps = [ ":framework", ":string_util", + "//tensorflow/contrib/lite/testing:util", "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc index c27c327abc6..e10611e6d43 100644 --- a/tensorflow/contrib/lite/arena_planner_test.cc +++ b/tensorflow/contrib/lite/arena_planner_test.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include #include +#include "tensorflow/contrib/lite/testing/util.h" +#include "tensorflow/core/platform/logging.h" namespace tflite { namespace { @@ -464,9 +466,7 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) { } // namespace tflite int main(int argc, char** argv) { - // ::tflite::LogToStderr(); - FLAGS_logtostderr = true; - + ::tflite::LogToStderr(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h index 8338fde8acb..a1037a525c3 100644 --- a/tensorflow/contrib/lite/builtin_op_data.h +++ b/tensorflow/contrib/lite/builtin_op_data.h @@ -116,25 +116,9 @@ typedef struct { } TfLiteAddParams; typedef struct { - // Number of spatial dimensions. - // For now only NHWC is supported, and the value should always be 2. - int num_spatial_dimensions; - // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. - // For now we will fix the maximum possible number of dimensions. - int block_shape[2]; - int before_paddings[2]; - int after_paddings[2]; } TfLiteSpaceToBatchNDParams; typedef struct { - // Number of spatial dimensions. - // For now only NHWC is supported, and the value should always be 2. - int num_spatial_dimensions; - // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. - // For now we will fix the maximum possible number of dimensions. - int block_shape[2]; - int before_crops[2]; - int after_crops[2]; } TfLiteBatchToSpaceNDParams; typedef struct { @@ -204,17 +188,9 @@ typedef struct { } TfLiteGatherParams; typedef struct { - // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. - // For now we will fix the maximum possible number of dimensions. - int perm[8]; - int num_dimensions; } TfLiteTransposeParams; typedef struct { - // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. - // For now we will fix the maximum possible number of dimensions. 
- int axis[8]; - int num_axis_dimensions; bool keep_dims; } TfLiteMeanParams; diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm index 10f31bb6f17..d74e275f043 100644 --- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm +++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm @@ -225,14 +225,8 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const assert(pixelBuffer != NULL); OSType sourcePixelFormat = CVPixelBufferGetPixelFormatType(pixelBuffer); - int doReverseChannels; - if (kCVPixelFormatType_32ARGB == sourcePixelFormat) { - doReverseChannels = 1; - } else if (kCVPixelFormatType_32BGRA == sourcePixelFormat) { - doReverseChannels = 0; - } else { - assert(false); // Unknown source format - } + assert(sourcePixelFormat == kCVPixelFormatType_32ARGB || + sourcePixelFormat == kCVPixelFormatType_32BGRA); const int sourceRowBytes = (int)CVPixelBufferGetBytesPerRow(pixelBuffer); const int image_width = (int)CVPixelBufferGetWidth(pixelBuffer); diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD index 476d85c0314..d216cdf69ba 100644 --- a/tensorflow/contrib/lite/examples/label_image/BUILD +++ b/tensorflow/contrib/lite/examples/label_image/BUILD @@ -42,7 +42,10 @@ cc_library( "bitmap_helpers_impl.h", "label_image.h", ], - deps = ["//tensorflow/contrib/lite:string"], + deps = [ + "//tensorflow/contrib/lite:string", + "//tensorflow/contrib/lite/kernels:builtin_ops", + ], ) # TODO(ahentz): Test disabled as it has a memory leek from read_bmp diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h index 860e27e5ba9..471fda2ba46 100644 --- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h +++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h @@ -26,15 +26,15 @@ uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height, int* channels, Settings* s); template -void downsize(T* out, uint8_t* in, int image_height, int image_width, - int image_channels, int wanted_height, int wanted_width, - int wanted_channels, Settings* s); +void resize(T* out, uint8_t* in, int image_height, int image_width, + int image_channels, int wanted_height, int wanted_width, + int wanted_channels, Settings* s); // explicit instantiation -template void downsize(uint8_t*, unsigned char*, int, int, int, int, - int, int, Settings*); -template void downsize(float*, unsigned char*, int, int, int, int, int, - int, Settings*); +template void resize(uint8_t*, unsigned char*, int, int, int, int, + int, int, Settings*); +template void resize(float*, unsigned char*, int, int, int, int, int, + int, Settings*); } // namespace label_image } // namespace tflite diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h index 64a931082b0..33ea695dda8 100644 --- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h +++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h @@ -16,30 +16,76 @@ limitations under the License. 
#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H #define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H +#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/string_util.h" +#include "tensorflow/contrib/lite/version.h" + #include "tensorflow/contrib/lite/examples/label_image/label_image.h" namespace tflite { namespace label_image { template -void downsize(T* out, uint8_t* in, int image_height, int image_width, - int image_channels, int wanted_height, int wanted_width, - int wanted_channels, Settings* s) { - for (int y = 0; y < wanted_height; ++y) { - const int in_y = (y * image_height) / wanted_height; - uint8_t* in_row = in + (in_y * image_width * image_channels); - T* out_row = out + (y * wanted_width * wanted_channels); - for (int x = 0; x < wanted_width; ++x) { - const int in_x = (x * image_width) / wanted_width; - uint8_t* in_pixel = in_row + (in_x * image_channels); - T* out_pixel = out_row + (x * wanted_channels); - for (int c = 0; c < wanted_channels; ++c) { - if (s->input_floating) - out_pixel[c] = (in_pixel[c] - s->input_mean) / s->input_std; - else - out_pixel[c] = in_pixel[c]; - } - } +void resize(T* out, uint8_t* in, int image_height, int image_width, + int image_channels, int wanted_height, int wanted_width, + int wanted_channels, Settings* s) { + + int number_of_pixels = image_height * image_width * image_channels; + std::unique_ptr interpreter(new Interpreter); + + int base_index = 0; + + // two inputs: input and new_sizes + interpreter->AddTensors(2, &base_index); + // one output + interpreter->AddTensors(1, &base_index); + // set input and output tensors + interpreter->SetInputs({0, 1}); + interpreter->SetOutputs({2}); + + // set paramters of tensors + TfLiteQuantizationParams quant; + interpreter->SetTensorParametersReadWrite( + 0, kTfLiteFloat32, "input", + {1, image_height, image_width, image_channels}, quant); + interpreter->SetTensorParametersReadWrite(1, kTfLiteInt32, "new_size", {2}, + quant); + interpreter->SetTensorParametersReadWrite( + 2, kTfLiteFloat32, "output", + {1, wanted_height, wanted_width, wanted_channels}, quant); + + ops::builtin::BuiltinOpResolver resolver; + TfLiteRegistration* resize_op = + resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR); + interpreter->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, nullptr, + resize_op, nullptr); + + interpreter->AllocateTensors(); + + // fill input image + // in[] are integers, cannot do memcpy() directly + auto input = interpreter->typed_tensor(0); + for (int i = 0; i < number_of_pixels; i++) { + input[i] = in[i]; + } + + // fill new_sizes + interpreter->typed_tensor(1)[0] = wanted_height; + interpreter->typed_tensor(1)[1] = wanted_width; + + interpreter->Invoke(); + + auto output = interpreter->typed_tensor(2); + auto output_number_of_pixels = + wanted_height * wanted_height * wanted_channels; + + for (int i = 0; i < output_number_of_pixels; i++) { + if (s->input_floating) + out[i] = (output[i] - s->input_mean) / s->input_std; + else + out[i] = (uint8_t)output[i]; } } diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc index d7f49ad8757..a78900122ef 100644 --- a/tensorflow/contrib/lite/examples/label_image/label_image.cc +++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc @@ -151,14 +151,14 @@ void RunInference(Settings* s) { 
switch (interpreter->tensor(input)->type) { case kTfLiteFloat32: s->input_floating = true; - downsize(interpreter->typed_tensor(input), in, - image_height, image_width, image_channels, - wanted_height, wanted_width, wanted_channels, s); + resize(interpreter->typed_tensor(input), in, + image_height, image_width, image_channels, + wanted_height, wanted_width, wanted_channels, s); break; case kTfLiteUInt8: - downsize(interpreter->typed_tensor(input), in, - image_height, image_width, image_channels, - wanted_height, wanted_width, wanted_channels, s); + resize(interpreter->typed_tensor(input), in, + image_height, image_width, image_channels, + wanted_height, wanted_width, wanted_channels, s); break; default: LOG(FATAL) << "cannot handle input type " diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc index 69a597dc5a2..a8db149eaae 100644 --- a/tensorflow/contrib/lite/interpreter.cc +++ b/tensorflow/contrib/lite/interpreter.cc @@ -36,6 +36,10 @@ constexpr const int kSlotsToReserve = 128; namespace tflite { // A trivial implementation of GraphInfo around the Interpreter. +// NOTE: this interpreter info represents the subset of the +// graph that is executed according to execution plan. Thus, +// the indices are execution plan indices rather than raw node +// indices. class InterpreterInfo : public GraphInfo { public: explicit InterpreterInfo(Interpreter* interpreter) @@ -45,9 +49,12 @@ class InterpreterInfo : public GraphInfo { TfLiteTensor* tensor(size_t index) override { return interpreter_->tensor(index); } - size_t num_nodes() const override { return interpreter_->nodes_size(); } + size_t num_nodes() const override { + return interpreter_->execution_plan().size(); + } const TfLiteNode& node(size_t index) const override { - return interpreter_->node_and_registration(index)->first; + int node_index = interpreter_->execution_plan()[index]; + return interpreter_->node_and_registration(node_index)->first; } const std::vector& inputs() const override { return interpreter_->inputs(); @@ -73,7 +80,7 @@ Interpreter::Interpreter(ErrorReporter* error_reporter) // Reserve some space for the tensors to avoid excessive resizing. 
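The InterpreterInfo note above changes what GraphInfo consumers (such as the memory planner) observe: they now see only the subset of nodes selected by the execution plan, addressed by plan position rather than raw node id. A minimal sketch of that indirection, assuming the GraphInfo interface shown in this hunk; the helper name and the graph_info.h include path are illustrative only and not part of the patch:

#include <cstdio>

#include "tensorflow/contrib/lite/graph_info.h"  // assumed header for GraphInfo

// Hypothetical helper, not in the patch: walks whatever subset of the graph
// the execution plan selects. With InterpreterInfo, node(i) resolves plan
// position i to nodes_and_registration_[execution_plan_[i]] internally.
void DumpPlannedNodes(const tflite::GraphInfo& info) {
  for (size_t i = 0; i < info.num_nodes(); ++i) {
    const TfLiteNode& node = info.node(i);
    std::printf("plan position %zu: %d inputs, %d outputs\n", i,
                node.inputs->size, node.outputs->size);
  }
}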
tensors_.reserve(kSlotsToReserve); nodes_and_registration_.reserve(kSlotsToReserve); - next_node_to_prepare_ = 0; + next_execution_plan_index_to_prepare_ = 0; UseNNAPI(false); } @@ -160,7 +167,7 @@ TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector& x) { } // namespace TfLiteStatus Interpreter::AllocateTensors() { - next_node_to_prepare_ = 0; + next_execution_plan_index_to_prepare_ = 0; if (memory_planner_) { TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations()); } @@ -190,7 +197,8 @@ TfLiteStatus Interpreter::AddNodeWithParameters( &context_, CheckTensorIndices("node outputs", outputs.data(), outputs.size())); - if (node_index) *node_index = nodes_and_registration_.size(); + int new_node_index = nodes_and_registration_.size(); + if (node_index) *node_index = new_node_index; nodes_and_registration_.resize(nodes_and_registration_.size() + 1); auto& node_and_reg = nodes_and_registration_.back(); TfLiteNode& node = node_and_reg.first; @@ -213,6 +221,7 @@ TfLiteStatus Interpreter::AddNodeWithParameters( } node.builtin_data = builtin_data_deleter.release(); node_and_reg.second = *registration; + execution_plan_.push_back(new_node_index); return kTfLiteOk; } @@ -240,16 +249,19 @@ bool HasDynamicTensor(const TfLiteContext& context, return false; } -TfLiteStatus Interpreter::PrepareOpsStartingAt(int first_node, - int* last_node_prepared) { - for (int i = first_node; i < nodes_and_registration_.size(); i++) { - TfLiteNode& node = nodes_and_registration_[i].first; - const TfLiteRegistration& registration = nodes_and_registration_[i].second; +TfLiteStatus Interpreter::PrepareOpsStartingAt( + int first_execution_plan_index, int* last_execution_plan_index_prepared) { + for (int execution_plan_index = first_execution_plan_index; + execution_plan_index < execution_plan_.size(); execution_plan_index++) { + int node_index = execution_plan_[execution_plan_index]; + TfLiteNode& node = nodes_and_registration_[node_index].first; + const TfLiteRegistration& registration = + nodes_and_registration_[node_index].second; if (OpPrepare(registration, &node) == kTfLiteError) { return kTfLiteError; } - *last_node_prepared = i; + *last_execution_plan_index_prepared = execution_plan_index; // Discontinue if the node has dynamic outputs. 
Note that we don't // stop for dynamic temporary tensors since they won't affect the @@ -268,14 +280,14 @@ TfLiteStatus Interpreter::PrepareOpsAndTensors() { memory_planner_->PlanAllocations(); } - int last_node_prepared = 0; + int last_exec_plan_index_prepared = 0; - TF_LITE_ENSURE_STATUS( - PrepareOpsStartingAt(next_node_to_prepare_, &last_node_prepared)); + TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt( + next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared)); TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations( - next_node_to_prepare_, last_node_prepared)); + next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared)); - next_node_to_prepare_ = last_node_prepared + 1; + next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1; return kTfLiteOk; } @@ -291,7 +303,8 @@ TfLiteStatus Interpreter::Invoke() { TfLiteStatus status = kTfLiteOk; if (nnapi_delegate_) { - if (next_node_to_prepare_ == nodes_and_registration_.size()) { + TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors()); + if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) { TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this)); return kTfLiteOk; } else { @@ -311,13 +324,17 @@ TfLiteStatus Interpreter::Invoke() { // TODO(b/71913981): we should force recalculation in the presence of dynamic // tensors, because they may have new value which in turn may affect shapes // and allocations. - for (int i = 0; i < nodes_and_registration_.size(); i++) { - if (i == next_node_to_prepare_) { + for (int execution_plan_index = 0; + execution_plan_index < execution_plan_.size(); execution_plan_index++) { + if (execution_plan_index == next_execution_plan_index_to_prepare_) { TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors()); - TF_LITE_ENSURE(&context_, next_node_to_prepare_ >= i); + TF_LITE_ENSURE(&context_, next_execution_plan_index_to_prepare_ >= + execution_plan_index); } - TfLiteNode& node = nodes_and_registration_[i].first; - const TfLiteRegistration& registration = nodes_and_registration_[i].second; + int node_index = execution_plan_[execution_plan_index]; + TfLiteNode& node = nodes_and_registration_[node_index].first; + const TfLiteRegistration& registration = + nodes_and_registration_[node_index].second; if (OpInvoke(registration, &node) == kTfLiteError) { status = kTfLiteError; } @@ -421,6 +438,14 @@ TfLiteStatus Interpreter::SetTensorParametersReadWrite( return kTfLiteOk; } +TfLiteStatus Interpreter::SetExecutionPlan(const std::vector& new_plan) { + for (int node_index : new_plan) { + TF_LITE_ENSURE(&context_, node_index >= 0 && node_index < nodes_size()); + } + execution_plan_ = new_plan; + return kTfLiteOk; +} + TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size) { // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too. diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h index 4f732769f9f..c822557d02c 100644 --- a/tensorflow/contrib/lite/interpreter.h +++ b/tensorflow/contrib/lite/interpreter.h @@ -108,7 +108,7 @@ class Interpreter { // Adds a node with the given parameters and returns the index of the new // node in `node_index` (optionally). Interpreter will take ownership of - // `builtin_data` and destroy it with `delete`. Ownership of 'init_data' + // `builtin_data` and destroy it with `free`. Ownership of 'init_data' // remains with the caller. 
TfLiteStatus AddNodeWithParameters(const std::vector& inputs, const std::vector& outputs, @@ -166,12 +166,19 @@ class Interpreter { // Return the number of ops in the model. int nodes_size() const { return nodes_and_registration_.size(); } + // WARNING: Experimental interface, subject to change + const std::vector& execution_plan() const { return execution_plan_; } + + // WARNING: Experimental interface, subject to change + // Overrides execution plan. This bounds checks indices sent in. + TfLiteStatus SetExecutionPlan(const std::vector& new_plan); + // Get a tensor data structure. // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this // read/write access to structure TfLiteTensor* tensor(int tensor_index) { if (tensor_index >= context_.tensors_size || tensor_index < 0) - return nullptr; + return nullptr; return &context_.tensors[tensor_index]; } @@ -279,7 +286,8 @@ class Interpreter { // dynamic tensors is found or all ops have been prepared. Fill // 'last_node_prepared' with the id of the op containing dynamic tensors, or // the last in the graph. - TfLiteStatus PrepareOpsStartingAt(int first_node, int* last_node_prepared); + TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index, + int* last_execution_plan_index_prepared); // Tensors needed by the interpreter. Use `AddTensors` to add more blank // tensor entries. Note, `tensors_.data()` needs to be synchronized to the @@ -354,7 +362,14 @@ class Interpreter { // node id, and execute the node to generate the output tensor before continue // to allocate successors. This process repeats until all nodes are executed. // NOTE: this relies on the order of nodes that is in topological order. - int next_node_to_prepare_; + int next_execution_plan_index_to_prepare_; + + // WARNING: This is an experimental interface that is subject to change. + // This is a list of node indices (to index into nodes_and_registration). + // This represents a valid topological sort (dependency ordered) execution + // plan. In particular, it is valid for this ordering to contain only a + // subset of the node indices. + std::vector execution_plan_; // Whether to delegate to NN API std::unique_ptr nnapi_delegate_; diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc index edff2109430..cfda19d72cd 100644 --- a/tensorflow/contrib/lite/interpreter_test.cc +++ b/tensorflow/contrib/lite/interpreter_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
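The interpreter.h hunk above exposes the experimental execution-plan override that the interpreter_test.cc changes below exercise. A minimal usage sketch under the same assumptions as the TestExecutionPlan fixture further down (a two-node graph already built and allocated elsewhere; the function name is hypothetical and not part of the patch):

#include "tensorflow/contrib/lite/interpreter.h"

// Hypothetical caller, not in the patch: restrict execution to node 1 only.
// SetExecutionPlan() bounds-checks the indices; Invoke() then walks the
// overridden plan instead of every registered node.
TfLiteStatus RunOnlySecondNode(tflite::Interpreter* interpreter) {
  TfLiteStatus status = interpreter->SetExecutionPlan({1});
  if (status != kTfLiteOk) return status;
  return interpreter->Invoke();
}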
#include #include "tensorflow/contrib/lite/error_reporter.h" #include "tensorflow/contrib/lite/string_util.h" +#include "tensorflow/contrib/lite/testing/util.h" namespace tflite { namespace { @@ -282,6 +283,51 @@ TEST(BasicInterpreter, NoOpInterpreter) { ASSERT_EQ(interpreter.Invoke(), kTfLiteOk); } +TEST(BasicInterpreter, ResizingTensors) { + Interpreter interpreter; + ASSERT_EQ(interpreter.AddTensors(1), kTfLiteOk); + ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk); + ASSERT_EQ(interpreter.SetOutputs({0}), kTfLiteOk); + + ASSERT_EQ(interpreter.SetTensorParametersReadWrite( + 0, kTfLiteFloat32, "", {3}, TfLiteQuantizationParams()), + kTfLiteOk); + + int t = interpreter.inputs()[0]; + TfLiteTensor* tensor = interpreter.tensor(t); + + ASSERT_EQ(interpreter.ResizeInputTensor(t, {1, 2, 3}), kTfLiteOk); + EXPECT_EQ(tensor->bytes, 6 * sizeof(float)); + ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + + tensor->data.f[5] = 0.123f; + + // Changing from kTfLiteArenaRw to kTfLiteDynamic is quite complicate: we need + // to unset data.raw, otherwise Realloc will try to free that memory. + tensor->data.raw = nullptr; + tensor->allocation_type = kTfLiteDynamic; + + ASSERT_EQ(interpreter.ResizeInputTensor(t, {1, 2, 4}), kTfLiteOk); + EXPECT_EQ(tensor->bytes, 8 * sizeof(float)); + ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + + // TODO(ahentz): We shouldn't have to force reallocation, but + // ResizeInputTensor doesn't realloc dynamic tensors. Also note that + // TfLiteTensorRealloc(tensor->bytes, tensor) is a no-op. + TfLiteTensorRealloc(9 * sizeof(float), tensor); + tensor->data.f[7] = 0.123f; + + ASSERT_EQ(interpreter.ResizeInputTensor(t, {2, 2, 4}), kTfLiteOk); + EXPECT_EQ(tensor->bytes, 16 * sizeof(float)); + ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + + // TODO(ahentz): We shouldn't have to force reallocation, but + // ResizeInputTensor doesn't realloc dynamic tensors. Also note that + // TfLiteTensorRealloc(tensor->bytes, tensor) is a no-op. + TfLiteTensorRealloc(17 * sizeof(float), tensor); + tensor->data.f[15] = 0.123f; +} + TEST(BasicInterpreter, OneOpInterpreter) { Interpreter interpreter; ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk); @@ -514,13 +560,138 @@ TEST(BasicInterpreter, TestCustomErrorReporter) { ASSERT_EQ(reporter.calls, 1); } +// Test fixture that allows playing with execution plans. It creates a two +// node graph that can be executed in either [0,1] order or [1,0] order. +// The CopyOp records when it is invoked in the class member run_order_ +// so we can test whether the execution plan was honored. +class TestExecutionPlan : public ::testing::Test { + // Encapsulates the node ids and provides them to a C primitive data type + // Allocatable with placement new, but never destructed, so make sure this + // doesn't own any heap allocated data. This is then is used as op local + // data to allow access to the test fixture data. 
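The ResizingTensors test above spells out the hand-off from arena allocation to kTfLiteDynamic. Folded into one helper as a sketch only; the helper is hypothetical and mirrors the test's TODO caveats rather than resolving them:

#include <vector>

#include "tensorflow/contrib/lite/interpreter.h"

// Hypothetical helper, not in the patch: switch a tensor to kTfLiteDynamic and
// grow it, following the exact sequence used in the test above.
TfLiteStatus MakeDynamicAndResize(tflite::Interpreter* interpreter,
                                  int tensor_index,
                                  const std::vector<int>& new_dims,
                                  size_t new_bytes) {
  TfLiteTensor* tensor = interpreter->tensor(tensor_index);
  // Unset data.raw first; otherwise TfLiteTensorRealloc would try to free
  // memory still owned by the arena.
  tensor->data.raw = nullptr;
  tensor->allocation_type = kTfLiteDynamic;
  TfLiteStatus status = interpreter->ResizeInputTensor(tensor_index, new_dims);
  if (status != kTfLiteOk) return status;
  status = interpreter->AllocateTensors();
  if (status != kTfLiteOk) return status;
  // ResizeInputTensor does not realloc dynamic tensors, so force it here;
  // asking for exactly tensor->bytes would be a no-op.
  TfLiteTensorRealloc(new_bytes, tensor);
  return kTfLiteOk;
}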
+ class CallReporting { + public: + CallReporting(int node_id, std::vector* run_order) + : node_id_(node_id), run_order_(run_order) {} + + void Record() { run_order_->push_back(node_id_); } + + private: + // The node id for this particular node + int node_id_; + // A pointer to the global run-order + std::vector* run_order_; + }; + + // Build a kernel registration for an op that copies its one input + // to an output + TfLiteRegistration CopyOpRegistration() { + TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]]; + TfLiteTensor* tensor1 = &context->tensors[node->outputs->data[0]]; + TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims); + return context->ResizeTensor(context, tensor1, newSize); + }; + + reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { + CallReporting* call_reporting = + reinterpret_cast(node->builtin_data); + // Copy input data to output data. + TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]]; + TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]]; + int num = a0->dims->data[0]; + for (int i = 0; i < num; i++) { + a1->data.f[i] = a0->data.f[i]; + } + call_reporting->Record(); + return kTfLiteOk; + }; + return reg; + } + + // Adds a copy node going from tensor `input` to output tensor `output`. + // Note, input is used as the node_id. Inject run_order as op accessible + // data. Note: this is a little strange of a way to do this, but it is + // using op functionality to avoid static global variables. + void MakeCopyNode(int input, int output) { + // Ownership of call_reporting is taken by interpreter (malloc is used due + // to nodes being a C99 interface so free() is used). + TfLiteRegistration copy_op = CopyOpRegistration(); + CallReporting* call_reporting_1 = + reinterpret_cast(malloc(sizeof(CallReporting))); + new (call_reporting_1) CallReporting(input, &run_order_); + ASSERT_EQ(interpreter_.AddNodeWithParameters( + {0}, {2}, nullptr, 0, + reinterpret_cast(call_reporting_1), ©_op), + kTfLiteOk); + ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk); + } + + void SetUp() final { + // Add two inputs and two outputs that don't depend on each other + ASSERT_EQ(interpreter_.AddTensors(4), kTfLiteOk); + interpreter_.SetInputs({0, 1}); + interpreter_.SetOutputs({2, 3}); + TfLiteQuantizationParams quantized; + for (int tensor_index = 0; tensor_index < 4; tensor_index++) { + ASSERT_EQ(interpreter_.SetTensorParametersReadWrite( + tensor_index, kTfLiteFloat32, "", {3}, quantized), + kTfLiteOk); + } + + // Define two copy functions that also use the user_data to report that + // they were called. + // i.e. tensor[2] = copy(tensor[0]); tensor[3] = copy(tensor[1]); + // thus we can reorder the two nodes arbitrary and still satisfy dependency + // order. 
+ MakeCopyNode(0, 2); + MakeCopyNode(1, 3); + + ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk); + } + + protected: + Interpreter interpreter_; + + // list of node_ids that were run + std::vector run_order_; +}; + +TEST_F(TestExecutionPlan, DefaultExecutionPlan) { + // Check default order + ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk); + ASSERT_EQ(run_order_, std::vector({0, 1})); +} + +TEST_F(TestExecutionPlan, ReversedExecutionPlan) { + // Check reversed order + interpreter_.SetExecutionPlan({1, 0}); + ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk); + ASSERT_EQ(run_order_, std::vector({1, 0})); +} + +TEST_F(TestExecutionPlan, SubsetExecutionPlan) { + // Check running only node index 1 + interpreter_.SetExecutionPlan({1}); + ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk); + ASSERT_EQ(run_order_, std::vector({1})); +} + +TEST_F(TestExecutionPlan, NullExecutionPlan) { + // Check nothing executed. + interpreter_.SetExecutionPlan({}); + ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk); + ASSERT_EQ(run_order_, std::vector()); +} + } // namespace } // namespace tflite int main(int argc, char** argv) { -#ifdef OS_LINUX - FLAGS_logtostderr = true; -#endif + ::tflite::LogToStderr(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD index 4195e7553c4..a8ef0daede4 100644 --- a/tensorflow/contrib/lite/kernels/BUILD +++ b/tensorflow/contrib/lite/kernels/BUILD @@ -71,6 +71,32 @@ cc_library( ], ) +cc_library( + name = "kernel_util", + srcs = [ + "kernel_util.cc", + ], + hdrs = [ + "kernel_util.h", + ], + deps = [ + "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite:context", + "//tensorflow/contrib/lite/kernels/internal:round", + ], +) + +tf_cc_test( + name = "kernel_util_test", + size = "small", + srcs = ["kernel_util_test.cc"], + deps = [ + ":kernel_util", + "//tensorflow/contrib/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) + cc_library( name = "builtin_ops", srcs = [ @@ -78,6 +104,7 @@ cc_library( "add.cc", "basic_rnn.cc", "batch_to_space_nd.cc", + "bidirectional_sequence_rnn.cc", "concatenation.cc", "conv.cc", "depthwise_conv.cc", @@ -87,7 +114,6 @@ cc_library( "fully_connected.cc", "gather.cc", "hashtable_lookup.cc", - "kernel_util.cc", "l2norm.cc", "local_response_norm.cc", "lsh_projection.cc", @@ -111,7 +137,6 @@ cc_library( "unidirectional_sequence_rnn.cc", ], hdrs = [ - "kernel_util.h", "padding.h", "register.h", ], @@ -125,11 +150,13 @@ cc_library( }), deps = [ ":activation_functor", + ":kernel_util", ":op_macros", "//tensorflow/contrib/lite:builtin_op_data", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:string_util", "//tensorflow/contrib/lite/kernels:gemm_support", + "//tensorflow/contrib/lite/kernels/internal:kernel_utils", "//tensorflow/contrib/lite/kernels/internal:optimized", "//tensorflow/contrib/lite/kernels/internal:optimized_base", "//tensorflow/contrib/lite/kernels/internal:quantization_util", @@ -223,6 +250,7 @@ tf_cc_test( ":builtin_ops", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", ], ) @@ -263,6 +291,18 @@ tf_cc_test( ], ) +tf_cc_test( + name = "bidirectional_sequence_rnn_test", + size = "small", + srcs = ["bidirectional_sequence_rnn_test.cc"], + deps = [ + ":builtin_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + 
tf_cc_test( name = "unidirectional_sequence_rnn_test", size = "small", diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc index 8ac93bc8c8d..3c5c77815d0 100644 --- a/tensorflow/contrib/lite/kernels/activations.cc +++ b/tensorflow/contrib/lite/kernels/activations.cc @@ -15,8 +15,8 @@ limitations under the License. #include #include #include -#include #include +#include #include #include @@ -134,8 +134,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) { float* out = output->data.f; for (; in < in_end; in++, out++) *out = std::max(0.f, *in); return kTfLiteOk; - } - break; + } break; default: context->ReportError(context, "Only float32 supported currently."); return kTfLiteError; @@ -173,8 +172,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) { float* out = output->data.f; for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f); return kTfLiteOk; - } - break; + } break; default: context->ReportError(context, "Only float32 supported currently."); return kTfLiteError; @@ -192,8 +190,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { float* out = output->data.f; for (; in < in_end; in++, out++) *out = std::tanh(*in); return kTfLiteOk; - } - break; + } break; default: context->ReportError(context, "Only float32 supported currently."); return kTfLiteError; diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc index 0e10a249aba..63ea89df56b 100644 --- a/tensorflow/contrib/lite/kernels/add.cc +++ b/tensorflow/contrib/lite/kernels/add.cc @@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0; constexpr int kInputTensor2 = 1; constexpr int kOutputTensor = 0; +struct OpData { + bool requires_broadcast; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->requires_broadcast = false; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast(node->user_data); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2)); - for (int i = 0; i < NumDimensions(input1); ++i) { - TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i), - SizeOfDimension(input2, i)); + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + output->type = input2->type; + + data->requires_broadcast = !HaveSameShapes(input1, input2); + + TfLiteIntArray* output_size = nullptr; + if (data->requires_broadcast) { + TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast( + context, input1, input2, &output_size)); + } else { + output_size = TfLiteIntArrayCopy(input1->dims); } - TF_LITE_ENSURE_EQ(context, input1->type, output->type); - TF_LITE_ENSURE_EQ(context, input2->type, output->type); - - TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims); return context->ResizeTensor(context, output, output_size); } template void EvalAddFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteAddParams* params, TfLiteTensor* input1, - TfLiteTensor* input2, TfLiteTensor* output) { + TfLiteAddParams* params, const OpData* data, + TfLiteTensor* 
input1, TfLiteTensor* input2, + TfLiteTensor* output) { float output_activation_min, output_activation_max; CalculateActivationRangeFloat(params->activation, &output_activation_min, &output_activation_max); -#define TF_LITE_ADD(type) \ - type::Add(GetTensorData(input1), GetTensorDims(input1), \ - GetTensorData(input2), GetTensorDims(input2), \ - output_activation_min, output_activation_max, \ - GetTensorData(output), GetTensorDims(output)) - if (kernel_type == kReference) { - TF_LITE_ADD(reference_ops); +#define TF_LITE_ADD(type, opname) \ + type::opname(GetTensorData(input1), GetTensorDims(input1), \ + GetTensorData(input2), GetTensorDims(input2), \ + output_activation_min, output_activation_max, \ + GetTensorData(output), GetTensorDims(output)) + if (kernel_type == kReference) { + if (data->requires_broadcast) { + TF_LITE_ADD(reference_ops, BroadcastAdd); } else { - TF_LITE_ADD(optimized_ops); + TF_LITE_ADD(reference_ops, Add); + } + } else { + if (data->requires_broadcast) { + TF_LITE_ADD(optimized_ops, BroadcastAdd); + } else { + TF_LITE_ADD(optimized_ops, Add); + } } #undef TF_LITE_ADD } template void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteAddParams* params, TfLiteTensor* input1, - TfLiteTensor* input2, TfLiteTensor* output) { + TfLiteAddParams* params, const OpData* data, + TfLiteTensor* input1, TfLiteTensor* input2, + TfLiteTensor* output) { auto input1_offset = -input1->params.zero_point; auto input2_offset = -input2->params.zero_point; auto output_offset = output->params.zero_point; @@ -112,19 +141,20 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, CalculateActivationRangeUint8(params->activation, output, &output_activation_min, &output_activation_max); -#define TF_LITE_ADD(type) \ - type::BroadcastAdd( \ - left_shift, GetTensorData(input1), GetTensorDims(input1), \ - input1_offset, input1_multiplier, input1_shift, \ - GetTensorData(input2), GetTensorDims(input2), input2_offset, \ - input2_multiplier, input2_shift, output_offset, output_multiplier, \ - output_shift, output_activation_min, output_activation_max, \ - GetTensorData(output), GetTensorDims(output)); - +#define TF_LITE_ADD(type, opname) \ + type::opname(left_shift, GetTensorData(input1), \ + GetTensorDims(input1), input1_offset, input1_multiplier, \ + input1_shift, GetTensorData(input2), \ + GetTensorDims(input2), input2_offset, input2_multiplier, \ + input2_shift, output_offset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + GetTensorData(output), GetTensorDims(output)); + // The quantized version of Add doesn't support activations, so we + // always use BroadcastAdd. 
if (kernel_type == kReference) { - TF_LITE_ADD(reference_ops); + TF_LITE_ADD(reference_ops, BroadcastAdd); } else { - TF_LITE_ADD(optimized_ops); + TF_LITE_ADD(optimized_ops, BroadcastAdd); } #undef TF_LITE_ADD } @@ -132,15 +162,17 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node, template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); + OpData* data = reinterpret_cast(node->user_data); TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); if (output->type == kTfLiteFloat32) { - EvalAddFloat(context, node, params, input1, input2, output); + EvalAddFloat(context, node, params, data, input1, input2, + output); } else if (output->type == kTfLiteUInt8) { - EvalAddQuantized(context, node, params, input1, input2, + EvalAddQuantized(context, node, params, data, input1, input2, output); } else { context->ReportError(context, @@ -154,19 +186,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace add TfLiteRegistration* Register_ADD_REF() { - static TfLiteRegistration r = {nullptr, nullptr, add::Prepare, + static TfLiteRegistration r = {add::Init, add::Free, add::Prepare, add::Eval}; return &r; } TfLiteRegistration* Register_ADD_GENERIC_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, add::Prepare, + static TfLiteRegistration r = {add::Init, add::Free, add::Prepare, add::Eval}; return &r; } TfLiteRegistration* Register_ADD_NEON_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, add::Prepare, + static TfLiteRegistration r = {add::Init, add::Free, add::Prepare, add::Eval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc index 306dfc3e803..956d05bed51 100644 --- a/tensorflow/contrib/lite/kernels/add_test.cc +++ b/tensorflow/contrib/lite/kernels/add_test.cc @@ -25,10 +25,11 @@ using ::testing::ElementsAreArray; class BaseAddOpModel : public SingleOpModel { public: - BaseAddOpModel(const TensorData& input, const TensorData& output, + BaseAddOpModel(const TensorData& input1, const TensorData& input2, + const TensorData& output, ActivationFunctionType activation_type) { - input1_ = AddInput(input); - input2_ = AddInput(input); + input1_ = AddInput(input1); + input2_ = AddInput(input2); output_ = AddOutput(output); SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions, CreateAddOptions(builder_, activation_type).Union()); @@ -70,6 +71,7 @@ float GetTolerance(int min, int max) { TEST(FloatAddOpModel, NoActivation) { FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5}); @@ -78,9 +80,9 @@ TEST(FloatAddOpModel, NoActivation) { } TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) { - FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, - {TensorType_FLOAT32, {}}, - ActivationFunctionType_RELU_N1_TO_1); + FloatAddOpModel m( + {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5}); m.Invoke(); @@ -92,6 +94,7 @@ TEST(FloatAddOpModel, VariousInputShapes) { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; for (int i = 
0; i < test_shapes.size(); ++i) { FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, + {TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1}); @@ -102,6 +105,23 @@ TEST(FloatAddOpModel, VariousInputShapes) { } } +TEST(FloatAddOpModel, WithBroadcast) { + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, + {TensorType_FLOAT32, {}}, // always a scalar + {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); + m.PopulateTensor(m.input2(), {0.1}); + m.Invoke(); + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1}))) + << "With shape number " << i; + } +} + TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) { float kQuantizedTolerance = GetTolerance(-1.0, 1.0); std::vector> inputs1 = { @@ -112,6 +132,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) { {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; for (int i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0}, ActivationFunctionType_NONE); m.QuantizeAndPopulate(m.input1(), inputs1[i]); @@ -133,6 +154,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) { {-0.2, 0.6, -0.1, 0.8}}; for (int i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0}, ActivationFunctionType_RELU_N1_TO_1); m.QuantizeAndPopulate(m.input1(), inputs1[i]); @@ -150,6 +172,7 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; for (int i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0}, + {TensorType_UINT8, test_shapes[i], -3.0, 3.0}, {TensorType_UINT8, {}, -3.0, 3.0}, ActivationFunctionType_NONE); m.QuantizeAndPopulate(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); @@ -162,6 +185,25 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) { } } +TEST(QuantizedAddOpModel, QuantizedWithBroadcast) { + float kQuantizedTolerance = GetTolerance(-3.0, 3.0); + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0}, + {TensorType_UINT8, {}, -3.0, 3.0}, + {TensorType_UINT8, {}, -3.0, 3.0}, + ActivationFunctionType_NONE); + m.QuantizeAndPopulate(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); + m.QuantizeAndPopulate(m.input2(), {0.1}); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1}, + kQuantizedTolerance))) + << "With shape number " << i; + } +} + } // namespace } // namespace tflite int main(int argc, char** argv) { diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc index 3cee43c68b2..2c5074eca31 100644 --- a/tensorflow/contrib/lite/kernels/basic_rnn.cc +++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc @@ -15,14 +15,15 @@ limitations under the License. 
#include #include #include -#include #include +#include #include #include #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" +#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -76,8 +77,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2); output_size_array->data[0] = batch_size; output_size_array->data[1] = num_units; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output, - output_size_array)); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output, output_size_array)); return kTfLiteOk; } @@ -101,50 +102,20 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const int batch_size = input->dims->data[0]; const int num_units = input_weights->dims->data[0]; const int input_size = input->dims->data[1]; - const int input_weights_stride = input_weights->dims->data[1]; - const int recurrent_weights_stride = recurrent_weights->dims->data[1]; - // For each batch - for (int b = 0; b < batch_size; b++) { - // Initialize the pointer to input, output and bias. - const float* input_ptr_batch = input->data.f + b * input_size; - float* output_ptr_batch = output->data.f + b * num_units; - float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units; - - // Initialize input_weights and recurrent_weights. - const float* input_weights_ptr = input_weights->data.f; - const float* recurrent_weights_ptr = recurrent_weights->data.f; - - // Output = bias - for (int o = 0; o < num_units; o++) { - output_ptr_batch[o] = bias_ptr[o]; - } - - // Output += input * input_weights - for (int o = 0; o < num_units; o++) { - for (int i = 0; i < input_size; i++) { - output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i]; - } - input_weights_ptr += input_weights_stride; - } - - // Output += recurrent_weights * hidden_state - for (int o = 0; o < num_units; o++) { - for (int h = 0; h < num_units; h++) { - output_ptr_batch[o] += - hidden_state_ptr_batch[h] * recurrent_weights_ptr[h]; - } - recurrent_weights_ptr += recurrent_weights_stride; - } - - // Output = activation(Output) and update hidden_state - for (int o = 0; o < num_units; o++) { - output_ptr_batch[o] = - (ActivationFunctor(params->activation))(output_ptr_batch[o]); - hidden_state_ptr_batch[o] = output_ptr_batch[o]; - } - } + // Initialize the pointer to hidden state. + float* hidden_state_ptr_batch = hidden_state->data.f; + // Initialize the pointer to input and output. + const float* input_ptr_batch = input->data.f; + float* output_ptr_batch = output->data.f; + // Initialize input_weights and recurrent_weights. + const float* input_weights_ptr = input_weights->data.f; + const float* recurrent_weights_ptr = recurrent_weights->data.f; + kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr, + recurrent_weights_ptr, bias_ptr, input_size, + num_units, batch_size, params->activation, + hidden_state_ptr_batch, output_ptr_batch); return kTfLiteOk; } diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc index 5ecccb985e9..fa7ef525db4 100644 --- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc +++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc @@ -14,8 +14,8 @@ limitations under the License. 
==============================================================================*/ // Unit test for TFLite RNN op. -#include #include +#include #include #include @@ -120,8 +120,7 @@ static float rnn_golden_output[] = { 0.415153, 0.210318, 0, 0, 0, 0, 0, 2.02616, 0, 0.728256, 0.84183, 0.0907453, - 0.628881, 3.58099, 1.49974, 0 -}; + 0.628881, 3.58099, 1.49974, 0}; class RNNOpModel : public SingleOpModel { public: diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc index 0eed680fdcc..889239f9321 100644 --- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc +++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc @@ -35,12 +35,14 @@ enum KernelType { struct BatchToSpaceNDContext { BatchToSpaceNDContext(TfLiteContext* context, TfLiteNode* node) { - params = reinterpret_cast(node->builtin_data); input = GetInput(context, node, 0); + block_shape = GetInput(context, node, 1); + crops = GetInput(context, node, 2); output = GetOutput(context, node, 0); } - TfLiteBatchToSpaceNDParams* params; TfLiteTensor* input; + TfLiteTensor* block_shape; + TfLiteTensor* crops; TfLiteTensor* output; }; @@ -48,23 +50,28 @@ struct BatchToSpaceNDContext { // The 4D array need to have exactly 2 spatial dimensions. // TODO(ycling): Support arbitrary dimension in BatchToSpaceND. const int kInputDimensionNum = 4; -const int kOutputDimensionNum = 4; +const int kBlockSizeDimensionNum = 1; const int kSpatialDimensionNum = 2; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - // The 2nd tensor (block_shape) and the 3rd tensor (crops) are ignored now. - TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + BatchToSpaceNDContext* op_context) { + TfLiteIntArray* input_size = op_context->input->dims; + const int* block_shape = GetTensorData(op_context->block_shape); + const int* crops = GetTensorData(op_context->crops); - BatchToSpaceNDContext op_context(context, node); - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input), - kInputDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions, + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape), + kBlockSizeDimensionNum); + TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0], + kSpatialDimensionNum); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->crops), kSpatialDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); - const TfLiteIntArray* input_size = op_context.input->dims; - const int* block_shape = op_context.params->block_shape; + // TODO(ycling): Add crops as part of calculation. Remove check for a crops + // containing all zeroes. + TF_LITE_ENSURE_EQ(context, crops[0], 0); + TF_LITE_ENSURE_EQ(context, crops[1], 0); + TF_LITE_ENSURE_EQ(context, crops[2], 0); + TF_LITE_ENSURE_EQ(context, crops[3], 0); // Number of batch must be multiple of (block_shape[0] * block_shape[1]). 
TF_LITE_ENSURE_EQ(context, @@ -76,27 +83,48 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const int output_width = input_size->data[2] * block_shape[1]; const int output_channel_size = input_size->data[3]; - TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum); + TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size); output_size->data[0] = output_batch_size; output_size->data[1] = output_height; output_size->data[2] = output_width; output_size->data[3] = output_channel_size; - return context->ResizeTensor(context, op_context.output, output_size); + return context->ResizeTensor(context, op_context->output, output_size); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + BatchToSpaceNDContext op_context(context, node); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input), + kInputDimensionNum); + TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + + if (!IsConstantTensor(op_context.block_shape) || + !IsConstantTensor(op_context.crops)) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, &op_context); } template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { BatchToSpaceNDContext op_context(context, node); - int block_shape_dims_array[1] = {kSpatialDimensionNum}; - Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } -#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar) \ - type::BatchToSpaceND(GetTensorData(op_context.input), \ - GetTensorDims(op_context.input), \ - op_context.params->block_shape, block_shape_dims, \ - GetTensorData(op_context.output), \ +#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar) \ + type::BatchToSpaceND(GetTensorData(op_context.input), \ + GetTensorDims(op_context.input), \ + GetTensorData(op_context.block_shape), \ + GetTensorDims(op_context.block_shape), \ + GetTensorData(op_context.output), \ GetTensorDims(op_context.output)) switch (op_context.input->type) { // Already know in/out types are same. 
case kTfLiteFloat32: diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc index 3ec4efbebce..8485cde1b40 100644 --- a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc +++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc @@ -26,36 +26,76 @@ using ::testing::ElementsAreArray; class BatchToSpaceNDOpModel : public SingleOpModel { public: - BatchToSpaceNDOpModel(std::initializer_list input_shape, - std::initializer_list block_shape, - std::initializer_list before_crops, - std::initializer_list after_crops) { - input_ = AddInput(TensorType_FLOAT32); - output_ = AddOutput(TensorType_FLOAT32); - SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND, - BuiltinOptions_BatchToSpaceNDOptions, - CreateBatchToSpaceNDOptions( - builder_, builder_.CreateVector(block_shape), - builder_.CreateVector(before_crops), - builder_.CreateVector(after_crops)) - .Union()); - BuildInterpreter({input_shape}); - } - void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } + void SetBlockShape(std::initializer_list data) { + PopulateTensor(block_shape_, data); + } + + void SetCrops(std::initializer_list data) { + PopulateTensor(crops_, data); + } + std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } - private: + protected: int input_; + int block_shape_; + int crops_; int output_; }; -TEST(BatchToSpaceNDOpTest, SimpleTest) { - BatchToSpaceNDOpModel m({4, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0}); +// Tests case where block_shape and crops are const tensors. +// +// Example usage is as follows: +// BatchToSpaceNDOpConstModel m(input_shape, block_shape, crops); +// m.SetInput(input_data); +// m.Invoke(); +class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel { + public: + BatchToSpaceNDOpConstModel(std::initializer_list input_shape, + std::initializer_list block_shape, + std::initializer_list crops) { + input_ = AddInput(TensorType_FLOAT32); + block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2}); + crops_ = AddConstInput(TensorType_INT32, crops, {2, 2}); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND, + BuiltinOptions_BatchToSpaceNDOptions, + CreateBatchToSpaceNDOptions(builder_).Union()); + BuildInterpreter({input_shape}); + } +}; + +// Tests case where block_shape and crops are non-const tensors. 
+// +// Example usage is as follows: +// BatchToSpaceNDOpDynamicModel m(input_shape); +// m.SetInput(input_data); +// m.SetBlockShape(block_shape); +// m.SetPaddings(crops); +// m.Invoke(); +class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel { + public: + BatchToSpaceNDOpDynamicModel(std::initializer_list input_shape) { + input_ = AddInput(TensorType_FLOAT32); + block_shape_ = AddInput(TensorType_INT32); + crops_ = AddInput(TensorType_INT32); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND, + BuiltinOptions_BatchToSpaceNDOptions, + CreateBatchToSpaceNDOptions(builder_).Union()); + BuildInterpreter({input_shape, {2}, {2, 2}}); + } +}; + +TEST(BatchToSpaceNDOpTest, SimpleConstTest) { + BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1})); @@ -63,11 +103,35 @@ TEST(BatchToSpaceNDOpTest, SimpleTest) { 4, 8, 11, 15, 12, 16})); } +TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) { + BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + m.SetBlockShape({2, 2}); + m.SetCrops({0, 0, 0, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7, + 4, 8, 11, 15, 12, 16})); +} + TEST(BatchToSpaceNDOpTest, InvalidShapeTest) { - EXPECT_DEATH(BatchToSpaceNDOpModel({3, 2, 2, 1}, {2, 2}, {0, 0}, {0, 0}), + EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}), "Cannot allocate tensors"); } +TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) { + EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 1}), + "1 != 0"); +} + +TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) { + BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + m.SetBlockShape({2, 2}); + m.SetCrops({0, 0, 1, 0}); + EXPECT_DEATH(m.Invoke(), "1 != 0"); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc new file mode 100644 index 00000000000..aa24c1f34cd --- /dev/null +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc @@ -0,0 +1,205 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/kernels/activation_functor.h" +#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace builtin { +namespace bidirectional_sequence_rnn { + +constexpr int kInputTensor = 0; +// Forward and backward cell tensors. +constexpr int kFwWeightsTensor = 1; +constexpr int kFwRecurrentWeightsTensor = 2; +constexpr int kFwBiasTensor = 3; +constexpr int kBwWeightsTensor = 4; +constexpr int kBwRecurrentWeightsTensor = 5; +constexpr int kBwBiasTensor = 6; +// State and output tensors. +constexpr int kFwHiddenStateTensor = 0; +constexpr int kFwOutputTensor = 1; +constexpr int kBwHiddenStateTensor = 2; +constexpr int kBwOutputTensor = 3; + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + // Check we have all the inputs and outputs we need. + TF_LITE_ENSURE_EQ(context, node->inputs->size, 7); + TF_LITE_ENSURE_EQ(context, node->outputs->size, 4); + + TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; + TfLiteTensor* fw_input_weights = + &context->tensors[node->inputs->data[kFwWeightsTensor]]; + TfLiteTensor* fw_recurrent_weights = + &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]]; + TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]]; + TfLiteTensor* bw_input_weights = + &context->tensors[node->inputs->data[kBwWeightsTensor]]; + TfLiteTensor* bw_recurrent_weights = + &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]]; + TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]]; + + // Check all the parameters of tensor match within themselves and match the + // input configuration. + const int batch_size = input->dims->data[0]; + const int max_time = input->dims->data[1]; + const int fw_num_units = fw_input_weights->dims->data[0]; + const int bw_num_units = bw_input_weights->dims->data[0]; + TF_LITE_ASSERT_EQ(input->dims->data[2], fw_input_weights->dims->data[1]); + TF_LITE_ASSERT_EQ(input->dims->data[2], bw_input_weights->dims->data[1]); + TF_LITE_ASSERT_EQ(fw_input_weights->dims->data[0], fw_bias->dims->data[0]); + TF_LITE_ASSERT_EQ(bw_input_weights->dims->data[0], bw_bias->dims->data[0]); + TF_LITE_ASSERT_EQ(fw_recurrent_weights->dims->data[0], + fw_bias->dims->data[0]); + TF_LITE_ASSERT_EQ(bw_recurrent_weights->dims->data[1], + bw_bias->dims->data[0]); + + TfLiteTensor* fw_output = + &context->tensors[node->outputs->data[kFwOutputTensor]]; + TfLiteTensor* bw_output = + &context->tensors[node->outputs->data[kBwOutputTensor]]; + + // Resize hidden states. 
+ TfLiteIntArray* fw_hidden_state_size_array = TfLiteIntArrayCreate(2); + fw_hidden_state_size_array->data[0] = batch_size; + fw_hidden_state_size_array->data[1] = fw_num_units; + TfLiteTensor* fw_hidden_state = + &context->tensors[node->outputs->data[kFwHiddenStateTensor]]; + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_hidden_state, + fw_hidden_state_size_array)); + + TfLiteIntArray* bw_hidden_state_size_array = TfLiteIntArrayCreate(2); + bw_hidden_state_size_array->data[0] = batch_size; + bw_hidden_state_size_array->data[1] = fw_num_units; + TfLiteTensor* bw_hidden_state = + &context->tensors[node->outputs->data[kBwHiddenStateTensor]]; + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_hidden_state, + bw_hidden_state_size_array)); + + // Mark hidden states as a persistent tensor. + fw_hidden_state->allocation_type = kTfLiteArenaRwPersistent; + bw_hidden_state->allocation_type = kTfLiteArenaRwPersistent; + + // Resize outputs. + TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3); + fw_output_size_array->data[0] = batch_size; + fw_output_size_array->data[1] = max_time; + fw_output_size_array->data[2] = fw_num_units; + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, fw_output, fw_output_size_array)); + TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3); + bw_output_size_array->data[0] = batch_size; + bw_output_size_array->data[1] = max_time; + bw_output_size_array->data[2] = bw_num_units; + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, bw_output, bw_output_size_array)); + + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + auto* params = reinterpret_cast(node->builtin_data); + + TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]]; + TfLiteTensor* fw_input_weights = + &context->tensors[node->inputs->data[kFwWeightsTensor]]; + TfLiteTensor* fw_recurrent_weights = + &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]]; + TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]]; + TfLiteTensor* fw_hidden_state = + &context->tensors[node->outputs->data[kFwHiddenStateTensor]]; + TfLiteTensor* fw_output = + &context->tensors[node->outputs->data[kFwOutputTensor]]; + + TfLiteTensor* bw_input_weights = + &context->tensors[node->inputs->data[kBwWeightsTensor]]; + TfLiteTensor* bw_recurrent_weights = + &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]]; + TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]]; + TfLiteTensor* bw_hidden_state = + &context->tensors[node->outputs->data[kBwHiddenStateTensor]]; + TfLiteTensor* bw_output = + &context->tensors[node->outputs->data[kBwOutputTensor]]; + + const int batch_size = input->dims->data[0]; + const int max_time = input->dims->data[1]; + const int input_size = input->dims->data[2]; + + const int fw_num_units = fw_input_weights->dims->data[0]; + const float* fw_bias_ptr = fw_bias->data.f; + const float* fw_input_weights_ptr = fw_input_weights->data.f; + const float* fw_recurrent_weights_ptr = fw_recurrent_weights->data.f; + + const int bw_num_units = bw_input_weights->dims->data[0]; + const float* bw_bias_ptr = bw_bias->data.f; + const float* bw_input_weights_ptr = bw_input_weights->data.f; + const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f; + + for (int b = 0; b < batch_size; b++) { + // Forward cell. 
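+    // The forward cell walks the sequence from s = 0 to max_time - 1, one
+    // batch entry at a time, carrying fw_hidden_state across steps through
+    // RnnBatchStep; the backward cell below does the same in reverse order.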
+ float* fw_hidden_state_ptr_batch = + fw_hidden_state->data.f + b * fw_num_units; + for (int s = 0; s < max_time; s++) { + const float* input_ptr_batch = + input->data.f + b * input_size * max_time + s * input_size; + float* output_ptr_batch = + fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units; + + kernel_utils::RnnBatchStep( + input_ptr_batch, fw_input_weights_ptr, fw_recurrent_weights_ptr, + fw_bias_ptr, input_size, fw_num_units, /*batch_size=*/1, + params->activation, fw_hidden_state_ptr_batch, output_ptr_batch); + } + // Backward cell. + float* bw_hidden_state_ptr_batch = + bw_hidden_state->data.f + b * bw_num_units; + for (int s = max_time - 1; s >= 0; s--) { + const float* input_ptr_batch = + input->data.f + b * input_size * max_time + s * input_size; + float* output_ptr_batch = + bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units; + + kernel_utils::RnnBatchStep( + input_ptr_batch, bw_input_weights_ptr, bw_recurrent_weights_ptr, + bw_bias_ptr, input_size, bw_num_units, /*batch_size=*/1, + params->activation, bw_hidden_state_ptr_batch, output_ptr_batch); + } + } + return kTfLiteOk; +} + +} // namespace bidirectional_sequence_rnn + +TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() { + static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr, + bidirectional_sequence_rnn::Prepare, + bidirectional_sequence_rnn::Eval}; + return &r; +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc new file mode 100644 index 00000000000..12f4ff97cfd --- /dev/null +++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc @@ -0,0 +1,931 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Unit test for TFLite Bidirectional RNN op. 
+ +#include +#include + +#include +#include +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +static float rnn_input[] = { + 0.23689353, 0.285385, 0.037029743, -0.19858193, -0.27569133, + 0.43773448, 0.60379338, 0.35562468, -0.69424844, -0.93421471, + -0.87287879, 0.37144363, -0.62476718, 0.23791671, 0.40060222, + 0.1356622, -0.99774903, -0.98858172, -0.38952237, -0.47685933, + 0.31073618, 0.71511042, -0.63767755, -0.31729108, 0.33468103, + 0.75801885, 0.30660987, -0.37354088, 0.77002847, -0.62747043, + -0.68572164, 0.0069220066, 0.65791464, 0.35130811, 0.80834007, + -0.61777675, -0.21095741, 0.41213346, 0.73784804, 0.094794154, + 0.47791874, 0.86496925, -0.53376222, 0.85315156, 0.10288584, + 0.86684, -0.011186242, 0.10513687, 0.87825835, 0.59929144, + 0.62827742, 0.18899453, 0.31440187, 0.99059987, 0.87170351, + -0.35091716, 0.74861872, 0.17831337, 0.2755419, 0.51864719, + 0.55084288, 0.58982027, -0.47443086, 0.20875752, -0.058871567, + -0.66609079, 0.59098077, 0.73017097, 0.74604273, 0.32882881, + -0.17503482, 0.22396147, 0.19379807, 0.29120302, 0.077113032, + -0.70331609, 0.15804303, -0.93407321, 0.40182066, 0.036301374, + 0.66521823, 0.0300982, -0.7747041, -0.02038002, 0.020698071, + -0.90300065, 0.62870288, -0.23068321, 0.27531278, -0.095755219, + -0.712036, -0.17384434, -0.50593495, -0.18646687, -0.96508682, + 0.43519354, 0.14744234, 0.62589407, 0.1653645, -0.10651493, + -0.045277178, 0.99032974, -0.88255352, -0.85147917, 0.28153265, + 0.19455957, -0.55479527, -0.56042433, 0.26048636, 0.84702539, + 0.47587705, -0.074295521, -0.12287641, 0.70117295, 0.90532446, + 0.89782166, 0.79817224, 0.53402734, -0.33286154, 0.073485017, + -0.56172788, -0.044897556, 0.89964068, -0.067662835, 0.76863563, + 0.93455386, -0.6324693, -0.083922029}; + +static float rnn_golden_fw_output[] = { + 0.496726, 0, 0.965996, 0, 0.0584254, 0, + 0, 0.12315, 0, 0, 0.612266, 0.456601, + 0, 0.52286, 1.16099, 0.0291232, + + 0, 0, 0.524901, 0, 0, 0, + 0, 1.02116, 0, 1.35762, 0, 0.356909, + 0.436415, 0.0355727, 0, 0, + + 0, 0, 0, 0.262335, 0, 0, + 0, 1.33992, 0, 2.9739, 0, 0, + 1.31914, 2.66147, 0, 0, + + 0.942568, 0, 0, 0, 0.025507, 0, + 0, 0, 0.321429, 0.569141, 1.25274, 1.57719, + 0.8158, 1.21805, 0.586239, 0.25427, + + 1.04436, 0, 0.630725, 0, 0.133801, 0.210693, + 0.363026, 0, 0.533426, 0, 1.25926, 0.722707, + 0, 1.22031, 1.30117, 0.495867, + + 0.222187, 0, 0.72725, 0, 0.767003, 0, + 0, 0.147835, 0, 0, 0, 0.608758, + 0.469394, 0.00720298, 0.927537, 0, + + 0.856974, 0.424257, 0, 0, 0.937329, 0, + 0, 0, 0.476425, 0, 0.566017, 0.418462, + 0.141911, 0.996214, 1.13063, 0, + + 0.967899, 0, 0, 0, 0.0831304, 0, + 0, 1.00378, 0, 0, 0, 1.44818, + 1.01768, 0.943891, 0.502745, 0, + + 0.940135, 0, 0, 0, 0, 0, + 0, 2.13243, 0, 0.71208, 0.123918, 1.53907, + 1.30225, 1.59644, 0.70222, 0, + + 0.804329, 0, 0.430576, 0, 0.505872, 0.509603, + 0.343448, 0, 0.107756, 0.614544, 1.44549, 1.52311, + 0.0454298, 0.300267, 0.562784, 0.395095, + + 0.228154, 0, 0.675323, 0, 1.70536, 0.766217, + 0, 0, 0, 0.735363, 0.0759267, 1.91017, + 0.941888, 0, 0, 0, + + 0, 0, 1.5909, 0, 0, 0, + 0, 0.5755, 0, 0.184687, 0, 1.56296, + 0.625285, 0, 0, 0, + + 0, 0, 0.0857888, 0, 0, 0, + 0, 0.488383, 0.252786, 0, 0, 0, + 1.02817, 1.85665, 0, 0, + + 0.00981836, 0, 1.06371, 0, 0, 0, + 0, 0, 0, 0.290445, 0.316406, 0, + 0.304161, 1.25079, 
0.0707152, 0, + + 0.986264, 0.309201, 0, 0, 0, 0, + 0, 1.64896, 0.346248, 0, 0.918175, 0.78884, + 0.524981, 1.92076, 2.07013, 0.333244, + + 0.415153, 0.210318, 0, 0, 0, 0, + 0, 2.02616, 0, 0.728256, 0.84183, 0.0907453, + 0.628881, 3.58099, 1.49974, 0}; + +static float rnn_golden_bw_output[] = { + 0.496726, 0, 1.00883, 0, 0.0584256, 0, 0, + 0.236412, 0, 0, 0.612267, 0.487726, 0, 0.54883, + 1.16099, 0.0291233, 0, 0, 0.428302, 0, 0, + 0, 0, 1.13262, 0, 1.64415, 0, 0.311249, + 0.570804, 0.259696, 0, 0, 0, 0, 0, + 0.262334, 0, 0, 0, 1.23781, 0, 2.86532, + 0, 0, 1.34389, 2.76409, 0, 0, 1.03969, + 0, 0.00410865, 0, 0.0470295, 0, 0, 0, + 0.371556, 0.27175, 1.36614, 1.63956, 0.683887, 1.06176, 0.719552, + 0.301314, 0.971195, 0, 0.697143, 0, 0.215219, 0.210693, + 0.363027, 0, 0.501283, 0, 1.13399, 0.623774, 0, + 1.09851, 1.33313, 0.470441, 0.210965, 0, 0.664178, 0, + 0.839686, 0, 0, 0.147834, 0, 0, 0, + 0.58786, 0.490128, 0, 0.905806, 0, 0.932134, 0.424257, + 0, 0, 0.860629, 0, 0, 0, 0.476425, + 0, 0.566017, 0.513721, 0.207341, 1.09508, 1.08385, 0, + 0.973787, 0, 0, 0, 0, 0, 0, + 1.20698, 0, 0, 0, 1.56135, 1.12369, 0.99588, + 0.459803, 0, 0.915854, 0, 0, 0, 0, + 0, 0, 2.03206, 0, 0.773264, 0.267228, 1.55012, + 1.202, 1.51611, 0.701202, 0, 0.725088, 0, 0.509069, + 0, 0.671349, 0.581129, 0.343447, 0, 0.107755, 0.611838, + 1.4331, 1.55871, 0.015242, 0.140624, 0.492562, 0.395095, 0.147722, + 0, 0.784925, 0, 1.65477, 0.715257, 0, 0, + 0, 0.685024, 0, 1.89505, 1.00037, 0, 0, + 0, 0, 0, 1.52659, 0, 0, 0, + 0, 0.618583, 0, 0.11115, 0, 1.37194, 0.630225, + 0, 0, 0, 0, 0, 0.0322124, 0, + 0, 0, 0, 0.430834, 0.252786, 0, 0, + 0, 0.991297, 1.98451, 0, 0, 0.111511, 0, + 1.05513, 0, 0, 0, 0, 0, 0, + 0.290445, 0.412559, 0.0429958, 0.256564, 1.27858, 0.289948, 0, + 1.01693, 0.327141, 0, 0, 0, 0, 0, + 1.83508, 0.346248, 0, 0.961535, 0.790026, 0.552203, 2.13457, + 2.19233, 0.333244, 0.316526, 0.179398, 0, 0, 0, + 0, 0, 1.86126, 0, 0.728256, 0.750013, 0.011861, + 0.576383, 3.38891, 1.29273, 0}; + +constexpr std::initializer_list weights = { + 0.461459, 0.153381, 0.529743, -0.00371218, 0.676267, -0.211346, + 0.317493, 0.969689, -0.343251, 0.186423, 0.398151, 0.152399, + 0.448504, 0.317662, 0.523556, -0.323514, 0.480877, 0.333113, + -0.757714, -0.674487, -0.643585, 0.217766, -0.0251462, 0.79512, + -0.595574, -0.422444, 0.371572, -0.452178, -0.556069, -0.482188, + -0.685456, -0.727851, 0.841829, 0.551535, -0.232336, 0.729158, + -0.00294906, -0.69754, 0.766073, -0.178424, 0.369513, -0.423241, + 0.548547, -0.0152023, -0.757482, -0.85491, 0.251331, -0.989183, + 0.306261, -0.340716, 0.886103, -0.0726757, -0.723523, -0.784303, + 0.0354295, 0.566564, -0.485469, -0.620498, 0.832546, 0.697884, + -0.279115, 0.294415, -0.584313, 0.548772, 0.0648819, 0.968726, + 0.723834, -0.0080452, -0.350386, -0.272803, 0.115121, -0.412644, + -0.824713, -0.992843, -0.592904, -0.417893, 0.863791, -0.423461, + -0.147601, -0.770664, -0.479006, 0.654782, 0.587314, -0.639158, + 0.816969, -0.337228, 0.659878, 0.73107, 0.754768, -0.337042, + 0.0960841, 0.368357, 0.244191, -0.817703, -0.211223, 0.442012, + 0.37225, -0.623598, -0.405423, 0.455101, 0.673656, -0.145345, + -0.511346, -0.901675, -0.81252, -0.127006, 0.809865, -0.721884, + 0.636255, 0.868989, -0.347973, -0.10179, -0.777449, 0.917274, + 0.819286, 0.206218, -0.00785118, 0.167141, 0.45872, 0.972934, + -0.276798, 0.837861, 0.747958, -0.0151566, -0.330057, -0.469077, + 0.277308, 0.415818}; + +static float endtoend_input[] = { + 0.996808, 0.060710, 0.981855, 0.570017, 0.525164, 0.796859, 0.696547, + 
0.505925, 0.991844, 0.461208, 0.949371, 0.027624, 0.539236, 0.841854, + 0.915222, 0.538569, 0.069375, 0.237905, 0.903700, 0.441703, 0.536196, + 0.402724, 0.761635, 0.025063, 0.082592, 0.688245, 0.239310, 0.256931, + 0.658900, 0.105695, 0.301983, 0.655708, 0.166405, 0.283837, 0.225725, + 0.691569, 0.080696, 0.922272, 0.197494, 0.072540, 0.383481, 0.146865, + 0.100163, 0.922717, 0.988720, 0.015386, 0.461286, 0.058095, 0.253290, + 0.364986, 0.499797, 0.789487, 0.767709, 0.261433, 0.814549, 0.850302, + 0.949678, 0.053859, 0.107233, 0.608577, 0.159554, 0.409215, 0.264285, + 0.325960, 0.693053, 0.490011, 0.017529, 0.773749, 0.412283, 0.215023, + 0.846288, 0.795764, 0.361889, 0.946452, 0.718481, 0.350608, 0.961837, + 0.179767, 0.408703, 0.215128, 0.544753, 0.908500, 0.004614, 0.312462, + 0.169933, 0.819163, 0.162764, 0.119611, 0.873022, 0.269997, 0.728188, + 0.032576, 0.679212, 0.992474, 0.358536, 0.372265, 0.482484, 0.376065, + 0.146014, 0.894767, 0.591088, 0.992302, 0.690531, 0.952977, 0.938754, + 0.409012, 0.303585, 0.900591, 0.588780, 0.712287, 0.115719, 0.133533, + 0.620788, 0.120334, 0.445995, 0.790720, 0.939497, 0.608759, 0.910331, + 0.812519, 0.878756, 0.638519, 0.845096, 0.557968, 0.630993, 0.203632, + 0.930233, 0.113477, 0.579697, 0.076247, 0.008244, 0.170785, 0.068549, + 0.698776, 0.123761, 0.007303, 0.107788, 0.427346, 0.907894, 0.696568, + 0.139633, 0.023613, 0.830100, 0.760421, 0.143947, 0.276096, 0.551141, + 0.083444, 0.884855, 0.461472, 0.895963, 0.763611, 0.099992, 0.741059, + 0.321579, 0.730984, 0.944691, 0.251812, 0.844461, 0.524388, 0.328059, + 0.852706, 0.695172, 0.396607, 0.551482, 0.818934, 0.403910, 0.659270, + 0.246280, 0.311804, 0.355838, 0.385913, 0.335418, 0.185938, 0.146334, + 0.479364, 0.462034, 0.697475, 0.562808, 0.346888, 0.158948, 0.458771, + 0.110499, 0.258939, 0.199830, 0.432078, 0.989924, 0.144521, 0.683890, + 0.834385, 0.668908, 0.011949, 0.687091, 0.364081, 0.408556, 0.238572, + 0.183015, 0.812466, 0.897842, 0.429294, 0.124271, 0.253680, 0.815207, + 0.459688, 0.439618, 0.961541, 0.939053, 0.901651, 0.659016, 0.501861, + 0.248539, 0.817964, 0.960632, 0.359038, 0.076903, 0.160462, 0.791117, + 0.066826, 0.304983, 0.475007, 0.901211, 0.973891, 0.486955, 0.588302, + 0.337972, 0.895512, 0.826874, 0.520987, 0.707978, 0.724716, 0.950281, + 0.832249, 0.978396, 0.765488, 0.291937, 0.418014, 0.727029, 0.230990, + 0.319665, 0.386045, 0.732850, 0.568204, 0.204009, 0.693482, 0.927242, + 0.280912, 0.853944, 0.718359, 0.347738, 0.158927, 0.193366, 0.248950, + 0.132818, 0.680321, 0.837252, 0.470790, 0.575833, 0.664126, 0.991777, + 0.283811, 0.388843, 0.942058, 0.116060, 0.367239, 0.707546, 0.407997, + 0.785253, 0.434575, 0.638986, 0.104917, 0.820620, 0.371837, 0.673121, + 0.024629, 0.065319, 0.600363, 0.305541, 0.919263, 0.318722, 0.653279, + 0.078190, 0.512088, 0.902229, 0.211009, 0.192409, 0.739480, 0.681799, + 0.768242, 0.403607, 0.673576, 0.052052, 0.792450, 0.615634, 0.168112, + 0.159689, 0.323180, 0.576109, 0.944941, 0.757755, 0.215095, 0.049858, + 0.578375, 0.586932, 0.722979, 0.603003, 0.652251, 0.323343, 0.908544, + 0.571514, 0.642065, 0.561823, 0.649704, 0.154153, 0.464051, 0.860713, + 0.346562, 0.203532, 0.542512, 0.114804, 0.607139, 0.216088, 0.166856, + 0.399588, 0.831722, 0.334968, 0.559277, 0.154902, 0.911077, 0.504218, + 0.912656, 0.126172, 0.554076, 0.491031, 0.713104, 0.277055, 0.094034, + 0.365355, 0.600398, 0.002578, 0.936869, 0.242463, 0.564401, 0.586574, + 0.396616, 0.028452, 0.447287, 0.743178, 0.231984, 0.989799, 0.857982, + 0.839122, 0.205887, 
0.024838, 0.238711, 0.037608, 0.359806, 0.797987, + 0.192510, 0.270883, 0.302205, 0.105166, 0.397055, 0.856281, 0.596197, + 0.110160, 0.133336, 0.690231, 0.475515, 0.733734, 0.692809, 0.412384, + 0.976196, 0.257209, 0.998958, 0.372812, 0.285661, 0.446245, 0.115990, + 0.517645, 0.436044, 0.973972, 0.356767, 0.641930, 0.998810, 0.595478, + 0.679539, 0.358617, 0.393465, 0.872049, 0.629500, 0.695670, 0.977215, + 0.026555, 0.551951, 0.573412, 0.136715, 0.685287, 0.263643, 0.612229, + 0.419020, 0.956451, 0.024613, 0.395216, 0.213661, 0.023572, 0.768029, + 0.499322, 0.469816, 0.884019, 0.016967, 0.905860, 0.857991, 0.373734, + 0.547791, 0.856802, 0.969211, 0.227330, 0.215418, 0.362676, 0.099378, + 0.844918, 0.058346, 0.076594, 0.871473, 0.610297, 0.650006, 0.008188, + 0.295583, 0.913648, 0.620417, 0.714603, 0.870100, 0.645031, 0.109820, + 0.083760, 0.668602, 0.877849, 0.583082, 0.138419, 0.761868, 0.600049, + 0.044279, 0.619859, 0.973783, 0.592069, 0.476661, 0.942994, 0.819399, + 0.692079, 0.305670, 0.918778, 0.536997, 0.364016, 0.995371, 0.408470, + 0.974313, 0.645377, 0.416658, 0.269896, 0.559025, 0.037075, 0.984499, + 0.429125, 0.682105, 0.094319, 0.512885, 0.350707, 0.972168, 0.095967, + 0.489126, 0.734035, 0.696016, 0.533405, 0.353894, 0.669799, 0.125474, + 0.830555, 0.612793, 0.944873, 0.522634, 0.918463, 0.863651, 0.059631, + 0.282479, 0.859022, 0.468101, 0.256791, 0.504398, 0.884758, 0.526687, + 0.063423, 0.921833, 0.511186, 0.492548, 0.603939, 0.605505, 0.005433, + 0.954646, 0.577673, 0.101400, 0.443772, 0.311708, 0.797417, 0.977176, + 0.665602, 0.467216, 0.102650, 0.496157, 0.080009, 0.047524, 0.018791, + 0.998471, 0.911174, 0.078422, 0.280950, 0.770196, 0.546523, 0.537741, + 0.274594, 0.431281, 0.064428, 0.338017, 0.353115, 0.575615, 0.830565, + 0.957053, 0.181120, 0.835998, 0.911699, 0.758793, 0.937398, 0.355471, + 0.070501, 0.734815, 0.332647, 0.736103, 0.202031, 0.435297, 0.232261, + 0.282039, 0.482821, 0.251052, 0.280511, 0.393995, 0.329474, 0.561460, + 0.164191, 0.875997, 0.099202, 0.438785, 0.307278, 0.163630, 0.776802, + 0.660393, 0.739244, 0.607367, 0.617446, 0.920364, 0.443365, 0.529145, + 0.679157, 0.380763, 0.884616, 0.749658, 0.115578, 0.217263, 0.485761, + 0.317609, 0.652560, 0.718021, 0.599648, 0.135381, 0.969073, 0.880159, + 0.529376, 0.298547, 0.441619, 0.693567, 0.174544, 0.540821, 0.132351, + 0.481822, 0.704450, 0.909153, 0.142215, 0.443695, 0.516520, 0.759661, + 0.364059, 0.959885, 0.288806, 0.043216, 0.340648, 0.173422, 0.792874, + 0.456226, 0.390685, 0.278634, 0.773834, 0.043245, 0.996656, 0.373483, + 0.178625, 0.965729, 0.253641, 0.708001, 0.264276, 0.695260, 0.401568, + 0.438820, 0.236081, 0.533919, 0.920642, 0.940531, 0.443072, 0.062857, + 0.384226, 0.959592, 0.822518, 0.748285, 0.919477, 0.111325, 0.791501, + 0.260124, 0.284747, 0.584375, 0.716350, 0.675431, 0.863009, 0.490184, + 0.718676, 0.859665, 0.863666, 0.897301, 0.825393, 0.117308, 0.605302, + 0.089669, 0.812568, 0.006870, 0.528489, 0.048649, 0.540788, 0.449131, + 0.989180, 0.983860, 0.511988, 0.373407, 0.943452, 0.334506, 0.121692, + 0.862929, 0.445831, 0.913193, 0.123053, 0.730578, 0.497568, 0.839402, + 0.406009, 0.360577, 0.329586, 0.124685, 0.220241, 0.193253, 0.021986, + 0.045634, 0.310560, 0.627288, 0.135303, 0.123128, 0.634158, 0.663792, + 0.171777, 0.174946, 0.112923, 0.160958, 0.158806, 0.624911, 0.534364, + 0.102259, 0.959418, 0.656056, 0.965187, 0.405249, 0.569249, 0.088240, + 0.135827, 0.066817, 0.927642, 0.541836, 0.427393, 0.257229, 0.666520, + 0.647634, 0.450481, 0.688506, 0.693269, 
0.761042, 0.315794, 0.828572, + 0.884170, 0.949952, 0.492364, 0.055947, 0.124898, 0.605288, 0.216905, + 0.283705, 0.230199, 0.751269, 0.385963, 0.189616, 0.407326, 0.351151, + 0.594865, 0.976575, 0.439391, 0.730692, 0.043392, 0.367033, 0.272527, + 0.470785, 0.624261, 0.939048, 0.118419, 0.074743, 0.627554, 0.811688, + 0.835784, 0.943348, 0.640260, 0.719954, 0.893300, 0.132625, 0.775901, + 0.018199, 0.737913, 0.992806, 0.301903, 0.968111, 0.744076, 0.687867, + 0.157728, 0.151401, 0.039017, 0.752593, 0.127976, 0.478408, 0.483284, + 0.171368, 0.845441, 0.755811, 0.642153, 0.469702, 0.694859, 0.760572, + 0.544445, 0.322413, 0.572260, 0.380229, 0.265761, 0.212521, 0.100183, + 0.159062, 0.345146, 0.876084, 0.177261, 0.083058, 0.868891, 0.479164, + 0.051169, 0.612966, 0.167030, 0.208897, 0.764367, 0.206048, 0.961490, + 0.892343, 0.684456, 0.444774, 0.063711, 0.529896, 0.200585, 0.705863, + 0.999598, 0.895444, 0.466435, 0.544043, 0.217857, 0.038696, 0.924272, + 0.483618, 0.251217, 0.024455, 0.642680, 0.596362, 0.900539, 0.819941, + 0.679420, 0.769430, 0.299105, 0.730590, 0.382396, 0.466135, 0.939487, + 0.146763, 0.672183, 0.900977, 0.039106, 0.356638, 0.345750, 0.102817, + 0.886535, 0.546336, 0.808681, 0.886133, 0.441780, 0.275116, 0.430176, + 0.659637, 0.313812, 0.354448, 0.143255, 0.565028, 0.378903, 0.785935, + 0.161391, 0.279443, 0.605876, 0.840811, 0.048873, 0.904980, 0.571401, + 0.431269, 0.371115, 0.510887, 0.578032, 0.043298, 0.411864, 0.617138, + 0.399936, 0.757614, 0.719955, 0.286471, 0.303950, 0.528636, 0.172604, + 0.745730, 0.803752, 0.602780, 0.405367, 0.117564, 0.957228, 0.548622, + 0.682592, 0.336131, 0.334557, 0.843983, 0.615574, 0.940433, 0.684794, + 0.664447, 0.845413, 0.256194, 0.095715, 0.216529, 0.767082, 0.673747, + 0.259827, 0.178946, 0.290885, 0.659763, 0.936560, 0.010840, 0.946234, + 0.240510, 0.539476, 0.118838, 0.986240, 0.343228, 0.721618, 0.391606, + 0.460792, 0.678846, 0.940228, 0.143384, 0.014977, 0.274785, 0.987367, + 0.630551, 0.215218, 0.672161, 0.294998, 0.060631, 0.928355, 0.390713, + 0.277160, 0.695436, 0.064460, 0.536987, 0.874382, 0.355345, 0.196751, + 0.810942, 0.366185, 0.142985, 0.051452, 0.905661, 0.261823, 0.037691, + 0.248889, 0.983441, 0.429297, 0.709681, 0.662286, 0.369525, 0.853066, + 0.677263, 0.644310, 0.840433, 0.307814, 0.859528, 0.512593, 0.602812, + 0.920160, 0.440948, 0.993525, 0.197320, 0.136384, 0.057984, 0.734307, + 0.010766, 0.413329, 0.931058, 0.821707, 0.779514, 0.074043, 0.873159, + 0.685175, 0.335865, 0.910850, 0.934065, 0.319306, 0.340147, 0.643746, + 0.981592, 0.709673, 0.496812, 0.658856, 0.353983, 0.337245, 0.966670, + 0.213511, 0.849838, 0.569482, 0.133671, 0.290786, 0.563007, 0.330991, + 0.427170, 0.620991, 0.065299, 0.437936, 0.034320, 0.996356, 0.259643, + 0.813834, 0.070399, 0.132802, 0.499009, 0.406265, 0.043652, 0.433074, + 0.725570, 0.383800, 0.076820, 0.707163, 0.093473, 0.573632, 0.366018, + 0.447456, 0.910877, 0.332688, 0.660967, 0.760714, 0.902170, 0.794638, + 0.051500, 0.465177, 0.125630, 0.478670, 0.086168, 0.190928, 0.916605, + 0.120488, 0.187285, 0.176248, 0.934322, 0.257684, 0.309050, 0.433331, + 0.663949, 0.352703, 0.866405, 0.389519, 0.736502, 0.943226, 0.096682, + 0.829975, 0.516858, 0.462700, 0.277430, 0.427734, 0.795388, 0.938398, + 0.188449, 0.697558, 0.733036, 0.239948, 0.162735, 0.858666, 0.718618, + 0.248903, 0.049594, 0.635223, 0.369391, 0.236879, 0.811472, 0.303713, + 0.494563, 0.120522, 0.737044, 0.158511, 0.473225, 0.603450, 0.548030, + 0.209727, 0.546675, 0.644712, 0.039702, 0.063533, 0.107412, 
0.317132, + 0.491267, 0.902800, 0.255530, 0.679716, 0.600359, 0.988566, 0.919664, + 0.763094, 0.847232, 0.638283, 0.011997, 0.896825, 0.273506, 0.381388, + 0.133704, 0.084978, 0.685101, 0.628267, 0.205500, 0.422145, 0.786778, + 0.678725, 0.025595, 0.334808, 0.888452, 0.572271, 0.979520, 0.928154, + 0.635804, 0.086932, 0.245286, 0.127071, 0.989732, 0.500816, 0.806787, + 0.590091, 0.489382, 0.726451, 0.353185, 0.336614, 0.364734, 0.365182, + 0.233439, 0.638240, 0.746570, 0.367143, 0.723218, 0.431671, 0.995410, + 0.928718, 0.853816, 0.782188, 0.607442, 0.879411, 0.116995, 0.495894, + 0.451682, 0.096515, 0.424048, 0.087485, 0.183447, 0.669334, 0.214556, + 0.173179, 0.170151, 0.021343, 0.763269, 0.659533, 0.747794, 0.116454, + 0.996147, 0.112528, 0.481635, 0.229586, 0.750768, 0.228205, 0.596730, + 0.473985, 0.659876, 0.592139, 0.402703, 0.513692, 0.374327, 0.010145, + 0.393103, 0.491322, 0.506039, 0.844785, 0.587837, 0.930088, 0.932270, + 0.771284, 0.599422, 0.146826, 0.944463, 0.769573, 0.168169, 0.707732, + 0.429106, 0.915964, 0.824186, 0.425253, 0.028492, 0.305821, 0.654839, + 0.779259, 0.534026, 0.251569, 0.253245, 0.193901, 0.843708, 0.655947, + 0.707593, 0.218035, 0.666093, 0.100696, 0.709357, 0.172132, 0.945481, + 0.297195, 0.102220, 0.877751, 0.068479, 0.701642, 0.024577, 0.012941, + 0.471215, 0.192747, 0.720673, 0.900321, 0.108710, 0.544859, 0.325574, + 0.137202, 0.850679, 0.980413, 0.916462, 0.384705, 0.231982, 0.169706, + 0.578607, 0.075690, 0.825654, 0.286200, 0.293725, 0.491746, 0.386896, + 0.003083, 0.663878, 0.332377, 0.300278, 0.766098, 0.210128, 0.368756, + 0.467740, 0.234705, 0.381697, 0.938955, 0.427451, 0.102370, 0.839275, + 0.536162, 0.647229, 0.164849, 0.673364, 0.497908, 0.145262, 0.589825, + 0.882613, 0.377244, 0.759532, 0.461220, 0.452934, 0.585185, 0.747420, + 0.746660, 0.076932, 0.134316, 0.749743, 0.740810, 0.466692, 0.050020, + 0.506908, 0.676820, 0.418776, 0.974648, 0.911525, 0.800474, 0.913602, + 0.338976, 0.902844, 0.752878, 0.875138, 0.550072, 0.917727, 0.548502, + 0.047981, 0.062989, 0.138327, 0.930594, 0.440233, 0.897859, 0.391814, + 0.893168, 0.483044, 0.139234, 0.639828, 0.559975, 0.273549, 0.389570, + 0.300785, 0.740242, 0.439590, 0.807693, 0.417062, 0.858367, 0.782341, + 0.328586, 0.658840, 0.695943, 0.667562, 0.561684, 0.448821, 0.542700, + 0.111756, 0.366548, 0.091202, 0.159737, 0.429537, 0.229529, 0.090331, + 0.869770, 0.127388, 0.482145, 0.762938, 0.610432, 0.621379, 0.402765, + 0.170407, 0.894928, 0.792336, 0.471192, 0.635170, 0.231926, 0.278886, + 0.052232, 0.090293, 0.061226, 0.380818, 0.749133, 0.757170, 0.048380, + 0.310817, 0.205990, 0.591080, 0.422573, 0.572538, 0.682282, 0.582310, + 0.002075, 0.911812, 0.672641, 0.871845, 0.039199, 0.154786, 0.634783, + 0.649631, 0.776165, 0.037548, 0.820038, 0.671093, 0.829884, 0.291231, + 0.306263, 0.061810, 0.570116, 0.358495, 0.152103, 0.631343, 0.739313, + 0.901236, 0.388512, 0.787693, 0.212053, 0.594503, 0.378773, 0.634626, + 0.167040, 0.061056, 0.216937, 0.169115, 0.972867, 0.889578, 0.040960, + 0.012067, 0.044364, 0.675743, 0.661698, 0.820529, 0.713291, 0.481736, + 0.491623, 0.543175, 0.772966, 0.797886, 0.604985, 0.343083, 0.156380, + 0.757088, 0.974425, 0.895693, 0.658324, 0.362938, 0.683386, 0.870376, + 0.957440, 0.062159, 0.505002, 0.124481, 0.123215, 0.721939, 0.293596, + 0.096082, 0.611517, 0.334556, 0.108149, 0.655881, 0.010299, 0.769846, + 0.476411, 0.723590, 0.251582, 0.968033, 0.266765, 0.024548, 0.765919, + 0.871750, 0.367631, 0.922299, 0.628838, 0.342056, 0.817992, 0.287162, + 0.704994, 
0.501378, 0.157538, 0.662434, 0.563537, 0.662541, 0.786915, + 0.686752, 0.384480, 0.080511, 0.782834, 0.995997, 0.415067, 0.890983, + 0.651878, 0.425365, 0.660829, 0.128289, 0.148956, 0.912411, 0.096322, + 0.415721, 0.936959, 0.862241, 0.287471, 0.304590, 0.784540, 0.916309, + 0.646646, 0.602533, 0.203471, 0.351640, 0.103911, 0.361009, 0.014074, + 0.667448, 0.023550, 0.800989, 0.354200, 0.408030, 0.881500, 0.137034, + 0.404026, 0.296566, 0.028017, 0.055904, 0.721932, 0.688846, 0.184193, + 0.870887, 0.601257, 0.280515, 0.286608, 0.538216, 0.142755, 0.574079, + 0.842806, 0.927296, 0.490388, 0.489452, 0.529828, 0.693859, 0.841092, + 0.633739, 0.054869, 0.855167, 0.301187, 0.078419, 0.656156, 0.655388, + 0.486448, 0.537656, 0.792422, 0.890475, 0.834222, 0.820439, 0.946379, + 0.556153, 0.509285, 0.130571, 0.427041, 0.110542, 0.411086, 0.713648, + 0.648758, 0.553842, 0.287727, 0.491563, 0.481137, 0.778116, 0.981015, + 0.010966, 0.471975, 0.822107, 0.644705, 0.526844, 0.677274, 0.945892, + 0.605263, 0.333430, 0.601280, 0.091711, 0.871086, 0.393702, 0.982186, + 0.705307, 0.214141, 0.928564, 0.261461, 0.723426, 0.059136, 0.688501, + 0.833968, 0.470222, 0.402150, 0.482725, 0.024063, 0.689877, 0.974289, + 0.505201, 0.467993, 0.955304, 0.516166, 0.939968, 0.777411, 0.160871, + 0.466812, 0.454685, 0.106763, 0.072075, 0.788115, 0.708043, 0.163786, + 0.659201, 0.101744, 0.145971, 0.364508, 0.315885, 0.074536, 0.625969, + 0.039311, 0.133672, 0.314471, 0.873279, 0.603893, 0.716620, 0.356004, + 0.627957, 0.406498, 0.330292, 0.133157, 0.874490, 0.285596, 0.649324, + 0.814458, 0.063007, 0.810195, 0.281270, 0.517693, 0.916958, 0.353345, + 0.305808, 0.625000, 0.517131, 0.965009, 0.726745, 0.663102, 0.329518, + 0.042630, 0.737638, 0.955487, 0.081940, 0.871310, 0.269957, 0.955219, + 0.475203, 0.986578, 0.311223, 0.103160, 0.393075, 0.641515, 0.236317, + 0.267566, 0.927112, 0.885641, 0.082024, 0.990119, 0.695835, 0.363295, + 0.507812, 0.612793, 0.716640, 0.813620, 0.237793, 0.233770, 0.778629, + 0.964538, 0.896872, 0.108147, 0.007167, 0.634510, 0.063633, 0.089108, + 0.505820, 0.333591, 0.044327, 0.981023, 0.320168, 0.355550, 0.084182, + 0.713244, 0.997065, 0.320499, 0.980810, 0.924177, 0.206140, 0.062834, + 0.914296, 0.901975, 0.426129, 0.422107, 0.514768, 0.142768, 0.235727, + 0.752561, 0.376539, 0.014356, 0.717099, 0.273411, 0.122502, 0.724266, + 0.907921, 0.186136, 0.813374, 0.413741, 0.519726, 0.857701, 0.394764, + 0.839895, 0.213251, 0.478946, 0.553139, 0.210317, 0.799446, 0.533948, + 0.134493, 0.005586, 0.596782, 0.048789, 0.907561, 0.022911, 0.470896, + 0.422329, 0.165679, 0.706623, 0.174890, 0.542218, 0.720979, 0.891989, + 0.815629, 0.843481, 0.616255, 0.723551, 0.029617, 0.429630, 0.137292, + 0.549343, 0.287331, 0.532056, 0.389238, 0.500583, 0.011002, 0.942377, + 0.710899, 0.810448, 0.476326, 0.845392, 0.816033, 0.073108, 0.894181, + 0.723594, 0.096019, 0.365077, 0.145923, 0.261699, 0.071700, 0.320813, + 0.803917, 0.792679, 0.212802, 0.619546, 0.636160, 0.829057, 0.343096, + 0.665777, 0.258687, 0.480388, 0.215121, 0.546018, 0.012444, 0.604359, + 0.046601, 0.023446, 0.546736, 0.757500, 0.833893, 0.023062, 0.602892, + 0.649927, 0.096170, 0.497074, 0.373521, 0.192189, 0.862151, 0.519444, + 0.453887, 0.933851, 0.840257, 0.257804, 0.726531, 0.053058, 0.877350, + 0.362691, 0.882115, 0.220446, 0.028468, 0.140802, 0.700834, 0.243589, + 0.686821, 0.713278, 0.847948, 0.733421, 0.736723, 0.394684, 0.490921, + 0.570617, 0.417746, 0.093813, 0.220543, 0.513916, 0.590887, 0.594064, + 0.706105, 0.453038, 0.113508, 
0.159992, 0.386889, 0.953765, 0.417796, + 0.113420, 0.006823, 0.295146, 0.476111, 0.888938, 0.515592, 0.504579, + 0.029741, 0.216426, 0.748168, 0.716561, 0.929703, 0.596117, 0.449982, + 0.666427, 0.990801, 0.940903, 0.237043, 0.408547, 0.034717, 0.457587, + 0.922463, 0.625603, 0.051651, 0.628568, 0.078641, 0.165159, 0.788560, + 0.465530, 0.118923, 0.206356, 0.578950, 0.125746, 0.501502, 0.055060, + 0.014685, 0.017094, 0.559640, 0.044425, 0.233519, 0.307808, 0.760986, + 0.163223, 0.903925, 0.210969, 0.829650, 0.894726, 0.151872, 0.066693, + 0.303273, 0.186589, 0.524279, 0.225736, 0.812192, 0.575930, 0.854304, + 0.890833, 0.741089, 0.642864, 0.356363, 0.860012, 0.849220, 0.935313, + 0.985758, 0.350722, 0.990373, 0.000443, 0.367815, 0.550013, 0.044868, + 0.601335, 0.857820, 0.805855, 0.764557, 0.761745, 0.016823, 0.594207, + 0.656471, 0.168696, 0.660900, 0.959744, 0.355284, 0.185179, 0.185480, + 0.167477, 0.761110, 0.039784, 0.058310, 0.502199, 0.682648, 0.414673, + 0.362211, 0.531868, 0.349985, 0.347969, 0.882589, 0.340358, 0.348412, + 0.250404, 0.890371, 0.393280, 0.851739, 0.748191, 0.199135, 0.616297, + 0.509936, 0.215958, 0.210504, 0.166407, 0.384654, 0.871404, 0.126151, + 0.739938, 0.056583, 0.311631, 0.907415, 0.817693, 0.351415, 0.965724, + 0.319891, 0.034062, 0.380397, 0.682102, 0.565930, 0.730382, 0.030072, + 0.448519, 0.070741, 0.378484, 0.698924, 0.961112, 0.771764, 0.550663, + 0.709303, 0.970899, 0.166959, 0.219239, 0.186857, 0.377463, 0.385647, + 0.571511, 0.248867, 0.511798, 0.311449, 0.305450, 0.823429, 0.218864, + 0.123142, 0.174844, 0.184588, 0.443034, 0.208906, 0.564986, 0.125136, + 0.774836, 0.295368, 0.155207, 0.223355, 0.366109, 0.533691, 0.922279, + 0.327221, 0.305455, 0.472942, 0.036524, 0.276354, 0.639901, 0.255763, + 0.463211, 0.017364, 0.641410, 0.034722, 0.266231, 0.153207, 0.346171, + 0.571680, 0.976636, 0.565036, 0.694822, 0.151480, 0.749624, 0.137856, + 0.360386, 0.314610, 0.262992, 0.135222, 0.609978, 0.418200, 0.358578, + 0.976087, 0.951891, 0.280856, 0.303307, 0.257346, 0.753798, 0.339831, + 0.533700, 0.393699, 0.595594, 0.996911, 0.411063, 0.237003, 0.031634, + 0.677294, 0.390211, 0.377805, 0.248974, 0.366847, 0.942841, 0.943796, + 0.518327, 0.692465, 0.081653, 0.878713, 0.007074, 0.344645, 0.013936, + 0.617052, 0.762845, 0.372513, 0.593138, 0.714736, 0.653370, 0.896446, + 0.972082, 0.407168, 0.236276, 0.505782, 0.800867, 0.831870, 0.502693, + 0.211930, 0.068873, 0.534327, 0.889224, 0.459084, 0.912132, 0.138197, + 0.825931, 0.854972, 0.081994, 0.344259, 0.547437, 0.163646, 0.222972, + 0.554511, 0.508291, 0.236908, 0.171563, 0.271135, 0.609421, 0.764701, + 0.985871, 0.262790, 0.661147, 0.957953, 0.669958, 0.897423, 0.463734, + 0.470825, 0.729293, 0.966427, 0.682755, 0.798166, 0.500754, 0.571978, + 0.257251, 0.412886, 0.710176, 0.083182, 0.267858, 0.792169, 0.427441, + 0.815295, 0.955815, 0.650413, 0.369805, 0.464106, 0.887320, 0.541368, + 0.735242, 0.496741, 0.306069, 0.721113, 0.759531, 0.967216, 0.679065, + 0.429489, 0.864639, 0.142799, 0.900314, 0.593932, 0.109227, 0.583069, + 0.392098, 0.609981, 0.155047, 0.649349, 0.022867, 0.865222, 0.732531, + 0.290725, 0.657392, 0.159972, 0.106019, 0.613207, 0.810384, 0.475824, + 0.077313, 0.697704, 0.017192, 0.812555}; + +static float golden_endtoend_output[] = { + -1.881211, -0.028385, -3.585066, 1.939770, -3.461155, 1.280415, -4.408978, + 0.608663, -2.704937, 1.859742, -5.777429, 2.691839, -1.049012, 1.640870, + -4.856245, 1.604236, 0.992707, 0.422858, -4.307465, 1.887332, -0.884831, + -0.154277, -2.634801, 
0.586827, -1.849960, 1.399608, -4.531559, 1.943591, + 0.271676, -2.893054, -2.066826, 0.235467, -1.248263, -1.164534, -2.640174, + -0.112878, -4.386484, 1.253024, -4.135623, 1.068984, -0.043579, -0.832957, + -3.257258, -0.514396, -1.651174, 0.638630, -4.364372, 1.548441, -0.289455, + 0.539845, -4.097627, 0.635001, -0.465071, -0.927701, -2.481498, 0.356616, + -2.355012, 0.728806, -3.340283, 1.609038, -4.786268, -0.532272, -1.886150, + 0.254797, 0.746620, -1.657134, -3.264265, 0.525551, -1.756837, 0.845446, + -5.572190, 1.715797, -2.856942, 3.394245, -5.803662, 2.281806, -3.014739, + 2.616136, -4.728482, 1.659984, -2.106307, 2.711709, -6.173832, 1.352869, + -0.038035, 0.107619, -4.279774, 2.341930, -0.980413, -0.119538, -4.049717, + 1.172128, -3.477744, 2.602274, -6.231380, 2.537300, -0.862214, 0.568722, + -3.858362, 0.197867, -1.725885, 3.687312, -7.067363, 2.403544, -0.944963, + 0.235639, -3.250094, 0.659117, -1.459576, 0.426128, -3.637207, 1.030386, + -4.224351, 3.516220, -6.053367, 0.993473, -2.182416, -0.762625, -1.884405, + -0.113736, -2.572602, 0.329290, -1.913233, 0.517418, -0.019757, 0.203176, + -3.715881, 0.482136, -1.912823, 1.357907, -5.473043, 1.714658, -3.177160, + 0.089285, -3.127669, 1.268076, 0.772498, -1.622712, -3.850314, 0.436124, + -1.495983, 3.439982, -7.623405, 1.726721, -0.423979, 0.180201, -2.902406, + 0.986457, -1.845638, 0.460903, -5.359343, -1.133931, -1.074456, 0.717304, + -3.519856, 1.012126, -0.562301, 1.881967, -6.716627, 2.525036, 0.945480, + 0.337081, -5.210562, 2.572035, -0.943370, 0.442026, -2.666313, 0.411296, + 0.002787, -0.000735, -2.498933, 0.771719, -3.568153, 3.833721, -6.617026, + 2.813922, -0.573970, 1.025208, -3.909923, 1.722648, -1.406849, 0.719783, + -5.207438, 1.819442, -0.530895, -0.010887, -2.939614, 0.971225, -1.660297, + 1.345243, -4.454571, 2.244876, -2.021213, 1.756090, -4.880947, 0.364597, + -2.380270, 2.763117, -5.613013, 2.137534, 0.289101, -2.279400, -3.365582, + 0.170028, -1.142254, -0.709604, -3.656223, 1.804870, -0.854690, 0.592102, + -5.010415, 2.462687, -1.474710, 0.566002, -3.621819, -0.391946, -0.423524, + -0.631428, -3.513310, 0.962825, -1.480262, 0.319791, -3.610137, 1.842339, + -0.250073, 1.182022, -6.249267, 1.604172, 1.153759, -0.734054, -4.620415, + -0.030858, 0.050911, 1.524406, -4.724010, 1.451846, -3.277104, 2.414182, + -4.605285, 1.846092, -1.503047, -0.618200, -2.746546, -0.459332, -0.980326, + -1.199977, -2.043865, -0.165793, -2.214698, 3.108281, -7.127830, -0.123065, + 1.244948, -3.039923, -4.660061, -0.225957, -0.307210, -1.513205, -2.456005, + 0.840048, -0.741445, 2.328635, -6.015267, 2.723240, -1.381171, -0.728878, + -5.114925, -0.362034, -0.574923, 0.518080, -3.892457, 1.798948, 0.435119, + -0.371696, -2.807571, 1.302864, -2.063052, 1.036388, -4.232038, 1.397059, + -1.615668, -1.511019, -3.095508, 1.290955, -3.428723, 2.000287, -4.196487, + 1.566983, 0.196957, 0.224343, -4.926359, -0.691975, -0.214941, 1.546821, + -5.384868, 2.290820, -1.878865, 0.493692, -4.129823, 2.112036, 0.516558, + -2.553077, -2.717338, 0.017146, -2.016057, 1.628995, -4.240602, 1.189533, + -5.460220, 1.254738, -4.214903, 0.755659, -2.893235, 2.937762, -6.169453, + 2.035456, -5.613212, -0.122254, -1.973646, -0.060619, -2.119598, 1.413512, + -4.938738, 1.890244, 0.544169, -2.062413, -3.329637, -0.062515, -1.855805, + -0.791297, -2.570353, 0.607615, 0.305812, 0.338930, -4.150270, 2.274937, + 0.042653, 0.133825, -3.538155, 1.523639, -3.173690, -1.496599, -2.414655, + 0.464687, -1.448998, -0.368907, -3.520129, 0.203382, -2.443626, 1.266233, + 
-3.393848, 0.605911, -0.015353, 1.402006, -4.441003, 1.419281, 0.603587, + 0.434146, -4.966566, 2.171872, -0.688264, -0.009981, -4.461103, 1.538354, + -5.029816, -0.264424, -1.713510, -0.315258, -1.891606, 0.252074, -2.419428, + 0.043970, -1.291143, 2.048704, -4.590105, 0.524734, -1.889576, 0.134836, + -3.462745, 1.390663, -0.112773, 0.402735, -4.203784, 1.381043, -1.201634, + -1.968277, -1.425637, -0.181725, -1.250742, -2.102041, -3.925464, -1.256797, + -3.701354, -1.754610, -1.917231, -1.455910, -1.838006, 2.041781, -5.666212, + 2.752957, -2.659553, 2.553637, -4.872212, 1.443437, -2.081846, 3.311263, + -5.912457, 1.871049, 0.196148, -0.307044, -4.024967, 2.149149, 0.361809, + 0.620415, -5.939984, 0.180672, -1.209180, -0.269122, -3.240285, 1.460315, + -1.040803, 1.125700, -6.060366, 0.887767, -3.214111, 1.314368, -3.026808, + 1.023640, -3.815175, 1.795642, -4.355603, 1.064454, -0.046472, 0.618463, + -5.941646, 2.861891, -2.852155, -0.990457, -2.624445, 1.794494, -1.176747, + -0.358159, -3.206776, 1.138721, -2.819523, -1.825522, -1.450902, -0.187312, + -0.808727, 0.636872, -4.120567, 1.192623, 0.810731, -1.768519, -3.699450, + 1.527116, -2.772720, 3.012835, -5.912736, 1.599365, -4.696381, 2.234591, + -4.139552, 1.061768, -1.880089, 3.596274, -7.006379, 2.382152, -3.158115, + 3.844430, -7.044156, 2.307596, -2.473970, 1.312644, -5.467269, 0.197154, + -1.530040, 1.762275, -5.550757, 0.630276, -3.048947, 1.043777, -3.096658, + 1.345893, -1.329494, 2.065748, -4.711032, 2.227600, -0.413321, -0.032428, + -4.599650, 1.668734, -4.351490, -0.200022, -2.359903, 0.021997, 0.116028, + 1.159718, -5.093972, -0.142951, -2.409895, 0.906133, -2.728812, 0.809932, + -2.597363, 0.494130, -2.357861, 0.369825, -2.165235, 1.148522, -3.130562, + 0.759034, 0.646335, -1.463660, -3.508299, 1.059679, -1.485465, 1.007319, + -4.340716, 1.789864, -1.590654, 1.612324, -4.452007, 2.389805, -5.200148, + -1.068398, -1.306923, -0.472408, -0.392165, -0.524996, -2.933478, 1.518430, + -1.287781, 0.113422, -3.020525, 1.338359, -0.105982, 0.936014, -4.132197, + 1.836807, -0.616589, -1.029716, -3.271347, 0.284889, -2.653359, 2.135829, + -4.643613, 1.627981, 0.287733, -2.017263, -2.776574, 1.184792, 1.004161, + -1.483019, -4.339290, -0.787322, 0.582420, 1.137839, -5.673941, -0.001862, + -1.219142, 0.532561, -4.457245, 1.826807, -3.343291, 3.034610, -6.179855, + 2.235917, -4.369989, 4.018128, -6.632714, 0.926585, -0.485469, 0.536073, + -4.179557, 1.489637, -0.521762, 1.636089, -6.137912, 1.500867, -4.086009, + 1.961372, -3.688977, 1.358220, -1.544034, 1.763837, -4.357567, 1.852201, + -2.018725, 1.046264, -6.211127, 1.609419, -0.118441, 1.602284, -6.242423, + 1.518578, -0.604078, 1.106613, -5.393445, 2.595629, 0.142712, -1.903953, + -2.821177, 0.032758, -0.009152, 0.184628, -4.227636, 2.046843, -2.240138, + 1.256176, -5.108516, -0.308447, -2.998571, 4.657396, -7.582112, 2.510951, + -3.535784, 1.704560, -5.068484, 1.318466, -3.058265, 3.073172, -6.998089, + 3.178849, -2.420286, 2.277806, -4.999528, 1.423890, -1.672914, 0.447460, + -4.088940, 1.351087, -1.051546, -0.417955, -4.042147, 1.604102, -1.700931, + 2.796663, -6.497579, 2.857974, -0.240828, 0.858001, -5.778933, 2.778508, + -0.406211, 1.300766, -5.073671, 2.089362, -0.201673, 1.588396, -6.000150, + 2.185055, -2.332125, 0.768216, -2.609184, 0.327277, -3.358943, -1.020736, + -2.389984, 0.315512, -0.561905, 1.948740, -6.408485, 2.231985, -0.603652, + 0.661829, -5.070386, -1.063058, -0.624796, 1.375772, -4.379606, 1.929358, + -1.047263, 0.739100, -5.217857, 2.127625, -5.025338, 
0.650344, -2.068460, + 0.076936, -0.457505, -1.050984, -1.917765, 1.150908, 0.782625, 0.855595, + -5.321719, 0.787209, -0.460232, 1.106736, -5.552326, 2.801043, -0.360217, + -0.434432, -4.273378, 0.967556, -0.972652, 0.874811, -5.429918, -0.331039, + 0.115477, 0.111883, -5.418786, 1.240546, -1.842794, 0.505880, -3.676064, + -0.682369, 1.858984, -0.742566, -5.784060, 0.673239, -1.280398, 0.280842, + -4.848077, 2.214860, -0.785100, -0.588488, -2.438206, 0.786651, -1.568752, + 1.935400, -6.320256, 2.125338, -1.476457, -1.651941, -2.695734, 0.007338, + -3.280860, 2.310385, -5.319578, 1.890123, -0.775723, 0.630606, -4.321582, + 1.085521, -1.847371, 1.188521, -4.596577, 2.056443, -2.340172, -0.108501, + -3.156392, 0.933279, -0.495331, 0.122405, -5.171133, 1.763245, -0.796913, + 2.310487, -7.247197, 2.401678, -1.908860, 0.043798, -2.393796, 0.573806, + -0.608531, 0.154710, -4.669001, 0.750680, 0.468380, 0.392591, -4.755001, + 2.615217, -1.957774, 1.153513, -4.530099, 1.124362, -3.569415, 1.697154, + -3.536335, 0.910758, -2.976264, 1.833129, -4.287203, -0.547050, -2.409768, + 0.061585, -1.324116, 0.268497, -2.962222, -1.524245, -2.063413, 0.442058, + -4.292337, 3.538863, -6.699603, 1.718664, -2.290363, 1.994596, -6.245037, + -0.433084, -0.367059, 1.020297, -4.940721, 2.902264, -0.577056, -0.709887, + -5.001413, -0.268316, -1.112048, -1.083307, -1.753492, 0.209973, 0.139540, + 0.917602, -5.232745, 2.538467, -2.139234, -0.187388, -1.837249, -0.478582, + -0.731653, -0.481550, -2.531261, 1.044770, 0.707750, 0.279971, -3.221119, + 1.552074, -2.373144, 0.859518, -3.665156, 1.620278, -1.440871, -0.525581, + -2.758271, 1.491873, -2.302013, 1.119935, -5.257080, 2.627170, -3.174739, + 1.363282, -4.831639, 1.101076, -4.337008, 2.689639, -5.165915, 1.069201, + -1.882078, -0.120370, -2.287967, 1.147619, -1.403616, 1.077150, -5.084296, + 1.658236, -0.919642, 0.487423, -3.001075, 0.741268, 0.107300, 0.943556, + -3.544311, 1.000239, -1.627171, 2.871253, -5.179172, 1.429893, -0.826040, + 0.188670, -4.499894, 1.013447, -2.101299, 0.317516, -3.452141, -0.833776, + -1.362144, 1.272437, -4.449355, 1.613591, -2.039873, 2.613175, -6.229640, + 1.659790, -1.595520, -0.237462, -2.744997, 0.337841, 0.148981, -1.703771, + -2.388023, 1.276469, 1.058508, -0.401642, -4.680769, 0.861881, -1.336381, + 1.153080, -2.834378, 0.721075, 0.900115, 1.360511, -5.573611, 0.949182, + -2.970844, 2.017563, -5.186108, -0.201038, -1.192824, 0.610142, -4.450919, + -0.897114, -1.812093, 0.422310, -5.245487, 0.256549, 0.320275, -2.324150, + -2.967040, -0.260536, -0.721467, 0.454148, -5.058031, 0.526370, -0.895656, + 0.732240, -3.327363, 1.353953, -1.277912, -0.483171, -1.926713, 0.065044, + -2.167506, -0.196606, -1.923437, 0.604962, -2.088319, 1.406834, -5.227296, + 2.247351, -4.421744, 1.729791, -5.007922, 1.264769, -0.897019, 0.922902, + -3.887108, 2.087432, -1.310226, -0.101938, -3.359082, -0.079662, -0.514988, + -0.963179, -4.038209, 2.223278, -0.590083, -2.310458, -1.748338, 0.363406, + -0.540731, -0.885913, -4.179595, 2.216781, -3.044339, -0.447100, -2.446098, + 0.931101, -1.676190, 2.096175, -4.980755, 2.262151, -1.095047, 1.897516, + -5.996138, 2.191038, 0.297128, -0.780974, -2.884299, 1.195408, -0.521065, + -1.955837, -3.091064, -0.404183, -1.961519, 4.076096, -7.521851, 2.242064, + -1.988043, 0.303300, -2.422585, 0.322230, -3.377634, 3.499955, -7.084434, + 2.375587, -0.718851, 2.150076, -5.412241, 2.374280, -2.006088, 2.229828, + -5.848188, 2.543077, -2.171042, 2.096026, -5.300007, 0.141405, -1.187745, + 0.105340, -4.003816, 1.034281, 
-3.980804, 1.856709, -5.103042, 0.623737, + -2.080307, 0.896140, -3.104050, 0.983158, -0.424898, -1.154270, -3.805728, + 1.978917, -1.314387, 1.235096, -3.148906, 1.113173, 0.111713, 2.055213, + -7.565283, 2.100342}; +constexpr std::initializer_list biases = { + 0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068, -0.23566568, + -0.389184, 0.47481549, -0.4791103, 0.29931796, 0.10463274, 0.83918178, + 0.37197268, 0.61957061, 0.3956964, -0.37609905}; + +constexpr std::initializer_list recurrent_weights = { + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1}; + +class BidirectionalRNNOpModel : public SingleOpModel { + public: + BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units, + int bw_units, int input_size) + : batches_(batches), + sequence_len_(sequence_len), + fw_units_(fw_units), + bw_units_(bw_units), + input_size_(input_size) { + input_ = AddInput(TensorType_FLOAT32); + fw_weights_ = AddInput(TensorType_FLOAT32); + fw_recurrent_weights_ = AddInput(TensorType_FLOAT32); + fw_bias_ = AddInput(TensorType_FLOAT32); + fw_hidden_state_ = AddOutput(TensorType_FLOAT32); + fw_output_ = AddOutput(TensorType_FLOAT32); + bw_weights_ = AddInput(TensorType_FLOAT32); + bw_recurrent_weights_ = AddInput(TensorType_FLOAT32); + bw_bias_ = AddInput(TensorType_FLOAT32); + bw_hidden_state_ = AddOutput(TensorType_FLOAT32); + bw_output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, + BuiltinOptions_SequenceRNNOptions, + CreateSequenceRNNOptions(builder_, /*time_major=*/false, + ActivationFunctionType_RELU) + .Union()); + BuildInterpreter({ + {batches_, sequence_len_, input_size_}, // input + {fw_units_, input_size_}, // fw_weights + {fw_units_, fw_units_}, // fw_recurrent_weights + {fw_units_}, // fw_bias + {bw_units_, input_size_}, // bw_weights + {bw_units_, bw_units_}, // bw_recurrent_weights + {bw_units_} // bw_bias + }); + } + + void SetFwBias(std::initializer_list f) { + PopulateTensor(fw_bias_, f); + } + + void SetBwBias(std::initializer_list f) { + PopulateTensor(bw_bias_, f); + } + + void SetFwWeights(std::initializer_list f) { + PopulateTensor(fw_weights_, f); + } + + void SetBwWeights(std::initializer_list f) { + PopulateTensor(bw_weights_, f); + } + + void SetFwRecurrentWeights(std::initializer_list f) { + PopulateTensor(fw_recurrent_weights_, f); + } + + void SetBwRecurrentWeights(std::initializer_list f) { + PopulateTensor(bw_recurrent_weights_, f); + } + + void SetInput(std::initializer_list data) { + PopulateTensor(input_, data); + } + + void SetInput(int offset, float* begin, float* end) { + PopulateTensor(input_, offset, begin, end); + } + + void ResetHiddenStates() { + const int fw_zero_buffer_size = fw_units_ * batches_; + std::unique_ptr 
fw_zero_buffer(new float[fw_zero_buffer_size]); + memset(fw_zero_buffer.get(), 0, fw_zero_buffer_size * sizeof(float)); + PopulateTensor(fw_hidden_state_, 0, fw_zero_buffer.get(), + fw_zero_buffer.get() + fw_zero_buffer_size); + const int bw_zero_buffer_size = bw_units_ * batches_; + std::unique_ptr bw_zero_buffer(new float[bw_zero_buffer_size]); + memset(bw_zero_buffer.get(), 0, bw_zero_buffer_size * sizeof(float)); + PopulateTensor(bw_hidden_state_, 0, bw_zero_buffer.get(), + bw_zero_buffer.get() + bw_zero_buffer_size); + } + + std::vector GetFwOutput() { return ExtractVector(fw_output_); } + std::vector GetBwOutput() { return ExtractVector(bw_output_); } + + int input_size() { return input_size_; } + int num_fw_units() { return fw_units_; } + int num_bw_units() { return bw_units_; } + int num_batches() { return batches_; } + int sequence_len() { return sequence_len_; } + + private: + int input_; + int fw_weights_; + int fw_recurrent_weights_; + int fw_bias_; + int fw_hidden_state_; + int fw_output_; + int bw_weights_; + int bw_recurrent_weights_; + int bw_bias_; + int bw_hidden_state_; + int bw_output_; + + int batches_; + int sequence_len_; + int fw_units_; + int bw_units_; + int input_size_; +}; + +// TODO(mirkov): add another test which directly compares to TF once TOCO +// supports the conversion from dynamic_rnn with BasicRNNCell. +TEST(BidirectionalRNNOpTest, BlackBoxTest) { + BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16, + /*fw_units=*/16, /*bw_units=*/16, + /*input_size=*/8); + rnn.SetFwWeights(weights); + rnn.SetBwWeights(weights); + rnn.SetFwBias(biases); + rnn.SetBwBias(biases); + rnn.SetFwRecurrentWeights(recurrent_weights); + rnn.SetBwRecurrentWeights(recurrent_weights); + + rnn.ResetHiddenStates(); + const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); + float* batch_start = rnn_input; + float* batch_end = batch_start + input_sequence_size; + rnn.SetInput(0, batch_start, batch_end); + rnn.SetInput(input_sequence_size, batch_start, batch_end); + + rnn.Invoke(); + + float* golden_fw_start = rnn_golden_fw_output; + float* golden_fw_end = + golden_fw_start + rnn.num_fw_units() * rnn.sequence_len(); + std::vector fw_expected; + fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end); + fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end); + EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected))); + + float* golden_bw_start = rnn_golden_bw_output; + float* golden_bw_end = + golden_bw_start + rnn.num_bw_units() * rnn.sequence_len(); + std::vector bw_expected; + bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end); + bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end); + EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected))); +} + +// Check that if the input sequence is reversed the outputs are the same just +// forward and backward are swapped (and reversed). +TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) { + BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16, + /*fw_units=*/16, /*bw_units=*/16, + /*input_size=*/8); + rnn.SetFwWeights(weights); + rnn.SetBwWeights(weights); + rnn.SetFwBias(biases); + rnn.SetBwBias(biases); + rnn.SetFwRecurrentWeights(recurrent_weights); + rnn.SetBwRecurrentWeights(recurrent_weights); + + rnn.ResetHiddenStates(); + + // Reverse inputs in each batch: in_1, in_2,..., in_k is inserted in the + // following order: [in_k,..., in_2, in_1, in_k,...,in_2, in_1]. 
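+  // For example, step i of the original sequence lands at offset
+  // (sequence_len - i - 1) * input_size in the first batch, and at that same
+  // offset shifted by sequence_len * input_size in the second batch.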
+ for (int i = 0; i < rnn.sequence_len(); i++) { + float* batch_start = rnn_input + i * rnn.input_size(); + float* batch_end = batch_start + rnn.input_size(); + const int reverse_idx = rnn.sequence_len() - i - 1; + rnn.SetInput(reverse_idx * rnn.input_size(), batch_start, batch_end); + rnn.SetInput((rnn.sequence_len() + reverse_idx) * rnn.input_size(), + batch_start, batch_end); + } + + rnn.Invoke(); + + // The forward and backward outputs are swapped. + std::vector fw_expected; // consider using std::deque instead. + for (int i = 0; i < rnn.sequence_len(); i++) { + float* golden_fw_start = rnn_golden_bw_output + i * rnn.num_fw_units(); + float* golden_fw_end = golden_fw_start + rnn.num_fw_units(); + fw_expected.insert(fw_expected.begin(), golden_fw_start, golden_fw_end); + } + fw_expected.insert(fw_expected.end(), fw_expected.begin(), fw_expected.end()); + EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected))); + + std::vector bw_expected; + for (int i = 0; i < rnn.sequence_len(); i++) { + float* golden_bw_start = rnn_golden_fw_output + i * rnn.num_bw_units(); + float* golden_bw_end = golden_bw_start + rnn.num_bw_units(); + bw_expected.insert(bw_expected.begin(), golden_bw_start, golden_bw_end); + } + bw_expected.insert(bw_expected.end(), bw_expected.begin(), bw_expected.end()); + EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected))); +} + +// Tests an end-to-end neural network with a Bidirectional RNN followed by a +// DNN that aggregates the outputs from the two sequences. +TEST(BidirectionalRNNOpTest, EndToEndTest) { + BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4, + /*fw_units=*/16, /*bw_units=*/16, + /*input_size=*/8); + const int output_size = 4; + float dnn_weights[] = { + -0.5782342, -0.052212059, 0.73036242, -0.81216097, -0.80088139, + -0.23420811, -0.39647382, 0.31423986, 0.61819065, -0.73659575, + -0.89698344, -0.8931554, -0.0845688, 0.5617367, 0.38415289, + -0.11487955, -0.7617774, 0.17927337, 0.15726972, 0.059798479, + 0.19009054, -0.27616632, -0.39142907, 0.77744663, -0.046830714, + -0.6603595, 0.21945822, 0.051494241, 0.23785079, 0.19239247, + -0.53268754, 0.65961659, -0.85981959, -0.80232513, 0.84745562, + -0.66070104, -0.036533296, -0.54901814, 0.65353882, -0.41834265, + -0.28561389, 0.75655544, -0.31149811, 0.62981737, 0.31829214, + -0.92734522, -0.48506218, 0.55651462, 0.25192821, 0.67220747, + -0.3836869, -0.55798125, -0.60395885, 0.22488403, -0.78053463, + 0.3492105, 0.56452453, 0.4389236, -0.59929526, -0.19762468, + -0.36868393, -0.13198286, -0.53800809, -0.22850353}; + + std::initializer_list dnn_biases = { + 0.29177809, -0.98799044, 0.065919638, 0.68781924}; + + rnn.SetFwWeights(weights); + rnn.SetBwWeights(weights); + rnn.SetFwBias(biases); + rnn.SetBwBias(biases); + rnn.SetFwRecurrentWeights(recurrent_weights); + rnn.SetBwRecurrentWeights(recurrent_weights); + + rnn.ResetHiddenStates(); + + const int input_sequence_size = rnn.input_size() * rnn.sequence_len(); + const int output_sequence_size = output_size * rnn.sequence_len(); + const int num_examples = 64; + for (int k = 0; k < num_examples; k++) { + float* batch_start = endtoend_input + k * input_sequence_size; + float* batch_end = batch_start + input_sequence_size; + rnn.SetInput(0, batch_start, batch_end); + + rnn.Invoke(); + + std::vector fw_output = rnn.GetFwOutput(); + std::vector bw_output = rnn.GetBwOutput(); + EXPECT_EQ(fw_output.size(), bw_output.size()); + + std::transform(fw_output.begin(), fw_output.end(), bw_output.begin(), + 
fw_output.begin(), std::plus()); + + std::vector sequence_result; + for (int s = 0; s < rnn.sequence_len(); s++) { + const float* rnn_output = fw_output.data() + s * rnn.num_fw_units(); + std::vector results(dnn_biases); + for (int i = 0; i < output_size; i++) { + for (int j = 0; j < rnn.num_fw_units(); j++) { + results[i] += *(rnn_output + j) * dnn_weights[output_size * j + i]; + } + } + sequence_result.insert(sequence_result.end(), results.begin(), + results.end()); + } + + float* golden_start = golden_endtoend_output + k * output_sequence_size; + float* golden_end = golden_start + output_sequence_size; + + std::vector expected; + expected.insert(expected.end(), golden_start, golden_end); + EXPECT_THAT(sequence_result, ElementsAreArray(ArrayFloatNear(expected))); + } +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + // On Linux, add: tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc index 37f499a4d09..7a456474341 100644 --- a/tensorflow/contrib/lite/kernels/conv.cc +++ b/tensorflow/contrib/lite/kernels/conv.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/kernels/gemm_support.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" @@ -38,11 +39,16 @@ namespace ops { namespace builtin { namespace conv { -// This file has three implementation of Conv. +// This file has 4 implementations of Conv. enum KernelType { kReference, kGenericOptimized, // Neon-free - kNeonOptimized, + kMultithreadOptimized, + // The kernel uses the CBLAS interface for matrix multiplication. + // It's fast when an optimized CBLAS implementation is available (e.g. Apple + // Accelerate Framework), and it's slow when falling back to naive + // implementation.
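+  // The CBLAS kernel is implemented in internal/optimized/cblas_conv.h.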
+ kCblasOptimized, }; struct OpData { @@ -290,26 +296,34 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, auto filter_offset = -filter->params.zero_point; auto output_offset = output->params.zero_point; - if (kernel_type == kReference) { - reference_ops::Conv( - GetTensorData(input), GetTensorDims(input), input_offset, - GetTensorData(filter), GetTensorDims(filter), filter_offset, - GetTensorData(bias), GetTensorDims(bias), params->stride_width, - params->stride_height, data->padding.width, data->padding.height, - output_offset, data->output_multiplier, data->output_shift, - data->output_activation_min, data->output_activation_max, - GetTensorData(output), GetTensorDims(output), - GetTensorData(im2col), GetTensorDims(im2col), gemm_context); - } else { - optimized_ops::Conv( - GetTensorData(input), GetTensorDims(input), input_offset, - GetTensorData(filter), GetTensorDims(filter), filter_offset, - GetTensorData(bias), GetTensorDims(bias), params->stride_width, - params->stride_height, data->padding.width, data->padding.height, - output_offset, data->output_multiplier, data->output_shift, - data->output_activation_min, data->output_activation_max, - GetTensorData(output), GetTensorDims(output), - GetTensorData(im2col), GetTensorDims(im2col), gemm_context); + switch (kernel_type) { + case kReference: + reference_ops::Conv( + GetTensorData(input), GetTensorDims(input), input_offset, + GetTensorData(filter), GetTensorDims(filter), filter_offset, + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, data->padding.width, + data->padding.height, output_offset, data->output_multiplier, + data->output_shift, data->output_activation_min, + data->output_activation_max, GetTensorData(output), + GetTensorDims(output), GetTensorData(im2col), + GetTensorDims(im2col), gemm_context); + break; + case kGenericOptimized: + case kMultithreadOptimized: + case kCblasOptimized: + // There is only one optimized implementation for Quantized Conv. 
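+      // kGenericOptimized, kMultithreadOptimized and kCblasOptimized all fall
+      // through to the same optimized_ops::Conv call below.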
+ optimized_ops::Conv( + GetTensorData(input), GetTensorDims(input), input_offset, + GetTensorData(filter), GetTensorDims(filter), filter_offset, + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, data->padding.width, + data->padding.height, output_offset, data->output_multiplier, + data->output_shift, data->output_activation_min, + data->output_activation_max, GetTensorData(output), + GetTensorDims(output), GetTensorData(im2col), + GetTensorDims(im2col), gemm_context); + break; } } @@ -322,31 +336,57 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, CalculateActivationRangeFloat(params->activation, &output_activation_min, &output_activation_max); - if (kernel_type == kReference) { - reference_ops::Conv(GetTensorData(input), GetTensorDims(input), - GetTensorData(filter), GetTensorDims(filter), - GetTensorData(bias), GetTensorDims(bias), - params->stride_width, params->stride_height, - data->padding.width, data->padding.height, - output_activation_min, output_activation_max, - GetTensorData(output), GetTensorDims(output), - GetTensorData(im2col), GetTensorDims(im2col)); - } else { - const float* filter_data; - if (data->need_hwcn_weights) { - filter_data = GetTensorData(hwcn_weights); - } else { - filter_data = GetTensorData(filter); + switch (kernel_type) { + case kReference: { + reference_ops::Conv(GetTensorData(input), GetTensorDims(input), + GetTensorData(filter), GetTensorDims(filter), + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, + data->padding.width, data->padding.height, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), + GetTensorData(im2col), GetTensorDims(im2col)); + break; + } + case kGenericOptimized: { + optimized_ops::Conv(GetTensorData(input), GetTensorDims(input), + GetTensorData(filter), GetTensorDims(filter), + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, + data->padding.width, data->padding.height, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), + GetTensorData(im2col), GetTensorDims(im2col)); + break; + } + case kMultithreadOptimized: { + const float* filter_data; + if (data->need_hwcn_weights) { + filter_data = GetTensorData(hwcn_weights); + } else { + filter_data = GetTensorData(filter); + } + multithreaded_ops::Conv( + GetTensorData(input), GetTensorDims(input), filter_data, + GetTensorDims(filter), GetTensorData(bias), + GetTensorDims(bias), params->stride_width, params->stride_height, + data->padding.width, data->padding.height, params->padding, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), + GetTensorData(im2col), GetTensorDims(im2col)); + break; + } + case kCblasOptimized: { + cblas_ops::Conv(GetTensorData(input), GetTensorDims(input), + GetTensorData(filter), GetTensorDims(filter), + GetTensorData(bias), GetTensorDims(bias), + params->stride_width, params->stride_height, + data->padding.width, data->padding.height, + output_activation_min, output_activation_max, + GetTensorData(output), GetTensorDims(output), + GetTensorData(im2col), GetTensorDims(im2col)); + break; } - - multithreaded_ops::Conv( - GetTensorData(input), GetTensorDims(input), filter_data, - GetTensorDims(filter), GetTensorData(bias), GetTensorDims(bias), - params->stride_width, params->stride_height, data->padding.width, - data->padding.height, params->padding, output_activation_min, - output_activation_max, 
GetTensorData(output), - GetTensorDims(output), GetTensorData(im2col), - GetTensorDims(im2col)); } } @@ -407,17 +447,25 @@ TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() { return &r; } -TfLiteRegistration* Register_CONVOLUTION_NEON_OPT() { +TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() { static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare, - conv::Eval}; + conv::Eval}; + return &r; +} + +TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() { + static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare, + conv::Eval}; return &r; } TfLiteRegistration* Register_CONV_2D() { -#ifdef USE_NEON - return Register_CONVOLUTION_NEON_OPT(); +// TODO(ycling): Define a compilation flag and use the CBLAS kernel when a +// fast CBLAS implementation is available. +#ifdef TFLITE_USE_CBLAS_CONVOLUTION_KERNEL + return Register_CONVOLUTION_CBLAS_OPT(); #else - return Register_CONVOLUTION_GENERIC_OPT(); + return Register_CONVOLUTION_MULTITHREADED_OPT(); #endif } diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc index 1d0a81c3135..d2393c3c97b 100644 --- a/tensorflow/contrib/lite/kernels/conv_test.cc +++ b/tensorflow/contrib/lite/kernels/conv_test.cc @@ -15,12 +15,25 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" #include "tensorflow/contrib/lite/kernels/test_util.h" #include "tensorflow/contrib/lite/model.h" namespace tflite { + +namespace ops { +namespace builtin { + +TfLiteRegistration* Register_CONVOLUTION_REF(); +TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT(); +TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT(); +TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT(); + +} // namespace builtin +} // namespace ops + namespace { using ::testing::ElementsAreArray; @@ -30,9 +43,9 @@ class BaseConvolutionOpModel : public SingleOpModel { // TODO(ahentz): Also test different activation types, bias, padding types, // stride values.
BaseConvolutionOpModel( - const TensorData& input, const TensorData& filter, - const TensorData& output, int stride_width = 2, int stride_height = 2, - enum Padding padding = Padding_VALID, + TfLiteRegistration* registration, const TensorData& input, + const TensorData& filter, const TensorData& output, int stride_width = 2, + int stride_height = 2, enum Padding padding = Padding_VALID, enum ActivationFunctionType activation = ActivationFunctionType_NONE) { input_ = AddInput(input); filter_ = AddInput(filter); @@ -62,6 +75,8 @@ class BaseConvolutionOpModel : public SingleOpModel { stride_height, activation) .Union()); + resolver_ = absl::make_unique(BuiltinOperator_CONV_2D, + registration); BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)}); } @@ -83,12 +98,26 @@ class ConvolutionOpModel : public BaseConvolutionOpModel { void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } - std::vector GetOutput() { return ExtractVector(output_); } }; -TEST(ConvolutionOpTest, SimpleTestFloat32) { - ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}}, +const auto kKernelMap = new std::map({ + {"Reference", ops::builtin::Register_CONVOLUTION_REF()}, + {"GenericOptimized", ops::builtin::Register_CONVOLUTION_GENERIC_OPT()}, + {"MultithreadedOptimized", + ops::builtin::Register_CONVOLUTION_MULTITHREADED_OPT()}, + {"CblasOptimized", ops::builtin::Register_CONVOLUTION_CBLAS_OPT()}, +}); + +class ConvolutionOpTest : public SingleOpTest { + protected: + const std::map& GetKernelMap() override { + return *kKernelMap; + } +}; + +TEST_P(ConvolutionOpTest, SimpleTestFloat32) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}}, {TensorType_FLOAT32, {3, 2, 2, 1}}, {TensorType_FLOAT32, {}}); @@ -117,8 +146,8 @@ TEST(ConvolutionOpTest, SimpleTestFloat32) { })); } -TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) { - ConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 6, 1}}, +TEST_P(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) { + ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 6, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}}, /*stride_width=*/3, /*stride_height=*/1); @@ -139,7 +168,7 @@ TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) { })); } -TEST(ConvolutionOpTest, HandCalculatedFloat32) { +TEST_P(ConvolutionOpTest, HandCalculatedFloat32) { const int depth = 1; const int image_width = 4; const int image_height = 3; @@ -150,6 +179,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) { const int stride_height = 1; const Padding padding = Padding_SAME; ConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {image_batch_count, image_height, image_width, depth}}, {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, @@ -192,7 +222,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) { 178, 187, 234, 261, 121})); } -TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { +TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { const int depth = 1; const int image_width = 4; const int image_height = 3; @@ -203,6 +233,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { const int stride_height = 1; const Padding padding = Padding_SAME; ConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {image_batch_count, image_height, image_width, depth}}, {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, @@ -245,7 +276,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { 367, 188, 197, 244, 271, 131})); } 
-TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) { +TEST_P(ConvolutionOpTest, HandCalculatedWithReluFloat32) { const int depth = 1; const int image_width = 4; const int image_height = 3; @@ -256,6 +287,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) { const int stride_height = 1; const Padding padding = Padding_SAME; ConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {image_batch_count, image_height, image_width, depth}}, {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, @@ -300,7 +332,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) { ElementsAreArray({0, 0, 0, 0, 35, 112, 157, 0, 0, 34, 61, 0})); } -TEST(ConvolutionOpTest, HandCalculatedValidFloat32) { +TEST_P(ConvolutionOpTest, HandCalculatedValidFloat32) { const int depth = 1; const int image_width = 4; const int image_height = 3; @@ -311,6 +343,7 @@ TEST(ConvolutionOpTest, HandCalculatedValidFloat32) { const int stride_height = 1; const Padding padding = Padding_VALID; ConvolutionOpModel m( + GetRegistration(), {TensorType_FLOAT32, {image_batch_count, image_height, image_width, depth}}, {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, @@ -366,8 +399,9 @@ class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { // In this tests we set the input and output scales so that the results // match exactly the 'non-quantized' version. -TEST(ConvolutionOpTest, SimpleTestQuantized) { - QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, +TEST_P(ConvolutionOpTest, SimpleTestQuantized) { + QuantizedConvolutionOpModel m(GetRegistration(), + {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64}, {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, {TensorType_UINT8, {}, -127, 128}); m.SetInput({ @@ -405,8 +439,9 @@ TEST(ConvolutionOpTest, SimpleTestQuantized) { })); } -TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { - QuantizedConvolutionOpModel m({TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64}, +TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { + QuantizedConvolutionOpModel m(GetRegistration(), + {TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64}, {TensorType_UINT8, {1, 2, 2, 1}, -63.5, 64}, {TensorType_UINT8, {}, -127, 128}, /*stride_width=*/3, /*stride_height=*/1); @@ -430,6 +465,11 @@ TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { 167, 93, // })); } + +INSTANTIATE_TEST_CASE_P( + ConvolutionOpTest, ConvolutionOpTest, + ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc index dcdc5fffad9..ef2b5422253 100644 --- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc +++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc @@ -123,18 +123,16 @@ TEST(EmbeddingLookupOpTest, SimpleTestSqrtn) { [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; }); m.Invoke(); - EXPECT_THAT( - m.GetOutput(), - ElementsAreArray(ArrayFloatNear({ - 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, // Row 1 - 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, // - - 6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f), - 6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f), - 7.20f / std::sqrt(20.0f), - 7.26f / - std::sqrt( - 20.0f), // 2 * Row 3 + 4 * Row 0, // 2 * Row 3 + 4 * Row 0 - }))); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({ + 1.00, 1.01, 1.10, 1.11, 1.20, 1.21, // Row 1 + 0.00, 0.00, 0.00, 0.00, 
0.00, 0.00, // - + 6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f), + 6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f), + 7.20f / std::sqrt(20.0f), + 7.26f / std::sqrt(20.0f), // 2 * Row 3 + 4 * Row 0, // 2 * + // Row 3 + 4 * Row 0 + }))); } TEST(EmbeddingLookupOpTest, Indices3DTest) { diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc index 658d977b8dc..cdadbeda188 100644 --- a/tensorflow/contrib/lite/kernels/gather_test.cc +++ b/tensorflow/contrib/lite/kernels/gather_test.cc @@ -81,10 +81,8 @@ TEST(GatherOpTest, Test0DIndex) { m.SetInputFloat({-2.0, 0.2, 0.7, 0.8}); m.SetPositions({1}); m.Invoke(); - EXPECT_THAT(m.GetOutputFloat(), - ElementsAreArray(ArrayFloatNear({0.7, 0.8}))); - EXPECT_THAT(m.GetOutputShape(), - ElementsAreArray({2})); + EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.7, 0.8}))); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); } TEST(GatherOpTest, Test0DIndexWith0DResult) { @@ -94,8 +92,7 @@ TEST(GatherOpTest, Test0DIndexWith0DResult) { m.SetInputFloat({1.0, 2.0, 3.0}); m.SetPositions({1}); m.Invoke(); - EXPECT_THAT(m.GetOutputFloat(), - ElementsAreArray(ArrayFloatNear({2.0}))); + EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0}))); EXPECT_TRUE(m.GetOutputShape().empty()); } diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc index cb6038f9009..ba0ed5ce063 100644 --- a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc +++ b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc @@ -116,7 +116,10 @@ TEST(HashtableLookupOpTest, Test2DInput) { 1.0, 1.1, // 1-st item }))); EXPECT_THAT(m.GetHit(), ElementsAreArray({ - 1, 0, 1, 1, + 1, + 0, + 1, + 1, })); } diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD index 38b032c6de7..404c7d37183 100644 --- a/tensorflow/contrib/lite/kernels/internal/BUILD +++ b/tensorflow/contrib/lite/kernels/internal/BUILD @@ -124,6 +124,13 @@ config_setting( }, ) +config_setting( + name = "darwin_x86_64", + values = { + "cpu": "darwin_x86_64", + }, +) + config_setting( name = "freebsd", values = { @@ -154,6 +161,7 @@ cc_library( ":x86": tflite_deps_intel, ":x86_64": tflite_deps_intel, ":darwin": tflite_deps_intel, + ":darwin_x86_64": tflite_deps_intel, ":freebsd": tflite_deps_intel, "//conditions:default": [], }), @@ -162,6 +170,8 @@ cc_library( cc_library( name = "optimized", hdrs = [ + "optimized/cblas_conv.h", + "optimized/cblas_reference.h", "optimized/eigen_spatial_convolutions.h", "optimized/eigen_tensor_reduced_instantiations_oss.h", "optimized/multithreaded_conv.h", @@ -232,6 +242,7 @@ cc_library( ":x86": tflite_deps_intel, ":x86_64": tflite_deps_intel, ":darwin": tflite_deps_intel, + ":darwin_x86_64": tflite_deps_intel, ":freebsd": tflite_deps_intel, "//conditions:default": [], }), @@ -284,6 +295,16 @@ cc_library( ], ) +cc_library( + name = "kernel_utils", + srcs = ["kernel_utils.cc"], + hdrs = ["kernel_utils.h"], + deps = [ + ":tensor_utils", + "//tensorflow/contrib/lite:builtin_op_data", + ], +) + cc_library( name = "tensor_utils", srcs = [ diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h index 1d963afb7e1..51426bb1c58 100644 --- a/tensorflow/contrib/lite/kernels/internal/compatibility.h +++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h @@ -27,6 +27,10 @@ limitations under the License. 
#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) #endif +#ifndef TFLITE_DCHECK_NE +#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : assert(false) +#endif + #ifndef TFLITE_DCHECK_GE #define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false) #endif @@ -52,6 +56,10 @@ limitations under the License. #define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort() #endif +#ifndef TFLITE_CHECK_NE +#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : abort() +#endif + #ifndef TFLITE_CHECK_GE #define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort() #endif diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc new file mode 100644 index 00000000000..510395126ce --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc @@ -0,0 +1,44 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h" + +namespace tflite { +namespace kernel_utils { + +void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, + const float* recurrent_weights_ptr, const float* bias_ptr, + int input_size, int num_units, int batch_size, + TfLiteFusedActivation activation, + float* hidden_state_ptr_batch, float* output_ptr_batch) { + // Output = bias + tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size, + output_ptr_batch); + // Output += input * input_weights + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size, + output_ptr_batch, /*result_stride=*/1); + // Output += recurrent_weights * hidden_state + tensor_utils::MatrixBatchVectorMultiplyAccumulate( + recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch, + batch_size, output_ptr_batch, /*result_stride=*/1); + // Output = activation(Output) and update hidden_state + tensor_utils::ApplyActivationToVector( + output_ptr_batch, num_units * batch_size, activation, output_ptr_batch); + tensor_utils::VectorBatchVectorAssign(output_ptr_batch, num_units, batch_size, + hidden_state_ptr_batch); +} + +} // namespace kernel_utils +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h new file mode 100644 index 00000000000..9872d4500b8 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h @@ -0,0 +1,40 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_ + +#include "tensorflow/contrib/lite/builtin_op_data.h" + +namespace tflite { +namespace kernel_utils { + +// Performs an RNN batch inference step for inputs specified by input_ptr_batch. +// The RNN cell is specified by the pointers to its input and recurrent weights, +// and biases, along with the input size, number of units, activation. +// +// The pointers to the hidden state and the output are updated as a result. +// +// The pointers with the suffix "_batch" point to data aligned in batch_major +// order, and each step processes batch_size many inputs from input_ptr_batch, +// and updates batch_size many outputs and hidden states. +void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr, + const float* recurrent_weights_ptr, const float* bias_ptr, + int input_size, int num_units, int batch_size, + TfLiteFusedActivation activation, + float* hidden_state_ptr_batch, float* output_ptr_batch); + +} // namespace kernel_utils +} // namespace tflite +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h new file mode 100644 index 00000000000..fcb9fac6713 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h @@ -0,0 +1,89 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_ + +// The Conv implementation based on the CBLAS interface. This is only used on +// iOS for now, utilizing Apple's Accelerate framework. + +// TODO(ycling): Update the BUILD file and integrate with Apple Accelerate +// Framework when it's available.
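+// Until then, the kernel falls back to the slow, test-only cblas_sgemm
+// reference implementation in cblas_reference.h.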
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+
+namespace tflite {
+namespace cblas_ops {
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, float output_activation_min,
+                 float output_activation_max, float* output_data,
+                 const Dims<4>& output_dims, float* im2col_data,
+                 const Dims<4>& im2col_dims) {
+  gemmlowp::ScopedProfilingLabel label("Conv/cblas");
+
+  const float* gemm_input_data = nullptr;
+  const Dims<4>* gemm_input_dims = nullptr;
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    optimized_ops::Im2col(input_data, input_dims, stride_width, stride_height,
+                          pad_width, pad_height, filter_height, filter_width, 0,
+                          im2col_data, im2col_dims);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else {
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_dims = &input_dims;
+  }
+
+  // The following code computes the matrix multiplication c = a * transpose(b)
+  // with CBLAS, where:
+  // * `a` is a matrix with dimensions (m, k).
+  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+  // * `c` is a matrix with dimensions (m, n).
+  // The naming of variables is aligned with the CBLAS specification here.
+  const float* a = gemm_input_data;
+  const float* b = filter_data;
+  float* c = output_data;
+  int m = gemm_input_dims->sizes[1] * gemm_input_dims->sizes[2] *
+          gemm_input_dims->sizes[3];
+  int n = output_dims.sizes[0];
+  int k = gemm_input_dims->sizes[0];
+  // The strides of matrices a, b, and c, respectively.
+  int stride_a = k;
+  int stride_b = k;
+  int stride_c = n;
+
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
+              stride_a, b, stride_b, 0.0f, c, stride_c);
+
+  optimized_ops::AddBiasAndEvalActivationFunction(
+      bias_data, bias_dims, output_data, output_dims, output_activation_min,
+      output_activation_max);
+}
+
+}  // namespace cblas_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
new file mode 100644
index 00000000000..65789157437
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+
+// A reference implementation for a small subset of the CBLAS interface.
+// This is only used for testing the CBLAS implementation, and should never be
+// used in production code.
+
+namespace tflite {
+namespace cblas_ops {
+
+// The following code follows the original CBLAS specification, and it might
+// conflict with the TensorFlow naming convention.
+// TODO(ycling): Find another way to test CBLAS with bazel, without writing
+// a reference implementation by ourselves.
+enum CBLAS_ORDER { CblasRowMajor = 0, CblasColMajor = 1 };
+
+enum CBLAS_TRANSPOSE { CblasNoTrans = 0, CblasTrans = 1, CblasConjTrans = 2 };
+
+// A reference implementation for matrix multiplication.
+// The following code computes the matrix multiplication c = a * transpose(b)
+// with CBLAS, where:
+// * `a` is a matrix with dimensions (m, k).
+// * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+// * `c` is a matrix with dimensions (m, n).
+// The naming of variables is aligned with the CBLAS specification here.
+void cblas_sgemm(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE trans_a,
+                 const enum CBLAS_TRANSPOSE trans_b, const int m, const int n,
+                 const int k, const float alpha, const float *a,
+                 const int stride_a, const float *b, const int stride_b,
+                 const float beta, float *c, const int stride_c) {
+  TFLITE_DCHECK(order == CblasRowMajor);
+  TFLITE_DCHECK(trans_a == CblasNoTrans);
+  TFLITE_DCHECK(trans_b == CblasTrans);
+  for (int row = 0; row < m; ++row) {
+    for (int col = 0; col < n; ++col) {
+      float value = beta * c[stride_c * row + col];
+      for (int idx = 0; idx < k; ++idx) {
+        value += alpha * a[stride_a * row + idx] * b[stride_b * col + idx];
+      }
+      c[stride_c * row + col] = value;
+    }
+  }
+}
+
+}  // namespace cblas_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
index 629783d7e58..e0eca2e736b 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -36,15 +36,11 @@ inline bool TestCPUFeatureNeon() {
 #elif defined USE_NEON || defined __ARM_NEON
-inline bool TestCPUFeatureNeon() {
-  return true;
-}
+inline bool TestCPUFeatureNeon() { return true; }
 #else
-inline bool TestCPUFeatureNeon() {
-  return false;
-}
+inline bool TestCPUFeatureNeon() { return false; }
 #endif
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
index 81796e295d9..e2c87df80bd 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -992,11 +992,11 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
       for (int k = 0; k < 4; k++) {
         acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
       }
-      for (int k = 0; k < 4; k++) {
-        acc[k] = vmaxq_f32(
-            vdupq_n_f32(output_activation_min),
-            vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
-      }
+      for (int k = 0; k < 4; k++) {
+        acc[k] =
vmaxq_f32( + vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc[k])); + } for (int k = 0; k < 4; k++) { vst1q_f32(output_ptr + 4 * k, acc[k]); } diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h index f21fbf532ac..ce3cde76999 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h @@ -39,7 +39,6 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #endif - namespace Eigen { /** SpatialConvolution @@ -215,13 +214,12 @@ EIGEN_DEVICE_FUNC } // TODO(yangke): choose() is defined in TensorContraction.h -- consider // moving it to somewhere more "common". - return - input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, padding_type) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims) - .reshape(post_contract_dims); + return input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims) + .reshape(post_contract_dims); } } // end namespace Eigen diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 31bade26f98..4bcf4993e92 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -2370,13 +2370,15 @@ inline int StartIndex(int start, int stride, int dim, bool masked) { return masked ? (stride > 0 ? 0 : dim - 1) : start; } -inline int StopIndex(int stop, int stride, int dim, bool masked) { - return masked ? (stride > 0 ? dim : -1) : stop; +inline int StopIndex(int start, int stop, int stride, int dim, bool masked, + bool shrink_axis_masked) { + return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1 + : masked ? (stride > 0 ? 
dim : -1) : stop; } template inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, - int begin_mask, int end_mask, + int begin_mask, int end_mask, int shrink_axis_mask, const std::vector& starts, const std::vector& stops, const std::vector& strides, T* output_data, @@ -2387,19 +2389,23 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, const int start_b = StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8); const int stop_b = - StopIndex(stops[3], strides[3], input_dims.sizes[3], end_mask & 8); + StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3], + end_mask & 8, shrink_axis_mask & 8); const int start_h = StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4); const int stop_h = - StopIndex(stops[2], strides[2], input_dims.sizes[2], end_mask & 4); + StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2], + end_mask & 4, shrink_axis_mask & 4); const int start_w = StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2); const int stop_w = - StopIndex(stops[1], strides[1], input_dims.sizes[1], end_mask & 2); + StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1], + end_mask & 2, shrink_axis_mask & 2); const int start_d = StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1); const int stop_d = - StopIndex(stops[0], strides[0], input_dims.sizes[0], end_mask & 1); + StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0], + end_mask & 1, shrink_axis_mask & 1); T* out_ptr = output_data; for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]); @@ -2417,6 +2423,18 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, } } +template +inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, + int begin_mask, int end_mask, + const std::vector& starts, + const std::vector& stops, + const std::vector& strides, T* output_data, + const Dims<4>& output_dims) { + StridedSlice(input_data, input_dims, begin_mask, end_mask, + /*shrink_axis_mask=*/0, starts, stops, strides, output_data, + output_dims); +} + template inline void Slice(const T* input_data, const Dims<4>& input_dims, const std::vector& begin, const std::vector& size, diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc index b0546c00cf9..955e8c5764c 100644 --- a/tensorflow/contrib/lite/kernels/kernel_util.cc +++ b/tensorflow/contrib/lite/kernels/kernel_util.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/kernels/kernel_util.h" + #include #include +#include + #include "tensorflow/contrib/lite/kernels/internal/round.h" namespace tflite { @@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation, } } +bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) { + return TfLiteIntArrayEqual(input1->dims, input2->dims); +} + +TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, + TfLiteTensor* input1, + TfLiteTensor* input2, + TfLiteIntArray** output_shape) { + int64_t dims1 = NumDimensions(input1); + int64_t dims2 = NumDimensions(input2); + int64_t out_dims = std::max(dims1, dims2); + std::unique_ptr shape( + TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree); + for (int i = 0; i < out_dims; ++i) { + int64_t d1 = i >= dims1 ? 
1 : SizeOfDimension(input1, dims1 - i - 1); + int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1); + TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1); + shape->data[out_dims - i - 1] = std::max(d1, d2); + } + *output_shape = shape.release(); + return kTfLiteOk; +} + } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h index bfdfba00f51..28f53b9fbbc 100644 --- a/tensorflow/contrib/lite/kernels/kernel_util.h +++ b/tensorflow/contrib/lite/kernels/kernel_util.h @@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node, inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; } inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; } +inline int64_t NumElements(const TfLiteTensor* t) { + int64_t count = 1; + for (int i = 0; i < NumDimensions(t); ++i) { + count *= SizeOfDimension(t, i); + } + return count; +} + inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, const TfLiteNode* node, int index) { const bool use_tensor = node->inputs->data[index] != kOptionalTensor; @@ -57,7 +65,10 @@ inline bool IsDynamicTensor(TfLiteTensor* tensor) { // Sets tensor to dynamic. inline void SetTensorToDynamic(TfLiteTensor* tensor) { - tensor->allocation_type = kTfLiteDynamic; + if (tensor->allocation_type != kTfLiteDynamic) { + tensor->allocation_type = kTfLiteDynamic; + tensor->data.raw = nullptr; + } } // Calculates the multiplication factor for a quantized convolution (or @@ -76,6 +87,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation, float* activation_min, float* activation_max); +// Return true if the given tensors have the same shape. +bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2); + +// Calculate the output_shape that is necessary for element-wise operations +// with broadcasting involving the two input tensors. +TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context, + TfLiteTensor* input1, + TfLiteTensor* input2, + TfLiteIntArray** output_shape); } // namespace tflite #endif // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_ diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc new file mode 100644 index 00000000000..c65b68970f6 --- /dev/null +++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc @@ -0,0 +1,152 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +#include +#include +#include "tensorflow/contrib/lite/testing/util.h" + +namespace tflite { +namespace { + +void ReportError(TfLiteContext* context, const char* format, ...) 
{} + +class KernelUtilTest : public ::testing::Test { + public: + KernelUtilTest() { + context_.ReportError = ReportError; + + tensor1_.dims = nullptr; + tensor2_.dims = nullptr; + tensor1_.allocation_type = kTfLiteMmapRo; + tensor2_.allocation_type = kTfLiteMmapRo; + } + ~KernelUtilTest() { + TfLiteTensorFree(&tensor1_); + TfLiteTensorFree(&tensor2_); + } + + void SetShape(TfLiteTensor* tensor, std::initializer_list dims) { + TfLiteTensorFree(tensor); + tensor->dims = TfLiteIntArrayCreate(dims.size()); + int i = 0; + for (int d : dims) { + tensor->dims->data[i] = d; + ++i; + } + } + + std::vector GetShape(TfLiteIntArray* dims) { + std::vector result; + for (int i = 0; i < dims->size; ++i) { + result.push_back(dims->data[i]); + } + return result; + } + + protected: + TfLiteContext context_; + TfLiteTensor tensor1_; + TfLiteTensor tensor2_; +}; + +TEST_F(KernelUtilTest, SameShapeEmpty) { + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor1_, {1, 2, 3}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2, 3, 4}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {1, 2, 3}); + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor2_, {}); + EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_)); + + SetShape(&tensor1_, {}); + EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_)); +} + +TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {1, 3}); + EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_EQ(output, nullptr); +} + +TEST_F(KernelUtilTest, BroadcastShapeOnes) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 1}); + SetShape(&tensor2_, {1, 3}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {1, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + TfLiteIntArrayFree(output); +} + +TEST_F(KernelUtilTest, BroadcastShapeScalars) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {}); + SetShape(&tensor2_, {2}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2)); + TfLiteIntArrayFree(output); +} + +TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) { + TfLiteIntArray* output = nullptr; + SetShape(&tensor1_, {1, 2}); + SetShape(&tensor2_, {3, 1, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2)); + TfLiteIntArrayFree(output); + + SetShape(&tensor1_, {1, 2, 3, 4}); + SetShape(&tensor2_, {1, 3, 1}); + EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_, + &tensor2_, &output)); + EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4)); + TfLiteIntArrayFree(output); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + 
::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc index 540e5a364dd..ec1c4020276 100644 --- a/tensorflow/contrib/lite/kernels/mean.cc +++ b/tensorflow/contrib/lite/kernels/mean.cc @@ -35,10 +35,12 @@ struct MeanContext { MeanContext(TfLiteContext* context, TfLiteNode* node) { params = reinterpret_cast(node->builtin_data); input = GetInput(context, node, 0); + axis = GetInput(context, node, 1); output = GetOutput(context, node, 0); } TfLiteMeanParams* params; TfLiteTensor* input; + TfLiteTensor* axis; TfLiteTensor* output; }; @@ -54,45 +56,26 @@ void Free(TfLiteContext* context, void* buffer) { delete reinterpret_cast(buffer); } -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - - MeanContext op_context(context, node); - int input_num_dims = NumDimensions(op_context.input); - int axis_num_dims = op_context.params->num_axis_dimensions; - - // Creates a temp index to iterate through input data. - int* scratch_tensor_index = reinterpret_cast(node->user_data); - TfLiteIntArrayFree(node->temporaries); - node->temporaries = TfLiteIntArrayCreate(2); - node->temporaries->data[0] = *scratch_tensor_index; - TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]]; - scratch_tensor->type = kTfLiteInt32; - scratch_tensor->allocation_type = kTfLiteArenaRw; - TfLiteIntArray* index_size = TfLiteIntArrayCreate(1); - index_size->data[0] = input_num_dims; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, scratch_tensor, index_size)); - - // Creates a temp tensor to store resolved axis given input data. - node->temporaries->data[1] = *scratch_tensor_index + 1; - TfLiteTensor* axis_tensor = &context->tensors[node->temporaries->data[1]]; - axis_tensor->type = kTfLiteInt32; - axis_tensor->allocation_type = kTfLiteArenaRw; +// Resizes the temp tensor that stores resolved axis. +TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context, + TfLiteTensor* resolved_axis) { TfLiteIntArray* axis_size = TfLiteIntArrayCreate(1); - axis_size->data[0] = op_context.params->num_axis_dimensions; - TF_LITE_ENSURE_OK(context, - context->ResizeTensor(context, axis_tensor, axis_size)); + axis_size->data[0] = static_cast(NumElements(op_context->axis)); + return context->ResizeTensor(context, resolved_axis, axis_size); +} - // Determines size of output tensor. - const TfLiteIntArray* input_dims = op_context.input->dims; - const int* axis = op_context.params->axis; - if (op_context.params->keep_dims) { +// Resizes output array based on the input size and resolved axis. 
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + MeanContext* op_context) { + size_t num_axis = NumElements(op_context->axis); + const TfLiteIntArray* input_dims = op_context->input->dims; + int input_num_dims = NumDimensions(op_context->input); + const int* axis = GetTensorData(op_context->axis); + if (op_context->params->keep_dims) { TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims); for (int idx = 0; idx < input_num_dims; ++idx) { bool is_axis = false; - for (int axis_idx = 0; axis_idx < axis_num_dims; ++axis_idx) { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) { is_axis = true; break; @@ -104,11 +87,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_dims->data[idx] = input_dims->data[idx]; } } - return context->ResizeTensor(context, op_context.output, output_dims); + return context->ResizeTensor(context, op_context->output, output_dims); } else { // Calculates size of reducing axis. - int num_reduce_axis = axis_num_dims; - for (int i = 0; i < axis_num_dims; ++i) { + int num_reduce_axis = num_axis; + for (int i = 0; i < num_axis; ++i) { int current = axis[i]; if (current < 0) { current += input_num_dims; @@ -131,7 +114,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { int num_skip_axis = 0; for (int idx = 0; idx < input_num_dims; ++idx) { bool is_axis = false; - for (int axis_idx = 0; axis_idx < axis_num_dims; ++axis_idx) { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) { ++num_skip_axis; is_axis = true; @@ -142,24 +125,76 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_dims->data[idx - num_skip_axis] = input_dims->data[idx]; } } - return context->ResizeTensor(context, op_context.output, output_dims); + return context->ResizeTensor(context, op_context->output, output_dims); } } +// Initializes temp tensors to store index and resolved axis. +TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node, + MeanContext* op_context) { + // Creates a temp index to iterate through input data. + int* scratch_tensor_index = reinterpret_cast(node->user_data); + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(2); + node->temporaries->data[0] = *scratch_tensor_index; + TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]]; + scratch_tensor->type = kTfLiteInt32; + scratch_tensor->allocation_type = kTfLiteArenaRw; + TfLiteIntArray* index_size = TfLiteIntArrayCreate(1); + index_size->data[0] = NumDimensions(op_context->input); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, scratch_tensor, index_size)); + + // Creates a temp tensor to store resolved axis given input data. + node->temporaries->data[1] = *scratch_tensor_index + 1; + TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]]; + resolved_axis->type = kTfLiteInt32; + return kTfLiteOk; +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + MeanContext op_context(context, node); + TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context)); + + TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]]; + // Leaves work to Eval if axis is not constant; else resizes output. 
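// A concrete illustration of the resize, assuming the shapes used by the
// const-axis tests in mean_test.cc below: a {4, 3, 2} input reduced over
// resolved axes {0, 1} yields an output of shape {2} when keep_dims is false,
// and {1, 1, 2} when keep_dims is true.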
+ if (!IsConstantTensor(op_context.axis)) { + SetTensorToDynamic(op_context.output); + SetTensorToDynamic(resolved_axis); + return kTfLiteOk; + } + resolved_axis->allocation_type = kTfLiteArenaRw; + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, &op_context, resolved_axis)); + return ResizeOutputTensor(context, &op_context); +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { MeanContext op_context(context, node); + int num_axis = static_cast(NumElements(op_context.axis)); TfLiteTensor* temp_index = &context->tensors[node->temporaries->data[0]]; TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]]; + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, + ResizeTempAxis(context, &op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(resolved_axis->bytes, resolved_axis); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } -#define TF_LITE_MEAN(kernel_type, data_type) \ - kernel_type::Mean<>( \ - GetTensorData(op_context.input), \ - op_context.input->dims->data, op_context.input->dims->size, \ - GetTensorData(op_context.output), \ - op_context.output->dims->data, op_context.output->dims->size, \ - op_context.params->axis, op_context.params->num_axis_dimensions, \ - op_context.params->keep_dims, GetTensorData(temp_index), \ +#define TF_LITE_MEAN(kernel_type, data_type) \ + kernel_type::Mean<>( \ + GetTensorData(op_context.input), \ + op_context.input->dims->data, op_context.input->dims->size, \ + GetTensorData(op_context.output), \ + op_context.output->dims->data, op_context.output->dims->size, \ + GetTensorData(op_context.axis), num_axis, \ + op_context.params->keep_dims, GetTensorData(temp_index), \ GetTensorData(resolved_axis)) if (kernel_type == kReference) { diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc index 4305c0632f5..c4c53c2ded3 100644 --- a/tensorflow/contrib/lite/kernels/mean_test.cc +++ b/tensorflow/contrib/lite/kernels/mean_test.cc @@ -25,58 +25,108 @@ using ::testing::ElementsAreArray; class BaseMeanOpModel : public SingleOpModel { public: - BaseMeanOpModel(const TensorData& input, const TensorData& output, - std::initializer_list axis, bool keep_dims) { - input_ = AddInput(input); - output_ = AddOutput(output); - SetBuiltinOp( - BuiltinOperator_MEAN, BuiltinOptions_MeanOptions, - CreateMeanOptions(builder_, builder_.CreateVector(axis), keep_dims) - .Union()); - BuildInterpreter({GetShape(input_)}); - } + void SetAxis(std::initializer_list data) { PopulateTensor(axis_, data); } - int input() { return input_; } - - protected: - int input_; - int output_; -}; - -class FloatMeanOpModel : public BaseMeanOpModel { - public: - using BaseMeanOpModel::BaseMeanOpModel; - - void SetInput(std::initializer_list data) { + template + void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } - std::vector GetOutput() { return ExtractVector(output_); } + template + std::vector GetOutput() { + return ExtractVector(output_); + } + std::vector GetOutputShape() { return GetTensorShape(output_); } + + protected: + int input_; + int axis_; + int output_; }; -TEST(FloatMeanOpTest, NotKeepDims) { +// Model for the tests case where axis is a const tensor. 
+class MeanOpConstModel : public BaseMeanOpModel { + public: + MeanOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list axis_shape, + std::initializer_list axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions, + CreateMeanOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Model for the tests case where axis is a dynamic tensor. +class MeanOpDynamicModel : public BaseMeanOpModel { + public: + MeanOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions, + CreateMeanOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +TEST(ConstMeanOpTest, NotKeepDims) { std::initializer_list data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - FloatMeanOpModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, - {1, 0, -3, -3}, false); + MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, + {4}, {1, 0, -3, -3}, false); m.SetInput(data); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); - EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13}))); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13}))); } -TEST(FloatMeanOpTest, KeepDims) { +TEST(ConstMeanOpTest, KeepDims) { std::initializer_list data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; - FloatMeanOpModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, - {0, 2}, true); + MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, + {2}, {0, 2}, true); m.SetInput(data); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); - EXPECT_THAT(m.GetOutput(), + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); +} + +TEST(DynamicMeanOpTest, NotKeepDims) { + std::initializer_list data = { + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, + false); + std::initializer_list axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13}))); +} + +TEST(DynamicMeanOpTest, KeepDims) { + std::initializer_list data = { + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, + 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}}, + true); + std::initializer_list axis = {0, 2}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); } diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc index 
81c73f25231..54575019de4 100644 --- a/tensorflow/contrib/lite/kernels/mul.cc +++ b/tensorflow/contrib/lite/kernels/mul.cc @@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0; constexpr int kInputTensor2 = 1; constexpr int kOutputTensor = 0; +struct OpData { + bool requires_broadcast; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* data = new OpData; + data->requires_broadcast = false; + return data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast(node->user_data); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2)); - for (int i = 0; i < NumDimensions(input1); ++i) { - TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i), - SizeOfDimension(input2, i)); + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + output->type = input2->type; + + data->requires_broadcast = !HaveSameShapes(input1, input2); + + TfLiteIntArray* output_size = nullptr; + if (data->requires_broadcast) { + TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast( + context, input1, input2, &output_size)); + } else { + output_size = TfLiteIntArrayCopy(input1->dims); } - TF_LITE_ENSURE_EQ(context, input1->type, output->type); - TF_LITE_ENSURE_EQ(context, input2->type, output->type); - - TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims); return context->ResizeTensor(context, output, output_size); } template void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteMulParams* params, TfLiteTensor* input1, - TfLiteTensor* input2, TfLiteTensor* output) { + TfLiteMulParams* params, const OpData* data, + TfLiteTensor* input1, TfLiteTensor* input2, + TfLiteTensor* output) { float output_activation_min, output_activation_max; CalculateActivationRangeFloat(params->activation, &output_activation_min, &output_activation_max); -#define TF_LITE_MUL(type) \ - type::Mul(GetTensorData(input1), GetTensorDims(input1), \ - GetTensorData(input2), GetTensorDims(input2), \ - output_activation_min, output_activation_max, \ - GetTensorData(output), GetTensorDims(output)) +#define TF_LITE_MUL(type, opname) \ + type::opname(GetTensorData(input1), GetTensorDims(input1), \ + GetTensorData(input2), GetTensorDims(input2), \ + output_activation_min, output_activation_max, \ + GetTensorData(output), GetTensorDims(output)) if (kernel_type == kReference) { - TF_LITE_MUL(reference_ops); + if (data->requires_broadcast) { + TF_LITE_MUL(reference_ops, BroadcastMul); + } else { + TF_LITE_MUL(reference_ops, Mul); + } } else { - TF_LITE_MUL(optimized_ops); + if (data->requires_broadcast) { + TF_LITE_MUL(optimized_ops, BroadcastMul); + } else { + TF_LITE_MUL(optimized_ops, Mul); + } } #undef TF_LITE_MUL } template void EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteMulParams* params, TfLiteTensor* input1, - TfLiteTensor* input2, TfLiteTensor* output) { + TfLiteMulParams* params, const OpData* data, + TfLiteTensor* input1, TfLiteTensor* input2, + TfLiteTensor* output) { auto input1_offset = -input1->params.zero_point; auto input2_offset = -input2->params.zero_point; auto output_offset = output->params.zero_point; @@ 
-98,17 +127,19 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, CalculateActivationRangeUint8(params->activation, output, &output_activation_min, &output_activation_max); -#define TF_LITE_MUL(type) \ - type::BroadcastMul(GetTensorData(input1), GetTensorDims(input1), \ - input1_offset, GetTensorData(input2), \ - GetTensorDims(input2), input2_offset, output_offset, \ - output_multiplier, output_shift, output_activation_min, \ - output_activation_max, GetTensorData(output), \ - GetTensorDims(output)); +#define TF_LITE_MUL(type, opname) \ + type::opname(GetTensorData(input1), GetTensorDims(input1), \ + input1_offset, GetTensorData(input2), \ + GetTensorDims(input2), input2_offset, output_offset, \ + output_multiplier, output_shift, output_activation_min, \ + output_activation_max, GetTensorData(output), \ + GetTensorDims(output)); + // The quantized version of Mul doesn't support activations, so we + // always use BroadcastMul. if (kernel_type == kReference) { - TF_LITE_MUL(reference_ops); + TF_LITE_MUL(reference_ops, BroadcastMul); } else { - TF_LITE_MUL(optimized_ops); + TF_LITE_MUL(optimized_ops, BroadcastMul); } #undef TF_LITE_MUL } @@ -116,15 +147,17 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); + OpData* data = reinterpret_cast(node->user_data); TfLiteTensor* input1 = GetInput(context, node, kInputTensor1); TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); if (output->type == kTfLiteFloat32) { - EvalFloat(context, node, params, input1, input2, output); + EvalFloat(context, node, params, data, input1, input2, output); } else if (output->type == kTfLiteUInt8) { - EvalQuantized(context, node, params, input1, input2, output); + EvalQuantized(context, node, params, data, input1, input2, + output); } else { context->ReportError(context, "Mul only supports FLOAT32 and quantized UINT8 now."); @@ -137,19 +170,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace mul TfLiteRegistration* Register_MUL_REF() { - static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare, + static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare, mul::Eval}; return &r; } TfLiteRegistration* Register_MUL_GENERIC_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare, + static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare, mul::Eval}; return &r; } TfLiteRegistration* Register_MUL_NEON_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare, + static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare, mul::Eval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/contrib/lite/kernels/mul_test.cc index 8838b300c0a..f1a30f82634 100644 --- a/tensorflow/contrib/lite/kernels/mul_test.cc +++ b/tensorflow/contrib/lite/kernels/mul_test.cc @@ -25,10 +25,11 @@ using ::testing::ElementsAreArray; class BaseMulOpModel : public SingleOpModel { public: - BaseMulOpModel(TensorData input, TensorData output, + BaseMulOpModel(const TensorData& input1, const TensorData& input2, + const TensorData& output, ActivationFunctionType activation_type) { - input1_ = AddInput(input); - input2_ = AddInput(input); + input1_ = AddInput(input1); + input2_ = AddInput(input2); output_ = AddOutput(output); SetBuiltinOp(BuiltinOperator_MUL, BuiltinOptions_MulOptions, CreateMulOptions(builder_, 
activation_type).Union()); @@ -70,6 +71,7 @@ class QuantizedMulOpModel : public BaseMulOpModel { TEST(FloatMulOpTest, NoActivation) { FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5}); @@ -79,9 +81,9 @@ TEST(FloatMulOpTest, NoActivation) { } TEST(FloatMulOpTest, ActivationRELU_N1_TO_1) { - FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, - {TensorType_FLOAT32, {}}, - ActivationFunctionType_RELU_N1_TO_1); + FloatMulOpModel m( + {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 5}); m.Invoke(); @@ -94,6 +96,7 @@ TEST(FloatMulOpTest, VariousInputShapes) { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; for (int i = 0; i < test_shapes.size(); ++i) { FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]}, + {TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1}); @@ -105,8 +108,26 @@ TEST(FloatMulOpTest, VariousInputShapes) { } } +TEST(FloatMulOpTest, WithBroadcast) { + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]}, + {TensorType_FLOAT32, {}}, // always a scalar + {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); + m.PopulateTensor(m.input2(), {0.1}); + m.Invoke(); + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray(ArrayFloatNear({-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}))) + << "With shape number " << i; + } +} + TEST(QuantizedMulOpTest, NoActivation) { QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0}, ActivationFunctionType_NONE); m.QuantizeAndPopulate(m.input1(), {-0.8, 0.2, 0.9, 0.7}); @@ -117,6 +138,32 @@ TEST(QuantizedMulOpTest, NoActivation) { kQuantizedTolerance))); } +// for quantized Mul, the error shouldn't exceed 2*step +float GetTolerance(int min, int max) { + float kQuantizedStep = (max - min) / 255.0; + float kQuantizedTolerance = 2.0 * kQuantizedStep; + return kQuantizedTolerance; +} + +TEST(QuantizedMulOpTest, WithBroadcast) { + float kQuantizedTolerance = GetTolerance(-3.0, 3.0); + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0}, + {TensorType_UINT8, {}, -3.0, 3.0}, // always a scalar + {TensorType_UINT8, {}, -3.0, 3.0}, + ActivationFunctionType_NONE); + m.QuantizeAndPopulate(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0}); + m.QuantizeAndPopulate(m.input2(), {0.1}); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance))) + << "With shape number " << i; + } +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc index 17166715ca3..cee3ec6197c 100644 --- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc +++ 
b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc @@ -243,7 +243,6 @@ class LSTMOpModel : public SingleOpModel { int n_output_; }; - TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { const int n_batch = 1; const int n_input = 2; @@ -282,7 +281,6 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) { {0}, // projection_bias tensor }); - lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781, 0.04717243, 0.48944736, -0.38535351, -0.17212132}); diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc index 4003ed10df4..48114e5a406 100644 --- a/tensorflow/contrib/lite/kernels/pad.cc +++ b/tensorflow/contrib/lite/kernels/pad.cc @@ -177,9 +177,7 @@ TfLiteRegistration* Register_PAD_GENERIC_OPT() { return &r; } -TfLiteRegistration* Register_PAD() { - return Register_PAD_GENERIC_OPT(); -} +TfLiteRegistration* Register_PAD() { return Register_PAD_GENERIC_OPT(); } } // namespace builtin } // namespace ops diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc index f605deaa5b4..1fb779fd517 100644 --- a/tensorflow/contrib/lite/kernels/register.cc +++ b/tensorflow/contrib/lite/kernels/register.cc @@ -31,6 +31,7 @@ TfLiteRegistration* Register_CONV_2D(); TfLiteRegistration* Register_DEPTHWISE_CONV_2D(); TfLiteRegistration* Register_SVDF(); TfLiteRegistration* Register_RNN(); +TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN(); TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN(); TfLiteRegistration* Register_EMBEDDING_LOOKUP(); TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE(); @@ -73,6 +74,8 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D()); AddBuiltin(BuiltinOperator_SVDF, Register_SVDF()); AddBuiltin(BuiltinOperator_RNN, Register_RNN()); + AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, + Register_BIDIRECTIONAL_SEQUENCE_RNN()); AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, Register_UNIDIRECTIONAL_SEQUENCE_RNN()); AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP()); diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc index 9a419af0238..4a2101f2468 100644 --- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc +++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc @@ -36,6 +36,17 @@ constexpr int kInputTensor = 0; constexpr int kSizeTensor = 1; constexpr int kOutputTensor = 0; +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, TfLiteTensor* input, + TfLiteTensor* size, TfLiteTensor* output) { + TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); + output_size->data[0] = input->dims->data[0]; + const int32* size_data = GetTensorData(size); + output_size->data[1] = size_data[0]; + output_size->data[2] = size_data[1]; + output_size->data[3] = input->dims->data[3]; + return context->ResizeTensor(context, output, output_size); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -55,9 +66,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // integers. output->type = kTfLiteFloat32; - // TODO(ahentz): if the input is constant, we can allocate here. 
- output->allocation_type = kTfLiteDynamic; - return kTfLiteOk; + if (!IsConstantTensor(size)) { + SetTensorToDynamic(output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, input, size, output); } template @@ -66,15 +79,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TfLiteTensor* size = GetInput(context, node, kSizeTensor); - // TODO(ahentz): we only need to do this here if it wasn't done in Eval(). - TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); - output_size->data[0] = input->dims->data[0]; - const int32* size_data = GetTensorData(size); - output_size->data[1] = size_data[0]; - output_size->data[2] = size_data[1]; - output_size->data[3] = input->dims->data[3]; - context->ResizeTensor(context, output, output_size); - TfLiteTensorRealloc(output->bytes, output); + if (IsDynamicTensor(output)) { + TF_LITE_ENSURE_OK(context, + ResizeOutputTensor(context, input, size, output)); + TfLiteTensorRealloc(output->bytes, output); + } if (output->type == kTfLiteFloat32) { #define TF_LITE_RESIZE_BILINEAR(type) \ diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc index 2b1aaf654f8..4e03f3820a5 100644 --- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc +++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc @@ -25,14 +25,24 @@ using ::testing::ElementsAreArray; class ResizeBilinearOpModel : public SingleOpModel { public: - ResizeBilinearOpModel(std::initializer_list input_shape) { - input_ = AddInput(TensorType_FLOAT32); - size_ = AddInput(TensorType_INT32); - output_ = AddOutput(TensorType_FLOAT32); + ResizeBilinearOpModel(const TensorData& input, + std::initializer_list size_data = {}) { + bool const_size = size_data.size() != 0; + input_ = AddInput(input); + if (const_size) { + size_ = AddConstInput(TensorType_INT32, size_data, {2}); + } else { + size_ = AddInput({TensorType_INT32, {2}}); + } + output_ = AddOutput(TensorType_FLOAT32); // Always float. 
SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR, BuiltinOptions_ResizeBilinearOptions, CreateResizeBilinearOptions(builder_).Union()); - BuildInterpreter({input_shape, {2}}); + if (const_size) { + BuildInterpreter({GetShape(input_)}); + } else { + BuildInterpreter({GetShape(input_), GetShape(size_)}); + } } void SetInput(std::initializer_list data) { @@ -49,23 +59,33 @@ class ResizeBilinearOpModel : public SingleOpModel { }; TEST(ResizeBilinearOpTest, HorizontalResize) { - ResizeBilinearOpModel m({1, 1, 2, 1}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}}); m.SetInput({3, 6}); m.SetSize({1, 3}); m.Invoke(); EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6}))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3}); + const_m.SetInput({3, 6}); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6}))); } TEST(ResizeBilinearOpTest, VerticalResize) { - ResizeBilinearOpModel m({1, 2, 1, 1}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}}); m.SetInput({3, 9}); m.SetSize({3, 1}); m.Invoke(); EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9}))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1}); + const_m.SetInput({3, 9}); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9}))); } TEST(ResizeBilinearOpTest, TwoDimensionalResize) { - ResizeBilinearOpModel m({1, 2, 2, 1}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}}); m.SetInput({ 3, 6, // 9, 12 // @@ -77,10 +97,22 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize) { 7, 9, 10, // 9, 11, 12, // }))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3}); + const_m.SetInput({ + 3, 6, // + 9, 12 // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + }))); } TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) { - ResizeBilinearOpModel m({2, 2, 2, 1}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}}); m.SetInput({ 3, 6, // 9, 12, // @@ -97,10 +129,27 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) { 8, 12, 14, // 10, 14, 16, // }))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3}); + const_m.SetInput({ + 3, 6, // + 9, 12, // + 4, 10, // + 10, 16 // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 3, 5, 6, // + 7, 9, 10, // + 9, 11, 12, // + 4, 8, 10, // + 8, 12, 14, // + 10, 14, 16, // + }))); } TEST(ResizeBilinearOpTest, ThreeDimensionalResize) { - ResizeBilinearOpModel m({1, 2, 2, 2}); + ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}); m.SetInput({ 3, 4, 6, 10, // 9, 10, 12, 16, // @@ -112,6 +161,18 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize) { 7, 8, 9, 12, 10, 14, // 9, 10, 11, 14, 12, 16, // }))); + + ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3}); + const_m.SetInput({ + 3, 4, 6, 10, // + 9, 10, 12, 16, // + }); + const_m.Invoke(); + EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 3, 4, 5, 8, 6, 10, // + 7, 8, 9, 12, 10, 14, // + 9, 10, 11, 14, 12, 16, // + }))); } } // namespace diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc index 2e22d0db56a..e2e1873f770 100644 --- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc +++ 
b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc @@ -33,17 +33,16 @@ enum KernelType { kGenericOptimized, }; -// Inputs specified in the 2nd tensor (block_shape) and 3rd tensor (paddings) -// are ignored. Only use the `block_shape` and `paddings` specified in params. -// TODO(nupurgarg): Support inputs as tensors in SpaceToBatchND. struct SpaceToBatchNDContext { SpaceToBatchNDContext(TfLiteContext* context, TfLiteNode* node) { - params = reinterpret_cast(node->builtin_data); input = GetInput(context, node, 0); + block_shape = GetInput(context, node, 1); + paddings = GetInput(context, node, 2); output = GetOutput(context, node, 0); } - TfLiteSpaceToBatchNDParams* params; TfLiteTensor* input; + TfLiteTensor* block_shape; + TfLiteTensor* paddings; TfLiteTensor* output; }; @@ -51,32 +50,29 @@ struct SpaceToBatchNDContext { // The 4D array need to have exactly 2 spatial dimensions. // TODO(nupurgarg): Support arbitrary dimension in SpaceToBatchND. const int kInputDimensionNum = 4; -const int kOutputDimensionNum = 4; +const int kBlockSizeDimensionNum = 1; const int kSpatialDimensionNum = 2; -const int kPaddingDimensionNum = 4; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE(context, NumInputs(node) >= 1 && NumInputs(node) <= 3); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + SpaceToBatchNDContext* op_context) { + TfLiteIntArray* input_size = op_context->input->dims; + const int32* block_shape = GetTensorData(op_context->block_shape); + const int32* paddings_data = GetTensorData(op_context->paddings); - SpaceToBatchNDContext op_context(context, node); - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input), - kInputDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.params->num_spatial_dimensions, + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape), + kBlockSizeDimensionNum); + TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0], + kSpatialDimensionNum); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->paddings), kSpatialDimensionNum); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); - const TfLiteIntArray* input_size = op_context.input->dims; - const int* block_shape = op_context.params->block_shape; - - TfLiteIntArray* output_size = TfLiteIntArrayCreate(kOutputDimensionNum); + TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size); // Ensures the input height and width (with padding) is a multiple of block // shape height and width. 
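// A worked example, assuming the shapes from SimplePaddingConstTest below: an
// input of shape {1, 5, 2, 1} with block_shape {3, 2} and paddings
// {1, 0, 2, 0} pads the spatial dims to {6, 4}; both divide evenly by the
// block shape, so the output shape is {1 * 3 * 2, 6 / 3, 4 / 2, 1} =
// {6, 2, 2, 1}.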
for (int dim = 0; dim < kSpatialDimensionNum; ++dim) { - int final_dim_size = - (input_size->data[dim + 1] + op_context.params->before_paddings[dim] + - op_context.params->after_paddings[dim]); + int final_dim_size = (input_size->data[dim + 1] + paddings_data[dim * 2] + + paddings_data[dim * 2 + 1]); TF_LITE_ENSURE_EQ(context, final_dim_size % block_shape[dim], 0); output_size->data[dim + 1] = final_dim_size / block_shape[dim]; } @@ -88,33 +84,44 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_size->data[0] = output_batch_size; output_size->data[3] = output_channel_size; - return context->ResizeTensor(context, op_context.output, output_size); + return context->ResizeTensor(context, op_context->output, output_size); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 3); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + SpaceToBatchNDContext op_context(context, node); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input), + kInputDimensionNum); + TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + + if (!IsConstantTensor(op_context.block_shape) || + !IsConstantTensor(op_context.paddings)) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, &op_context); } template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { SpaceToBatchNDContext op_context(context, node); - int block_shape_dims_array[1] = {kSpatialDimensionNum}; - Dims<4> block_shape_dims = GetTensorDims(block_shape_dims_array, 1); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } - // Initialize padding array in the format accepted by the kernel code. - // TODO(nupurgarg): Make kernel code accept padding array format that is - // consistent with Pad operation (i.e. before_paddings and after_paddings). - TfLiteIntArray* padding_data = TfLiteIntArrayCreate(kPaddingDimensionNum); - padding_data->data[0] = op_context.params->before_paddings[0]; - padding_data->data[1] = op_context.params->after_paddings[0]; - padding_data->data[2] = op_context.params->before_paddings[1]; - padding_data->data[3] = op_context.params->after_paddings[1]; - int padding_dims_array[1] = {kPaddingDimensionNum}; - Dims<4> padding_dims = GetTensorDims(padding_dims_array, 1); - -#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar) \ - type::SpaceToBatchND(GetTensorData(op_context.input), \ - GetTensorDims(op_context.input), \ - op_context.params->block_shape, block_shape_dims, \ - padding_data->data, padding_dims, \ - GetTensorData(op_context.output), \ +#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar) \ + type::SpaceToBatchND(GetTensorData(op_context.input), \ + GetTensorDims(op_context.input), \ + GetTensorData(op_context.block_shape), \ + GetTensorDims(op_context.block_shape), \ + GetTensorData(op_context.paddings), \ + GetTensorDims(op_context.paddings), \ + GetTensorData(op_context.output), \ GetTensorDims(op_context.output)) switch (op_context.input->type) { // Already know in/out types are same. 
case kTfLiteFloat32: @@ -151,8 +158,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } #undef TF_LITE_SPACE_TO_BATCH_ND - - TfLiteIntArrayFree(padding_data); return kTfLiteOk; } diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc index 45a6aef73d0..92a4a037d58 100644 --- a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc +++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc @@ -26,41 +26,81 @@ using ::testing::ElementsAreArray; class SpaceToBatchNDOpModel : public SingleOpModel { public: - SpaceToBatchNDOpModel(std::initializer_list input_shape, - std::initializer_list block_shape, - std::initializer_list before_paddings, - std::initializer_list after_paddings) { - input_ = AddInput(TensorType_FLOAT32); - output_ = AddOutput(TensorType_FLOAT32); - SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND, - BuiltinOptions_SpaceToBatchNDOptions, - CreateSpaceToBatchNDOptions( - builder_, builder_.CreateVector(block_shape), - builder_.CreateVector(before_paddings), - builder_.CreateVector(after_paddings)) - .Union()); - BuildInterpreter({input_shape}); - } - void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } + void SetBlockShape(std::initializer_list data) { + PopulateTensor(block_shape_, data); + } + + void SetPaddings(std::initializer_list data) { + PopulateTensor(paddings_, data); + } + std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } - private: + protected: int input_; + int block_shape_; + int paddings_; int output_; }; +// Tests case where block_shape and paddings are const tensors. +// +// Example usage is as follows: +// SpaceToBatchNDOpConstModel m(input_shape, block_shape, paddings); +// m.SetInput(input_data); +// m.Invoke(); +class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel { + public: + SpaceToBatchNDOpConstModel(std::initializer_list input_shape, + std::initializer_list block_shape, + std::initializer_list paddings) { + input_ = AddInput(TensorType_FLOAT32); + block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2}); + paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2}); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND, + BuiltinOptions_SpaceToBatchNDOptions, + CreateSpaceToBatchNDOptions(builder_).Union()); + BuildInterpreter({input_shape}); + } +}; + +// Tests case where block_shape and paddings are non-const tensors. 
+// +// Example usage is as follows: +// SpaceToBatchNDOpDynamicModel m(input_shape); +// m.SetInput(input_data); +// m.SetBlockShape(block_shape); +// m.SetPaddings(paddings); +// m.Invoke(); +class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel { + public: + SpaceToBatchNDOpDynamicModel(std::initializer_list input_shape) { + input_ = AddInput(TensorType_FLOAT32); + block_shape_ = AddInput(TensorType_INT32); + paddings_ = AddInput(TensorType_INT32); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND, + BuiltinOptions_SpaceToBatchNDOptions, + CreateSpaceToBatchNDOptions(builder_).Union()); + BuildInterpreter({input_shape, {2}, {2, 2}}); + } +}; + TEST(SpaceToBatchNDOpTest, InvalidShapeTest) { - EXPECT_DEATH(SpaceToBatchNDOpModel({1, 3, 3, 1}, {2, 2}, {0, 0}, {0, 0}), + EXPECT_DEATH(SpaceToBatchNDOpConstModel({1, 3, 3, 1}, {2, 2}, {0, 0, 0, 0}), "Cannot allocate tensors"); } -TEST(SpaceToBatchNDOpTest, SimpleTest) { - SpaceToBatchNDOpModel m({1, 4, 4, 1}, {2, 2}, {0, 0}, {0, 0}); +TEST(SpaceToBatchNDOpTest, SimpleConstTest) { + SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1})); @@ -68,8 +108,19 @@ TEST(SpaceToBatchNDOpTest, SimpleTest) { 13, 15, 6, 8, 14, 16})); } -TEST(SpaceToBatchNDOpTest, MultipleInputBatches) { - SpaceToBatchNDOpModel m({2, 2, 4, 1}, {2, 2}, {0, 0}, {0, 0}); +TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) { + SpaceToBatchNDOpDynamicModel m({1, 4, 4, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + m.SetBlockShape({2, 2}); + m.SetPaddings({0, 0, 0, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7, + 13, 15, 6, 8, 14, 16})); +} + +TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) { + SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1})); @@ -77,8 +128,19 @@ TEST(SpaceToBatchNDOpTest, MultipleInputBatches) { 13, 15, 6, 8, 14, 16})); } -TEST(SpaceToBatchNDOpTest, SimplePadding) { - SpaceToBatchNDOpModel m({1, 5, 2, 1}, {3, 2}, {1, 2}, {0, 0}); +TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) { + SpaceToBatchNDOpDynamicModel m({2, 2, 4, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + m.SetBlockShape({2, 2}); + m.SetPaddings({0, 0, 0, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7, + 13, 15, 6, 8, 14, 16})); +} + +TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) { + SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1})); @@ -88,8 +150,21 @@ TEST(SpaceToBatchNDOpTest, SimplePadding) { })); } -TEST(SpaceToBatchNDOpTest, ComplexPadding) { - SpaceToBatchNDOpModel m({1, 4, 2, 1}, {3, 2}, {1, 2}, {1, 4}); +TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) { + SpaceToBatchNDOpDynamicModel m({1, 5, 2, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + m.SetBlockShape({3, 2}); + m.SetPaddings({1, 0, 2, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 
2, 2, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 0, 0, 0, 5, 0, 0, 0, 6, 0, 1, 0, 7, + 0, 2, 0, 8, 0, 3, 0, 9, 0, 4, 0, 10, + })); +} + +TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) { + SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4}); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1})); @@ -100,6 +175,20 @@ TEST(SpaceToBatchNDOpTest, ComplexPadding) { })); } +TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) { + SpaceToBatchNDOpDynamicModel m({1, 4, 2, 1}); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8}); + m.SetBlockShape({3, 2}); + m.SetPaddings({1, 1, 2, 4}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, + 0, 1, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0, + 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, + })); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc index 91ba4a9b785..c4ffdf79d3a 100644 --- a/tensorflow/contrib/lite/kernels/strided_slice.cc +++ b/tensorflow/contrib/lite/kernels/strided_slice.cc @@ -57,65 +57,6 @@ struct StridedSliceContext { int dims; }; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE_EQ(context, NumInputs(node), 4); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - - StridedSliceContext op_context(context, node); - - // Ensure validity of input tensor and its dimension - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1); - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1); - TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1); - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); - // Only INT32 begin/end/strides are supported - // TODO(soroosh) add support for INT64 - TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32); - TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32); - TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32); - TF_LITE_ENSURE_MSG(context, op_context.dims <= 4, - "StridedSlice op only supports 1D-4D input arrays."); - - // TODO(soroosh): add the following missing functionalities - TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0, - "ellipsis_mask is not implemented yet."); - TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0, - "new_axis_mask is not implemented yet."); - TF_LITE_ENSURE_MSG(context, op_context.params->shrink_axis_mask == 0, - "shrink_axis_mask is not implemented yet."); - - // TODO(soroosh): optimize for constant tensors to do allocation in Prepare - op_context.output->allocation_type = kTfLiteDynamic; - return kTfLiteOk; -} // namespace strided_slice - -// TODO(soroosh): consolidate with BytesRequired in interpreter.h -TfLiteStatus BytesRequired(TfLiteContext* context, TfLiteType type, - const int* dims, int dims_size, size_t* bytes) { - // TODO(aselle): Check for overflow here using overflow.h in TensorFlow - // MultiplyWithoutOverflow. 
- TF_LITE_ENSURE(context, bytes != nullptr); - size_t count = 1; - for (int k = 0; k < dims_size; k++) count *= dims[k]; - switch (type) { - case kTfLiteFloat32: - *bytes = sizeof(float) * count; - break; - case kTfLiteInt32: - *bytes = sizeof(int32_t) * count; - break; - case kTfLiteUInt8: - *bytes = sizeof(uint8_t) * count; - break; - case kTfLiteInt64: - *bytes = sizeof(int64_t) * count; - break; - default: - return kTfLiteError; - } - return kTfLiteOk; -} - // Reverse order of bits in the mask to match the expected order in kernel inline int ReverseMaskBits(int mask, int num_dimensions) { int out = 0; @@ -146,40 +87,111 @@ inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) { std::min(std::max(index, -dim), dim - 1), dim)); } +inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) { + const int dim = op_context->input->dims->data[idx]; + const bool pos_stride = GetTensorData(op_context->strides)[idx] > 0; + return op_context->params->begin_mask & (1 << idx) + ? pos_stride ? 0 : dim - 1 + : ClampedIndex(GetTensorData(op_context->begin)[idx], dim, + pos_stride); +} + +inline int32_t GetEndValueAtIndex(StridedSliceContext* op_context, int idx) { + const int dim = op_context->input->dims->data[idx]; + const bool pos_stride = GetTensorData(op_context->strides)[idx] > 0; + return op_context->params->end_mask & (1 << idx) + ? pos_stride ? dim : -1 + : ClampedIndex(GetTensorData(op_context->end)[idx], dim, + pos_stride); +} + +// Processes the indexing tensors (begin, end and strides) to resize the +// output tensor. This function is callable from both Prepare() and Eval() as +// long as the caller ensures the indexing tensors are present. +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + StridedSliceContext* op_context) { + std::vector output_shape_vector; + + for (int idx = op_context->dims - 1; idx >= 0; --idx) { + int32_t stride = GetTensorData(op_context->strides)[idx]; + TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero"); + + int32_t begin = GetBeginValueAtIndex(op_context, idx); + int32_t end = GetEndValueAtIndex(op_context, idx); + + // This is valid for both positive and negative strides + int32_t dim_shape = ceil((end - begin) / static_cast(stride)); + dim_shape = dim_shape < 0 ? 
0 : dim_shape; + if (!(op_context->params->shrink_axis_mask & (1 << idx))) { + output_shape_vector.push_back(dim_shape); + } + } + + TfLiteIntArray* output_shape = + TfLiteIntArrayCreate(output_shape_vector.size()); + + std::reverse_copy(output_shape_vector.begin(), output_shape_vector.end(), + output_shape->data); + + TF_LITE_ENSURE_STATUS( + context->ResizeTensor(context, op_context->output, output_shape)); + + return kTfLiteOk; +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 4); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + StridedSliceContext op_context(context, node); + + // Ensure validity of input tensor and its dimension + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1); + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1); + TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + // Only INT32 begin/end/strides are supported + // TODO(soroosh) add support for INT64 + TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32); + TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32); + TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32); + TF_LITE_ENSURE_MSG(context, op_context.dims <= 4, + "StridedSlice op only supports 1D-4D input arrays."); + + // TODO(soroosh): add the following missing functionalities + TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0, + "ellipsis_mask is not implemented yet."); + TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0, + "new_axis_mask is not implemented yet."); + + // Postpone allocation of output if any of the indexing tensors is not + // constant + if (!(IsConstantTensor(op_context.begin) && + IsConstantTensor(op_context.end) && + IsConstantTensor(op_context.strides))) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, &op_context); +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { StridedSliceContext op_context(context, node); - std::vector starts; - std::vector stops; - std::vector strides; + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } + + std::vector starts; + std::vector stops; + std::vector strides; - // Determine size of output tensor and map indices - TfLiteIntArray* output_shape = TfLiteIntArrayCreate(op_context.dims); for (int idx = op_context.dims - 1; idx >= 0; --idx) { - int dim = op_context.input->dims->data[idx]; - int32_t stride = GetTensorData(op_context.strides)[idx]; - TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero"); - bool pos_stride = stride > 0; - - int32_t begin = - op_context.params->begin_mask & (1 << idx) - ? pos_stride ? 0 : dim - 1 - : ClampedIndex(GetTensorData(op_context.begin)[idx], dim, - pos_stride); - int32_t end = - op_context.params->end_mask & (1 << idx) - ? pos_stride ? dim : -1 - : ClampedIndex(GetTensorData(op_context.end)[idx], dim, - pos_stride); - - // This is valid for both positive and negative strides - output_shape->data[idx] = ceil((end - begin) / static_cast(stride)); - output_shape->data[idx] = - output_shape->data[idx] < 0 ? 
0 : output_shape->data[idx]; - starts.emplace_back(begin); - stops.emplace_back(end); - strides.emplace_back(stride); + starts.emplace_back(GetBeginValueAtIndex(&op_context, idx)); + stops.emplace_back(GetEndValueAtIndex(&op_context, idx)); + strides.emplace_back(GetTensorData(op_context.strides)[idx]); } for (int i = op_context.dims; i < kMaxDim; i++) { @@ -188,27 +200,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { strides.emplace_back(1); } - TF_LITE_ENSURE_STATUS( - context->ResizeTensor(context, op_context.output, output_shape)); - - size_t required_bytes; - TF_LITE_ENSURE_OK( - context, - BytesRequired(context, op_context.output->type, output_shape->data, - output_shape->size, &required_bytes)); - TfLiteTensorRealloc(required_bytes, op_context.output); - op_context.params->begin_mask = ReverseMaskBits(op_context.params->begin_mask, op_context.dims); op_context.params->end_mask = ReverseMaskBits(op_context.params->end_mask, op_context.dims); + op_context.params->shrink_axis_mask = + ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims); -#define TF_LITE_STRIDED_SLICE(kernel_type, data_type) \ - kernel_type::StridedSlice( \ - GetTensorData(op_context.input), \ - GetTensorDims(op_context.input), op_context.params->begin_mask, \ - op_context.params->end_mask, starts, stops, strides, \ - GetTensorData(op_context.output), \ +#define TF_LITE_STRIDED_SLICE(kernel_type, data_type) \ + kernel_type::StridedSlice( \ + GetTensorData(op_context.input), \ + GetTensorDims(op_context.input), op_context.params->begin_mask, \ + op_context.params->end_mask, op_context.params->shrink_axis_mask, \ + starts, stops, strides, GetTensorData(op_context.output), \ GetTensorDims(op_context.output)) switch (op_context.input->type) { diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc index cd4a364682c..5cac04b3836 100644 --- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc +++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc @@ -21,6 +21,7 @@ limitations under the License. 
namespace tflite { namespace { +using ::int32; using ::testing::ElementsAreArray; class StridedSliceOpModel : public SingleOpModel { @@ -79,8 +80,6 @@ TEST(StridedSliceOpTest, UnssupportedArgs) { "ellipsis_mask is not implemented yet."); EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0), "new_axis_mask is not implemented yet."); - EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 0, 1), - "shrink_axis_mask is not implemented yet."); } TEST(StridedSliceOpTest, In1D) { @@ -213,6 +212,7 @@ TEST(StridedSliceOpTest, In1D_EndMask) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4})); } + TEST(StridedSliceOpTest, In1D_NegStride) { StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0); m.SetInput({1, 2, 3}); @@ -234,6 +234,7 @@ TEST(StridedSliceOpTest, In1D_EvenLenStride2) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); } + TEST(StridedSliceOpTest, In1D_OddLenStride2) { StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0); m.SetInput({1, 2, 3}); @@ -255,6 +256,7 @@ TEST(StridedSliceOpTest, In2D_Identity) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); } + TEST(StridedSliceOpTest, In2D) { StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0); m.SetInput({1, 2, 3, 4, 5, 6}); @@ -320,6 +322,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) { EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 5, 4})); } + TEST(StridedSliceOpTest, In2D_NegStrideEndMask) { StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0); m.SetInput({1, 2, 3, 4, 5, 6}); @@ -354,6 +357,7 @@ TEST(StridedSliceOpTest, In3D_NegStride) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1})); } + TEST(StridedSliceOpTest, In3D_Strided2) { StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); @@ -365,6 +369,159 @@ TEST(StridedSliceOpTest, In3D_Strided2) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5})); } +TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) { + StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4}); + m.SetBegin({1}); + m.SetEnd({3}); + m.SetStrides({1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({2})); +} + +TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) { + StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4}); + m.SetBegin({2}); + m.SetEnd({1}); + m.SetStrides({1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); +} + +TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) { + StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4}); + m.SetBegin({1}); + m.SetEnd({3}); + m.SetStrides({1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); +} + +TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) { + StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4}); + m.SetBegin({-2}); + m.SetEnd({-3}); + m.SetStrides({-1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); +} + +TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) { + 
StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4, 5, 6}); + m.SetBegin({0, 0}); + m.SetEnd({2, 3}); + m.SetStrides({1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3})); +} + +TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) { + StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2); + m.SetInput({1, 2, 3, 4, 5, 6}); + m.SetBegin({0, 0}); + m.SetEnd({2, 3}); + m.SetStrides({1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4})); +} + +TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) { + StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3); + m.SetInput({1, 2, 3, 4, 5, 6}); + m.SetBegin({0, 0}); + m.SetEnd({2, 3}); + m.SetStrides({1, 1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 7, 8})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 1, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 7})); +} + +TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) { + StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7); + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + m.SetBegin({0, 0, 0}); + m.SetEnd({2, 3, 2}); + m.SetStrides({1, 
1, 1}); + m.Invoke(); + EXPECT_TRUE(m.GetOutputShape().empty()); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); +} } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc index 72f705fe424..c69755447d5 100644 --- a/tensorflow/contrib/lite/kernels/svdf.cc +++ b/tensorflow/contrib/lite/kernels/svdf.cc @@ -15,8 +15,8 @@ limitations under the License. #include #include #include -#include #include +#include #include #include diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc index 4de2ceaf053..0f166dc69b9 100644 --- a/tensorflow/contrib/lite/kernels/svdf_test.cc +++ b/tensorflow/contrib/lite/kernels/svdf_test.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite SVDF op. -#include #include +#include #include #include diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc index 3a58e7ec321..6f56aa6bf38 100644 --- a/tensorflow/contrib/lite/kernels/test_util.cc +++ b/tensorflow/contrib/lite/kernels/test_util.cc @@ -172,11 +172,14 @@ void SingleOpModel::BuildInterpreter( auto* model = GetModel(builder_.GetBufferPointer()); - ops::builtin::BuiltinOpResolver builtins; - for (const auto& reg : custom_registrations_) { - builtins.AddCustom(reg.first.data(), reg.second()); + if (!resolver_) { + auto resolver = new ops::builtin::BuiltinOpResolver(); + for (const auto& reg : custom_registrations_) { + resolver->AddCustom(reg.first.data(), reg.second()); + } + resolver_ = std::unique_ptr(resolver); } - InterpreterBuilder(model, builtins)(&interpreter_); + InterpreterBuilder(model, *resolver_)(&interpreter_); CHECK(interpreter_ != nullptr); diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h index cc445299ff9..7d476ba1eaf 100644 --- a/tensorflow/contrib/lite/kernels/test_util.h +++ b/tensorflow/contrib/lite/kernels/test_util.h @@ -85,6 +85,23 @@ struct TensorData { int32_t zero_point; }; +class SingleOpResolver : public OpResolver { + public: + SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration) + : op_(op), registration_(registration) {} + TfLiteRegistration* FindOp(BuiltinOperator op) const override { + if (op == op_) { + return registration_; + } + return nullptr; + } + TfLiteRegistration* FindOp(const char* op) const override { return nullptr; } + + private: + const BuiltinOperator op_; + TfLiteRegistration* registration_; +}; + class SingleOpModel { public: SingleOpModel() {} @@ -178,11 +195,16 @@ class SingleOpModel { return result; } + void SetResolver(std::unique_ptr resolver) { + resolver_ = std::move(resolver); + } + protected: int32_t GetTensorSize(int index) const; flatbuffers::FlatBufferBuilder builder_; std::unique_ptr interpreter_; + std::unique_ptr resolver_; private: int AddTensor(TensorData t, std::initializer_list data); @@ -197,6 +219,36 @@ class SingleOpModel { std::map> custom_registrations_; }; +// Base class for single op unit tests. +// The tests are parameterized to test multiple kernels for a single op. +// The parameters are strings like "optimized" and "reference" to have better +// readability in test reports. +// +// To use this class: +// * Define a constant map from strings to TfLiteRegistration. +// * Implement a test class that inherits SingleOpTest. 
+// * Instantiate the test cases with SingleOpTest::GetKernelTags helper +// function. +// * Call GetRegistration to get the TfLiteRegistration to be used before +// building the interpreter. +class SingleOpTest : public ::testing::TestWithParam { + public: + static std::vector GetKernelTags( + const std::map& kernel_map) { + std::vector tags; + for (auto it : kernel_map) { + tags.push_back(it.first); + } + return tags; + } + + protected: + virtual const std::map& GetKernelMap() = 0; + TfLiteRegistration* GetRegistration() { + return GetKernelMap().at(GetParam()); + } +}; + // Strings have a special implementation that is in test_util.cc template <> std::vector SingleOpModel::ExtractVector(int index); diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/contrib/lite/kernels/transpose.cc index 75d8136b6a2..093814bc449 100644 --- a/tensorflow/contrib/lite/kernels/transpose.cc +++ b/tensorflow/contrib/lite/kernels/transpose.cc @@ -31,60 +31,78 @@ enum KernelType { kReference, }; -// TODO(nupurgarg): Permutation arrays represented as a tensor are ignored. Only -// use the `perm` specified in `params`. struct TransposeContext { TransposeContext(TfLiteContext* context, TfLiteNode* node) { - params = reinterpret_cast(node->builtin_data); input = GetInput(context, node, 0); + perm = GetInput(context, node, 1); output = GetOutput(context, node, 0); } - TfLiteTransposeParams* params; TfLiteTensor* input; + TfLiteTensor* perm; TfLiteTensor* output; }; -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2); - TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); +TfLiteStatus ResizeOutputTensor(TfLiteContext* context, + TransposeContext* op_context) { + int dims = NumDimensions(op_context->input); + const int* perm_data = GetTensorData(op_context->perm); - TransposeContext op_context(context, node); - int dims = NumDimensions(op_context.input); - - // Ensure validity of input tensor and permutation array. - TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); - TF_LITE_ENSURE_EQ(context, dims, op_context.params->num_dimensions); - TF_LITE_ENSURE_MSG(context, dims <= 4, - "Transpose op only supports 1D-4D input arrays."); + // Ensure validity of the permutations tensor as a 1D tensor. + TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->perm), 1); + TF_LITE_ENSURE_EQ(context, op_context->perm->dims->data[0], dims); for (int idx = 0; idx < dims; ++idx) { - TF_LITE_ENSURE_MSG(context, - op_context.params->perm[idx] >= 0 && - op_context.params->perm[idx] < dims, + TF_LITE_ENSURE_MSG(context, (perm_data[idx] >= 0 && perm_data[idx] < dims), "Transpose op permutations array is out of bounds."); } // Determine size of output tensor. 
- const TfLiteIntArray* input_size = op_context.input->dims; - TfLiteIntArray* output_size = TfLiteIntArrayCreate(dims); + TfLiteIntArray* input_size = op_context->input->dims; + TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size); for (int idx = 0; idx < dims; ++idx) { - output_size->data[idx] = input_size->data[op_context.params->perm[idx]]; + output_size->data[idx] = input_size->data[perm_data[idx]]; } - return context->ResizeTensor(context, op_context.output, output_size); + return context->ResizeTensor(context, op_context->output, output_size); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + TransposeContext op_context(context, node); + + // Ensure validity of input tensor. + TF_LITE_ENSURE_MSG(context, NumDimensions(op_context.input) <= 4, + "Transpose op only supports 1D-4D input arrays."); + TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type); + + if (!IsConstantTensor(op_context.perm)) { + SetTensorToDynamic(op_context.output); + return kTfLiteOk; + } + return ResizeOutputTensor(context, &op_context); } template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TransposeContext op_context(context, node); + // Resize the output tensor if the output tensor is dynamic. + if (IsDynamicTensor(op_context.output)) { + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + TfLiteTensorRealloc(op_context.output->bytes, op_context.output); + } + // Reverse the permuted axes and convert to 4D due to the way Dims are // constructed in GetTensorDims. + const int* perm_data = GetTensorData(op_context.perm); + const int size = op_context.perm->dims->data[0]; const int kOutputDimensionNum = 4; int reversed_perm[kOutputDimensionNum]; - int size = op_context.params->num_dimensions; + for (int output_k = 0, input_k = size - 1; output_k < size; ++output_k, --input_k) { - reversed_perm[output_k] = size - op_context.params->perm[input_k] - 1; + reversed_perm[output_k] = size - perm_data[input_k] - 1; } for (int k = size; k < kOutputDimensionNum; ++k) { reversed_perm[k] = k; diff --git a/tensorflow/contrib/lite/kernels/transpose_test.cc b/tensorflow/contrib/lite/kernels/transpose_test.cc index 7f5832cd5fa..337bc144b96 100644 --- a/tensorflow/contrib/lite/kernels/transpose_test.cc +++ b/tensorflow/contrib/lite/kernels/transpose_test.cc @@ -127,59 +127,110 @@ TEST(TransposeTest, TestRefOps4D) { class TransposeOpModel : public SingleOpModel { public: - TransposeOpModel(std::initializer_list input_shape, - std::initializer_list perm) { - input_ = AddInput(TensorType_FLOAT32); - output_ = AddOutput(TensorType_FLOAT32); - SetBuiltinOp( - BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions, - CreateTransposeOptions(builder_, builder_.CreateVector(perm)) - .Union()); - BuildInterpreter({input_shape}); - } - void SetInput(std::initializer_list data) { PopulateTensor(input_, data); } + void SetPerm(std::initializer_list data) { + PopulateTensor(perm_, data); + } + std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } - private: + protected: int input_; + int perm_; int output_; }; +// Tests case where perm is a const tensor. 
+// +// Example usage is as follows: +// SpaceToBatchNDOpConstModel m(input_shape, perm_shape, perm_data); +// m.SetInput(input_data); +// m.Invoke(); +class TransposeOpConstModel : public TransposeOpModel { + public: + TransposeOpConstModel(std::initializer_list input_shape, + std::initializer_list perm_shape, + std::initializer_list perm) { + input_ = AddInput(TensorType_FLOAT32); + perm_ = AddConstInput(TensorType_INT32, perm, perm_shape); + output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions, + CreateTransposeOptions(builder_).Union()); + BuildInterpreter({input_shape}); + } +}; + +// Tests case where perm is a non-const tensor. +// +// Example usage is as follows: +// TransposeOpDynamicModel m(input_shape, perm_shape); +// m.SetInput(input_data); +// m.SetPerm(perm_data); +// m.Invoke(); +class TransposeOpDynamicModel : public TransposeOpModel { + public: + TransposeOpDynamicModel(std::initializer_list input_shape, + std::initializer_list perm_shape) { + input_ = AddInput(TensorType_FLOAT32); + perm_ = AddInput(TensorType_INT32); + output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions, + CreateTransposeOptions(builder_).Union()); + BuildInterpreter({input_shape, perm_shape}); + } +}; + TEST(TransposeTest, TestUnequalPermSize) { - EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {2, 2}), - "dims != op_context.params->num_dimensions"); + EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {2}, {2, 2}), "2 != 4"); } TEST(TransposeTest, TestPermOutOfBounds) { - EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {0, -1, -2, -3}), + EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, -1, -2, -3}), "Transpose op permutations array is out of bounds."); - EXPECT_DEATH(TransposeOpModel({1, 3, 3, 1}, {0, 1, 2, 4}), + EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, 1, 2, 4}), "Transpose op permutations array is out of bounds."); } -TEST(TransposeTest, Test1DInputTensor) { - TransposeOpModel m({3}, {0}); +TEST(TransposeTest, Test1DInputConstTensor) { + TransposeOpConstModel m({3}, {1}, {0}); m.SetInput({1, 2, 3}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3})); } -TEST(TransposeTest, Test2DInputTensor) { - TransposeOpModel m({3, 2}, {1, 0}); +TEST(TransposeTest, Test1DInputDynamicTensor) { + TransposeOpDynamicModel m({3}, {1}); + m.SetInput({1, 2, 3}); + m.SetPerm({0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3})); +} + +TEST(TransposeTest, Test2DInputConstTensor) { + TransposeOpConstModel m({3, 2}, {2}, {1, 0}); m.SetInput({0, 1, 2, 3, 4, 5}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5})); } -TEST(TransposeTest, Test3DInputTensor) { - TransposeOpModel m({2, 3, 4}, {2, 0, 1}); +TEST(TransposeTest, Test2DInputDynamicTensor) { + TransposeOpDynamicModel m({3, 2}, {2}); + m.SetInput({0, 1, 2, 3, 4, 5}); + m.SetPerm({1, 0}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5})); +} + +TEST(TransposeTest, Test3DInputConstTensor) { + TransposeOpConstModel m({2, 3, 4}, {3}, {2, 0, 1}); m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); m.Invoke(); @@ -189,29 +240,50 @@ TEST(TransposeTest, 
Test3DInputTensor) { 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23})); } +TEST(TransposeTest, Test3DInputDynamicTensor) { + TransposeOpDynamicModel m({2, 3, 4}, {3}); + m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}); + m.SetPerm({2, 0, 1}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3})); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({0, 4, 8, 12, 16, 20, 1, 5, 9, 13, 17, 21, + 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23})); +} + TEST(TransposeTest, Test5DInputTensor) { - EXPECT_DEATH(TransposeOpModel({1, 2, 3, 4, 5}, {0, 1, 2, 3, 4}), + EXPECT_DEATH(TransposeOpConstModel({1, 2, 3, 4, 5}, {5}, {0, 1, 2, 3, 4}), "Transpose op only supports 1D-4D input arrays."); } -TEST(TransposeTest, SimpleTestNoReorder) { - TransposeOpModel m({1, 2, 3, 1}, {0, 1, 2, 3}); +TEST(TransposeTest, SimpleTestNoReorderConstTensor) { + TransposeOpConstModel m({1, 2, 3, 1}, {4}, {0, 1, 2, 3}); m.SetInput({1, 2, 3, 4, 5, 6}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); } -TEST(TransposeTest, SimpleTestWithReorder) { - TransposeOpModel m({1, 2, 3, 1}, {2, 1, 3, 0}); +TEST(TransposeTest, SimpleTestNoReorderDynamicTensor) { + TransposeOpDynamicModel m({1, 2, 3, 1}, {4}); + m.SetInput({1, 2, 3, 4, 5, 6}); + m.SetPerm({0, 1, 2, 3}); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(TransposeTest, SimpleTestWithReorderConstTensor) { + TransposeOpConstModel m({1, 2, 3, 1}, {4}, {2, 1, 3, 0}); m.SetInput({1, 2, 3, 4, 5, 6}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2, 1, 1})); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6})); } -TEST(TransposeTest, ComplexTestWithReorder) { - TransposeOpModel m({2, 3, 4, 5}, {2, 0, 1, 3}); +TEST(TransposeTest, ComplexTestWithReorderConstTensor) { + TransposeOpConstModel m({2, 3, 4, 5}, {4}, {2, 0, 1, 3}); m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, @@ -237,6 +309,34 @@ TEST(TransposeTest, ComplexTestWithReorder) { EXPECT_THAT(m.GetOutput(), result); } +TEST(TransposeTest, ComplexTestWithReorderDynamicTensor) { + TransposeOpDynamicModel m({2, 3, 4, 5}, {4}); + m.SetInput({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119}); + m.SetPerm({2, 0, 1, 3}); + m.Invoke(); + + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3, 5})); + auto result = ElementsAreArray( + {0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 40, 41, 42, 43, 44, + 60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104, + 5, 6, 7, 8, 9, 25, 26, 27, 28, 29, 45, 46, 47, 48, 49, + 65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109, + 10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50, 51, 52, 53, 54, + 70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114, + 15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55, 56, 57, 58, 59, + 75, 76, 77, 78, 79, 
95, 96, 97, 98, 99, 115, 116, 117, 118, 119}); + EXPECT_THAT(m.GetOutput(), result); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc index f5f1ec2cf3f..ac00c37b67d 100644 --- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc +++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc @@ -15,14 +15,15 @@ limitations under the License. #include #include #include -#include #include +#include #include #include #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/kernels/activation_functor.h" +#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" namespace tflite { @@ -82,48 +83,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output_size_array->data[0] = (time_major) ? max_time : batch_size; output_size_array->data[1] = (time_major) ? batch_size : max_time; output_size_array->data[2] = num_units; - TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output, - output_size_array)); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output, output_size_array)); return kTfLiteOk; } -namespace { -void RnnStep(const float* input_ptr_batch, const float* input_weights_ptr, - const float* recurrent_weights_ptr, const float* bias_ptr, - int input_size, int num_units, int input_weights_stride, - int recurrent_weights_stride, TfLiteFusedActivation activation, - float* hidden_state_ptr_batch, float* output_ptr_batch) { - // Output = bias - for (int o = 0; o < num_units; o++) { - output_ptr_batch[o] = bias_ptr[o]; - } - - // Output += input * input_weights - for (int o = 0; o < num_units; o++) { - for (int i = 0; i < input_size; i++) { - output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i]; - } - input_weights_ptr += input_weights_stride; - } - - // Output += recurrent_weights * hidden_state - for (int o = 0; o < num_units; o++) { - for (int h = 0; h < num_units; h++) { - output_ptr_batch[o] += - hidden_state_ptr_batch[h] * recurrent_weights_ptr[h]; - } - recurrent_weights_ptr += recurrent_weights_stride; - } - - // Output = activation(Output) and update hidden_state - for (int o = 0; o < num_units; o++) { - output_ptr_batch[o] = (ActivationFunctor(activation))(output_ptr_batch[o]); - hidden_state_ptr_batch[o] = output_ptr_batch[o]; - } -} -} // namespace - TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); @@ -147,30 +112,25 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { (time_major) ? input->dims->data[0] : input->dims->data[1]; const int num_units = input_weights->dims->data[0]; const int input_size = input->dims->data[2]; - const int input_weights_stride = input_weights->dims->data[1]; - const int recurrent_weights_stride = recurrent_weights->dims->data[1]; // Initialize input_weights and recurrent_weights. const float* input_weights_ptr = input_weights->data.f; const float* recurrent_weights_ptr = recurrent_weights->data.f; if (time_major) { - // Unroll the sequence + // Initialize the pointer to hidden state. + float* hidden_state_ptr_batch = hidden_state->data.f; + // Unroll the sequence and use batch batch operations for efficiency. for (int s = 0; s < max_time; s++) { - for (int b = 0; b < batch_size; b++) { - // Initialize the pointer to hidden state. 
- float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units; - // Initialize the pointer to input and output. - const float* input_ptr_batch = - input->data.f + s * input_size * batch_size + b * input_size; - float* output_ptr_batch = - output->data.f + s * num_units * batch_size + b * num_units; + // Initialize the pointer to input and output. + const float* input_ptr_batch = + input->data.f + s * input_size * batch_size; + float* output_ptr_batch = output->data.f + s * num_units * batch_size; - RnnStep(input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, - bias_ptr, input_size, num_units, input_weights_stride, - recurrent_weights_stride, params->activation, - hidden_state_ptr_batch, output_ptr_batch); - } + kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr, + recurrent_weights_ptr, bias_ptr, input_size, + num_units, batch_size, params->activation, + hidden_state_ptr_batch, output_ptr_batch); } } else { // For each batch @@ -184,10 +144,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { float* output_ptr_batch = output->data.f + b * num_units * max_time + s * num_units; - RnnStep(input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, - bias_ptr, input_size, num_units, input_weights_stride, - recurrent_weights_stride, params->activation, - hidden_state_ptr_batch, output_ptr_batch); + kernel_utils::RnnBatchStep( + input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, bias_ptr, + input_size, num_units, /*batch_size=*/1, params->activation, + hidden_state_ptr_batch, output_ptr_batch); } } } diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc index 82c680ec3d8..7e32969763b 100644 --- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc +++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc @@ -14,8 +14,8 @@ limitations under the License. ==============================================================================*/ // Unit test for TFLite Sequential RNN op. -#include #include +#include #include #include @@ -120,8 +120,7 @@ static float rnn_golden_output[] = { 0.415153, 0.210318, 0, 0, 0, 0, 0, 2.02616, 0, 0.728256, 0.84183, 0.0907453, - 0.628881, 3.58099, 1.49974, 0 -}; + 0.628881, 3.58099, 1.49974, 0}; class UnidirectionalRNNOpModel : public SingleOpModel { public: diff --git a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh new file mode 100755 index 00000000000..b58ae266017 --- /dev/null +++ b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh @@ -0,0 +1,81 @@ +#!/bin/bash -x +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e + +echo "Starting" +TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.." 
+ +TMP_DIR=$(mktemp -d) +echo "Package dir: " $TMP_DIR +FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks +FW_DIR_TFLITE=$FW_DIR/tensorflow_lite.framework +FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers + +echo "Creating target Headers directories" +mkdir -p $FW_DIR_TFLITE_HDRS + +echo "Headers, populating: TensorFlow Lite" +cd $TFLITE_DIR/../../.. + +find tensorflow/contrib/lite -name '*.h' \ + -not -path 'tensorflow/contrib/lite/downloads/*' \ + -not -path 'tensorflow/contrib/lite/examples/*' \ + -not -path 'tensorflow/contrib/lite/gen/*' \ + -not -path 'tensorflow/contrib/lite/toco/*' \ + -not -path 'tensorflow/contrib/lite/nnapi/*' \ + -not -path 'tensorflow/contrib/lite/java/*' \ + | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T - +cd $FW_DIR_TFLITE_HDRS +tar xf tmp.tar +rm -f tmp.tar + +echo "Headers, populating: Flatbuffer" +cd $TFLITE_DIR/downloads/flatbuffers/include/ +find . -name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T - +cd $FW_DIR_TFLITE_HDRS +tar xf tmp.tar +rm -f tmp.tar + +cd $TFLITE_DIR/../../.. +echo "Generate master LICENSE file and copy to target" +bazel build //tensorflow/tools/lib_package:clicenses_generate +cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \ + $FW_DIR_TFLITE + +echo "Copying static libraries" +cp $TFLITE_DIR/gen/lib/libtensorflow-lite.a \ + $FW_DIR_TFLITE/tensorflow_lite + +# This is required, otherwise they interfere with the documentation of the +# pod at cocoapods.org. +echo "Remove all README files" +cd $FW_DIR_TFLITE_HDRS +find . -type f -name README\* -exec rm -f {} \; +find . -type f -name readme\* -exec rm -f {} \; + +TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks" +echo "Moving results to target: " $TARGET_GEN_LOCATION +cd $FW_DIR +zip -q -r tensorflow_lite.framework.zip tensorflow_lite.framework -x .DS_Store +rm -rf $TARGET_GEN_LOCATION +mkdir -p $TARGET_GEN_LOCATION +cp -r tensorflow_lite.framework.zip $TARGET_GEN_LOCATION + +echo "Cleaning up" +rm -rf $TMP_DIR + +echo "Finished" diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc index 415d984ad8c..b36bfcef843 100644 --- a/tensorflow/contrib/lite/model.cc +++ b/tensorflow/contrib/lite/model.cc @@ -328,6 +328,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type, builtin_data = reinterpret_cast(params); break; } + case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: { TfLiteSequenceRNNParams* params = MallocPOD(); if (auto* sequence_rnn_params = @@ -515,62 +516,18 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type, break; } case BuiltinOperator_SPACE_TO_BATCH_ND: { - auto* params = MallocPOD(); - if (auto* schema_params = - op->builtin_options_as_SpaceToBatchNDOptions()) { - const auto& block_shape = schema_params->block_shape(); - FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape, - params->block_shape, error_reporter); - const auto& before_paddings = schema_params->before_paddings(); - FlatBufferIntVectorToArray(sizeof(params->before_paddings), - before_paddings, params->before_paddings, - error_reporter); - const auto& after_paddings = schema_params->after_paddings(); - FlatBufferIntVectorToArray(sizeof(params->after_paddings), - after_paddings, params->after_paddings, - error_reporter); - params->num_spatial_dimensions = block_shape->Length(); - } - builtin_data = reinterpret_cast(params); break; } case BuiltinOperator_BATCH_TO_SPACE_ND: { - auto* params = MallocPOD(); - if (auto* schema_params = - 
op->builtin_options_as_BatchToSpaceNDOptions()) { - const auto& block_shape = schema_params->block_shape(); - FlatBufferIntVectorToArray(sizeof(params->block_shape), block_shape, - params->block_shape, error_reporter); - const auto& before_crops = schema_params->before_crops(); - FlatBufferIntVectorToArray(sizeof(params->before_crops), before_crops, - params->before_crops, error_reporter); - const auto& after_crops = schema_params->after_crops(); - FlatBufferIntVectorToArray(sizeof(params->after_crops), after_crops, - params->after_crops, error_reporter); - params->num_spatial_dimensions = block_shape->Length(); - } - builtin_data = reinterpret_cast(params); break; } case BuiltinOperator_TRANSPOSE: { - auto* params = MallocPOD(); - if (auto* schema_params = op->builtin_options_as_TransposeOptions()) { - const auto& perm = schema_params->perm(); - FlatBufferIntVectorToArray(sizeof(params->perm), perm, params->perm, - error_reporter); - params->num_dimensions = perm->Length(); - } - builtin_data = reinterpret_cast(params); break; } case BuiltinOperator_MEAN: { auto* params = MallocPOD(); if (auto* schema_params = op->builtin_options_as_MeanOptions()) { - const auto& axis = schema_params->axis(); - FlatBufferIntVectorToArray(sizeof(params->axis), axis, params->axis, - error_reporter); params->keep_dims = schema_params->keep_dims(); - params->num_axis_dimensions = axis->Length(); } builtin_data = reinterpret_cast(params); break; diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h index 7019c29959f..76032771af2 100644 --- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h +++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h @@ -1571,7 +1571,7 @@ inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model, } /** - * Specfifies which operands will be the model's inputs and outputs. + * Specifies which operands will be the model's inputs and outputs. * * An operand cannot be used for both input and output. Doing so will * return an error. 
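
Note: the following standalone C++ sketch is not part of the patch; it restates the output-shape arithmetic that the new `ResizeOutputTensor` in `space_to_batch_nd.cc` (hunks above) performs once `block_shape` and `paddings` arrive as input tensors rather than builtin params. The helper name and the use of `std::vector` in place of `TfLiteIntArray` are illustrative only.

```cpp
// Each spatial dimension is padded, then divided by the block size, and the
// batch dimension grows by the block volume; channels are unchanged. This
// mirrors the kernel's arithmetic but is a sketch, not the TF Lite API.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> SpaceToBatchNDOutputShape(
    const std::vector<int32_t>& input_shape,  // NHWC, size 4
    const std::vector<int32_t>& block_shape,  // size 2
    const std::vector<int32_t>& paddings) {   // {h_before, h_after, w_before, w_after}
  assert(input_shape.size() == 4 && block_shape.size() == 2 && paddings.size() == 4);

  std::vector<int32_t> output_shape = input_shape;
  int32_t batch = input_shape[0];
  for (int dim = 0; dim < 2; ++dim) {
    // The padded spatial extent must divide evenly by the block size, which
    // the kernel enforces with TF_LITE_ENSURE_EQ(final_dim_size % block, 0).
    const int32_t final_dim_size =
        input_shape[dim + 1] + paddings[dim * 2] + paddings[dim * 2 + 1];
    assert(final_dim_size % block_shape[dim] == 0);
    output_shape[dim + 1] = final_dim_size / block_shape[dim];
    batch *= block_shape[dim];
  }
  output_shape[0] = batch;
  return output_shape;
}

int main() {
  // Matches SimplePaddingConstTest: input {1,5,2,1}, block {3,2},
  // paddings {1,0,2,0} -> output {6,2,2,1}.
  for (int32_t d : SpaceToBatchNDOutputShape({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0}))
    std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```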
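Similarly, a standalone sketch (again not part of the patch) of the per-dimension rule applied by the new `ResizeOutputTensor` in `strided_slice.cc`, including the `shrink_axis_mask` handling exercised by the added tests. For brevity it assumes non-negative begin/end indices and positive strides; the kernel additionally honors `begin_mask`/`end_mask` and remaps negative indices via `ClampedIndex`.

```cpp
// dim_shape = ceil((end - begin) / stride), floored at 0; a set bit in
// shrink_axis_mask drops that dimension from the output shape entirely.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> StridedSliceOutputShape(const std::vector<int32_t>& input_shape,
                                             const std::vector<int32_t>& begin,
                                             const std::vector<int32_t>& end,
                                             const std::vector<int32_t>& strides,
                                             int shrink_axis_mask) {
  std::vector<int32_t> output_shape;
  for (size_t idx = 0; idx < input_shape.size(); ++idx) {
    int32_t dim_shape = static_cast<int32_t>(
        std::ceil((end[idx] - begin[idx]) / static_cast<float>(strides[idx])));
    if (dim_shape < 0) dim_shape = 0;
    if (!(shrink_axis_mask & (1 << idx))) {
      output_shape.push_back(dim_shape);
    }
  }
  return output_shape;
}

int main() {
  // Mirrors In2D_ShrinkAxisMask2: a {2, 3} input with begin {0, 0}, end {2, 3},
  // strides {1, 1} and shrink_axis_mask = 2 keeps only the first dimension,
  // giving output shape {2} (values {1, 4} in the test).
  for (int32_t d : StridedSliceOutputShape({2, 3}, {0, 0}, {2, 3}, {1, 1}, 2))
    std::cout << d << " ";
  std::cout << "\n";
  return 0;
}
```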
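Finally, a small sketch of the axis reversal that `transpose.cc`'s `Eval` now derives from the `perm` input tensor before calling the reference/optimized kernels. The surrounding `Dims<4>` plumbing is omitted, so this shows only the index arithmetic; the function name is illustrative.

```cpp
// reversed_perm[output_k] = size - perm[input_k] - 1, walking perm from its
// last element to its first, then identity for the axes padded out to 4D.
#include <array>
#include <iostream>
#include <vector>

std::array<int, 4> ReversedPerm(const std::vector<int>& perm) {
  const int size = static_cast<int>(perm.size());
  std::array<int, 4> reversed_perm{};
  for (int output_k = 0, input_k = size - 1; output_k < size;
       ++output_k, --input_k) {
    reversed_perm[output_k] = size - perm[input_k] - 1;
  }
  for (int k = size; k < 4; ++k) {
    reversed_perm[k] = k;  // trailing padded axes stay in place
  }
  return reversed_perm;
}

int main() {
  // For the 3D test permutation {2, 0, 1} this yields {1, 2, 0, 3}.
  for (int v : ReversedPerm({2, 0, 1})) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}
```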
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc index d5b9319407a..da9ceec2f14 100644 --- a/tensorflow/contrib/lite/nnapi_delegate.cc +++ b/tensorflow/contrib/lite/nnapi_delegate.cc @@ -319,6 +319,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, case tflite::BuiltinOperator_SVDF: case tflite::BuiltinOperator_HASHTABLE_LOOKUP: case tflite::BuiltinOperator_RNN: + case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: case tflite::BuiltinOperator_EMBEDDING_LOOKUP: case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD index 3d6a3ec0fd4..2d8c49b7d7a 100644 --- a/tensorflow/contrib/lite/python/BUILD +++ b/tensorflow/contrib/lite/python/BUILD @@ -13,6 +13,7 @@ py_library( srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ + ":op_hint", "//tensorflow/contrib/lite/toco:model_flags_proto_py", "//tensorflow/contrib/lite/toco:toco_flags_proto_py", "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco", @@ -20,6 +21,17 @@ py_library( ], ) +py_library( + name = "op_hint", + srcs = ["op_hint.py"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/contrib/framework:framework_py", + "//tensorflow/python:platform", + ], +) + py_test( name = "lite_test", srcs = ["lite_test.py"], @@ -27,6 +39,7 @@ py_test( tags = ["no_oss"], deps = [ ":lite", + ":op_hint", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", "//tensorflow/python:dtypes", diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index 3c369774bed..5d2f2165376 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -18,16 +18,21 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice. 
@@toco_convert @@toco_convert_protos +@@OpHint +@@convert_op_hints_to_stubs """ from __future__ import absolute_import from __future__ import division from __future__ import print_function - import os import subprocess import tempfile +# pylint: disable=unused-import +from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs +from tensorflow.contrib.lite.python.op_hint import OpHint +# pylint: enable=unused-import from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2 from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2 from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2 diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py index 7d55f3fe6fe..b8b4510188b 100644 --- a/tensorflow/contrib/lite/python/lite_test.py +++ b/tensorflow/contrib/lite/python/lite_test.py @@ -18,10 +18,14 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.lite.python import lite +from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base from tensorflow.python.client import session from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util +from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes +from tensorflow.python.framework.graph_util_impl import _extract_graph_summary from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -35,7 +39,8 @@ class LiteTest(test_util.TensorFlowTestCase): # Try running on valid graph result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor]) self.assertTrue(result) - # TODO(aselle): remove tests that fail. + # TODO(aselle): remove tests that fail (we must get TOCO to not fatal + # all the time). # Try running on identity graph (known fail) # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"): # result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor]) @@ -51,5 +56,116 @@ class LiteTest(test_util.TensorFlowTestCase): quantized_input_stats=[(0., 1.)]) self.assertTrue(result) + +class LiteTestOpHint(test_util.TensorFlowTestCase): + """Test the hint to stub functionality.""" + + def _getGraphOpTypes(self, graphdef, output_nodes): + """Returns used op types in `graphdef` reachable from `output_nodes`. + + This is used to check that after the stub transformation the expected + nodes are there. Typically use this with self.assertCountEqual(...). + + NOTE: this is not a exact test that the graph is the correct output, but + it balances compact expressibility of test with sanity checking. + + Args: + graphdef: TensorFlow proto graphdef. + output_nodes: A list of output node names that we need to reach. + + Returns: + A set of node types reachable from `output_nodes`. + """ + name_to_input_name, name_to_node, _ = ( + _extract_graph_summary(graphdef)) + # Find all nodes that are needed by the outputs + used_node_names = _bfs_for_reachable_nodes(output_nodes, name_to_input_name) + return set([name_to_node[node_name].op for node_name in used_node_names]) + + def _countIdentities(self, nodes): + """Count the number of "Identity" op types in the list of proto nodes. + + Args: + nodes: NodeDefs of the graph. + + Returns: + The number of nodes with op type "Identity" found. 
+ """ + return len([x for x in nodes if x.op == "Identity"]) + + def testSwishLiteHint(self): + """Makes a custom op swish and makes sure it gets converted as a unit.""" + image = array_ops.constant([1., 2., 3., 4.]) + swish_scale = array_ops.constant(1.0) + + def _swish(input_tensor, scale): + custom = lite.OpHint("cool_activation") + input_tensor, scale = custom.add_inputs(input_tensor, scale) + output = math_ops.sigmoid(input_tensor) * input_tensor * scale + output, = custom.add_outputs(output) + return output + output = array_ops.identity(_swish(image, swish_scale), name="ModelOutput") + + with self.test_session() as sess: + # check if identities have been put into the graph (2 input, 1 output, + # and 1 final output). + self.assertEqual(self._countIdentities(sess.graph_def.node), 4) + + stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + + self.assertCountEqual( + self._getGraphOpTypes( + stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + ["cool_activation", "Const", "Identity"]) + + def testScaleAndBiasAndIdentity(self): + """This tests a scaled add which has 3 inputs and 2 outputs.""" + a = array_ops.constant(1.) + x = array_ops.constant([2., 3.]) + b = array_ops.constant([4., 5.]) + + def _scaled_and_bias_and_identity(a, x, b): + custom = lite.OpHint("scale_and_bias_and_identity") + a, x, b = custom.add_inputs(a, x, b) + return custom.add_outputs(a * x + b, x) + output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b), + name="ModelOutput") + + with self.test_session() as sess: + # make sure one identity for each input (3) and output (2) => 3 + 2 = 5 + # +1 for the final output + self.assertEqual(self._countIdentities(sess.graph_def.node), 6) + + stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + + self.assertCountEqual( + self._getGraphOpTypes( + stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + ["scale_and_bias_and_identity", "Const", "Identity", "Pack"]) + + def testTwoFunctions(self): + """Tests if two functions are converted correctly.""" + a = array_ops.constant([1.]) + b = array_ops.constant([1.]) + def _double_values(x): + custom = lite.OpHint("add_test") + x = custom.add_inputs(x) + output = math_ops.multiply(x, x) + output, = custom.add_outputs(output) + return output + output = array_ops.identity( + math_ops.add(_double_values(a), _double_values(b)), name="ModelOutput") + + with self.test_session() as sess: + # make sure one identity for each input (2) and output (2) => 2 + 2 + # +1 for the final output + self.assertEqual(self._countIdentities(sess.graph_def.node), 5) + stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + self.assertCountEqual( + self._getGraphOpTypes( + stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + ["add_test", "Const", "Identity", "Add"]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/contrib/lite/python/op_hint.py new file mode 100644 index 00000000000..7c587e38b16 --- /dev/null +++ b/tensorflow/contrib/lite/python/op_hint.py @@ -0,0 +1,291 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Define tflite op hints (intrinsic operations). + +This essentially allows defining a TensorFlow API for tflite operations in +Python with hints on how they are represented in TensorFlow Lite. This basically +is a form of tflite intrinsic. It wraps a subpart of a TensorFlow execution +graph and is useful for LSTMs and other complicated TensorFlow constructions +that are difficult to pattern match in TOCO, but are represented by a single +accelerated tflite op. + +Example: + def tflite_cool_activation(input): + # A cool activation function. + custom = tf.contrib.lite.OpHint("cool_activation") + input = custom.add_inputs(input) + output = tf.sigmoid(input) * input + custom.add_outputs(output) + return output + + image = tf.placeholder(tf.float32, (1, 16, 16, 1)) + output = tf.identity(tflite_cool_activation(image)) + + session = tf.Session() + + graphdef_to_convert = tf.contrib.lite.convert_op_hints_to_stubs(session) + tflite_graph = tf.contrib.lite.toco_convert(graphdef_to_convert, + [image], [output]) + [image], [output]) + with open("/tmp/graph.fb", "wb") as fp: + fp.write(tflite_graph) + +How does it work?: + +OpHint is a helper that you use when defining a vanilla python function. +It allows you to wrap arguments with tf.identities with some custom attributes. +These attributes allow you to find the original block of ops that was created. +For example, if you use cool_activation above you essentially get: + +a_input = tf.identity() +result = tf.multiply(tf.sigmoid(a_input), a_input) +output = tf.identity() + +a_input, output are identities that have parameters representing +what argument they are, what the name of the function they should turn into +in tf lite as well as a guid that uniquely identifies a particular invocation. + +Once you have built your whole tensorflow graph, you can run it and train it +as usual, but after you have done that, you need to convert the graph into +a form that replaces these subgraphs wrapped in identities to stub ops. These +ops don't actually exist in the normal TensorFlow runtime, but will be +understood by toco later. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections as _collections +import itertools as _itertools +import uuid as _uuid + +from tensorflow.contrib import framework as _framework +from tensorflow.python.framework import ops as _ops +from tensorflow.python.ops import array_ops as _array_ops +from tensorflow.python.util.all_util import remove_undocumented + + +class OpHint(object): + """A class that helps build tflite function invocations. + + It allows you to take a bunch of TensorFlow ops and annotate the construction + such that toco knows how to convert it to tflite. This embeds a pseudo + function in a TensorFlow graph. This allows embedding high-level API usage + information in a lower level TensorFlow implementation so that an alternative + implementation can be substituted later. 
+ + Essentially, any "input" into this pseudo op is fed into an identity, and + attributes are added to that input before being used by the constituent ops + that make up the pseudo op. A similar process is done to any output that + is to be exported from the current op. + + TODO(aselle): When TensorFlow functions functionality works for arbitrary + constructs, this mechanism can be retired and changed to use python defun's. + """ + + # Attr constants that are used for representation in the GraphDef + FUNCTION_NAME_ATTR = "_tflite_function_name" + FUNCTION_UUID_ATTR = "_tflite_function_uuid" + FUNCTION_INPUT_INDEX_ATTR = "_tflite_function_input_index" + FUNCTION_OUTPUT_INDEX_ATTR = "_tflite_function_output_index" + + def __init__(self, function_name, **kwargs): + """Create a OpHint. + + Args: + function_name: Name of the function (the custom op name in tflite) + **kwargs: Keyword arguments of any constant attributes for the function. + """ + self._function_name = function_name + self._unique_function_id = _uuid.uuid1().hex # TODO(aselle): Unique enough? + self._curr_input_index = 0 + self._curr_output_index = 0 + self._attrs_to_store_later = kwargs + self._stored_attrs = False + + def _setattr(self, dest_op, name, value): + tensor_value = _ops.convert_to_tensor(value) + dest_op.op.node_def.attr[name].tensor.CopyFrom( + tensor_value.op.node_def.attr["value"].tensor) + + def add_inputs(self, *args): + """Add a sequence of inputs to the function invocation. + + Args: + *args: List of inputs to be converted (should be Tf.Tensor). + Returns: + Wrapped inputs (identity standins that have additional metadata). These + are also are also tf.Tensor's. + """ + + def augmented_identity(arg): + identity_op = _array_ops.identity(arg) + attr = identity_op.op.node_def.attr + attr[OpHint.FUNCTION_NAME_ATTR].s = self._function_name + attr[OpHint.FUNCTION_UUID_ATTR].s = self._unique_function_id + attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i = self._curr_input_index + self._curr_input_index += 1 + return identity_op + + return [augmented_identity(arg) for arg in args] + + def add_outputs(self, *args): + """Add a sequence of outputs to the function invocation. + + Args: + *args: List of outputs to be converted (should be tf.Tensor). + Returns: + Wrapped outputs (identity standins that have additional metadata). These + are also tf.Tensor's. + """ + + def augmented_identity(arg): + identity_op = _array_ops.identity(arg) + attr = identity_op.op.node_def.attr + attr[OpHint.FUNCTION_NAME_ATTR].s = self._function_name + attr[OpHint.FUNCTION_UUID_ATTR].s = self._unique_function_id + attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i = self._curr_output_index + self._curr_output_index += 1 + return identity_op + + wrapped_outputs = [augmented_identity(arg) for arg in args] + + if not self._stored_attrs: + for key, value in self._attrs_to_store_later.iteritems(): + self._setattr(wrapped_outputs[0], "_tflite_attr_" + key, value) + self._stored_attrs = True + + return wrapped_outputs + + +class _LiteFuncCall(object): + """Represent a TensorFlow Lite custom function. + + This is uses to accumulate found hints in the graphdef into a single + conceptual unit. + + Properties: + self.inputs: inputs to the op (hash from index # to argument) + self.outputs: outputs to the op (hash from index # to argument) + self.function_name: the tflite custom op name to use + self.uuid: a unique call id for this particular call (i.e. + multiple function calls would have the same function_name but different + uuids. 
+ self.params: A param name to key value for op constant data. I.e. for + axis on a reduction, strides on a convolution, etc. + """ + + def __init__(self): + self.inputs = {} + self.outputs = {} + self.function_name = None + self.uuid = None + self.params = {} + + def __str__(self): + return "tflite function %s call %s\n\tinputs: %r\n\toutputs: %r" % ( + self.function_name, self.uuid, self.inputs, self.outputs) + + +def _find_all_hints_in_graph_def(session): + """Look at the current default graph and return a list of LiteFuncCall objs. + + Args: + session: A TensorFlow session that contains the graph to convert. + Returns: + a list of `LifeFuncCall` objects in the form + + """ + func_calls = _collections.defaultdict(_LiteFuncCall) + seen_ops = set() + + for op in session.graph.get_operations(): + for operand in _itertools.chain(op.inputs, op.outputs): + if operand in seen_ops: + continue + seen_ops.add(operand) + attr = operand.op.node_def.attr + uuid = attr[OpHint.FUNCTION_UUID_ATTR].s + if OpHint.FUNCTION_UUID_ATTR not in attr: + continue + call_def = func_calls[uuid] + call_def.uuid = uuid + if OpHint.FUNCTION_UUID_ATTR in attr: + call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s + if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr: + call_def.inputs[attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i] = operand + if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr: + call_def.outputs[attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i] = operand + + for a in attr: + if a.startswith("_tflite_attr_"): + # TODO(aselle): Remember the attribute tensors so we can put them + # in collapse. + call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor + + return func_calls + + +def _tensor_name_base(full_tensor_name): + """Removes the device assignment code from a tensor. + + e.g. _tensor_name_base("foo:3") => "foo" + + Args: + full_tensor_name: A tensor name that is annotated with a device placement + (this is what tensor flow introspection gives). + Returns: + A name without any device assignment. + """ + return full_tensor_name.name.split(":")[0] + + +def convert_op_hints_to_stubs(session): + """Converts a graphdef with LiteOp hints into stub operations. + + This is used to prepare for toco conversion of complex intrinsic usages. + + Args: + session: A TensorFlow session that contains the graph to convert. + Returns: + A new graphdef with all ops contained in OpHints being replaced by + a single op call with the right parameters. 
+ """ + hints = _find_all_hints_in_graph_def(session) + current_graph_def = session.graph_def + for call in hints.values(): + input_names = [None] * len(call.inputs) + output_names = [None] * len(call.outputs) + output_dtypes = [None] * len(call.outputs) + output_quantized = False + for input_index, tensor in call.inputs.items(): + input_names[input_index] = _tensor_name_base(tensor) + for output_index, tensor in call.outputs.items(): + output_names[output_index] = _tensor_name_base(tensor) + output_dtypes[output_index] = tensor.dtype.as_datatype_enum + # TODO(aselle): Support quantized flag properly + current_graph_def = _framework.fuse_op( + current_graph_def, input_names, output_names, output_dtypes, + output_quantized, call.uuid, call.function_name) + for node in current_graph_def.node: + if node.name == call.uuid: + for param, tensor in call.params.items(): + node.attr[param].tensor.CopyFrom(tensor) + return current_graph_def + + +_allowed_symbols = ["OpHint", "convert_op_hints_to_stubs"] +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs index ec202cd4073..c0b220e8721 100644 --- a/tensorflow/contrib/lite/schema/schema.fbs +++ b/tensorflow/contrib/lite/schema/schema.fbs @@ -119,6 +119,7 @@ enum BuiltinOperator : byte { SQUEEZE = 43, UNIDIRECTIONAL_SEQUENCE_LSTM = 44, STRIDED_SLICE = 45, + BIDIRECTIONAL_SEQUENCE_RNN = 46, } // Options for the builtin operators. @@ -224,6 +225,12 @@ table SequenceRNNOptions { fused_activation_function:ActivationFunctionType; } +// An implementation of TensorFlow bidrectional_dynamic_rnn with RNNCell. +table BidirectionalSequenceRNNOptions { + time_major:bool; + fused_activation_function:ActivationFunctionType; +} + // An implementation of TensorFlow fully_connected (a.k.a Dense) layer. 
table FullyConnectedOptions { fused_activation_function:ActivationFunctionType; @@ -282,15 +289,9 @@ table ReshapeOptions { } table SpaceToBatchNDOptions { - block_shape:[int]; - before_paddings:[int]; - after_paddings:[int]; } table BatchToSpaceNDOptions { - block_shape:[int]; - before_crops:[int]; - after_crops:[int]; } table SkipGramOptions { @@ -326,11 +327,9 @@ table GatherOptions { } table TransposeOptions { - perm:[int]; } table MeanOptions { - axis:[int]; keep_dims: bool; } diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h index c04a73a2bf0..29f3a17be7a 100755 --- a/tensorflow/contrib/lite/schema/schema_generated.h +++ b/tensorflow/contrib/lite/schema/schema_generated.h @@ -51,6 +51,9 @@ struct RNNOptionsT; struct SequenceRNNOptions; struct SequenceRNNOptionsT; +struct BidirectionalSequenceRNNOptions; +struct BidirectionalSequenceRNNOptionsT; + struct FullyConnectedOptions; struct FullyConnectedOptionsT; @@ -211,11 +214,12 @@ enum BuiltinOperator { BuiltinOperator_SQUEEZE = 43, BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44, BuiltinOperator_STRIDED_SLICE = 45, + BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN = 46, BuiltinOperator_MIN = BuiltinOperator_ADD, - BuiltinOperator_MAX = BuiltinOperator_STRIDED_SLICE + BuiltinOperator_MAX = BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN }; -inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] { +inline BuiltinOperator (&EnumValuesBuiltinOperator())[44] { static BuiltinOperator values[] = { BuiltinOperator_ADD, BuiltinOperator_AVERAGE_POOL_2D, @@ -259,7 +263,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[43] { BuiltinOperator_DIV, BuiltinOperator_SQUEEZE, BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, - BuiltinOperator_STRIDED_SLICE}; + BuiltinOperator_STRIDED_SLICE, + BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN}; return values; } @@ -310,6 +315,7 @@ inline const char **EnumNamesBuiltinOperator() { "SQUEEZE", "UNIDIRECTIONAL_SEQUENCE_LSTM", "STRIDED_SLICE", + "BIDIRECTIONAL_SEQUENCE_RNN", nullptr}; return names; } @@ -2005,6 +2011,85 @@ flatbuffers::Offset CreateSequenceRNNOptions( flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct BidirectionalSequenceRNNOptionsT : public flatbuffers::NativeTable { + typedef BidirectionalSequenceRNNOptions TableType; + bool time_major; + ActivationFunctionType fused_activation_function; + BidirectionalSequenceRNNOptionsT() + : time_major(false), + fused_activation_function(ActivationFunctionType_NONE) {} +}; + +struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS + : private flatbuffers::Table { + typedef BidirectionalSequenceRNNOptionsT NativeTableType; + enum { VT_TIME_MAJOR = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 }; + bool time_major() const { return GetField(VT_TIME_MAJOR, 0) != 0; } + ActivationFunctionType fused_activation_function() const { + return static_cast( + GetField(VT_FUSED_ACTIVATION_FUNCTION, 0)); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_TIME_MAJOR) && + VerifyField(verifier, VT_FUSED_ACTIVATION_FUNCTION) && + verifier.EndTable(); + } + BidirectionalSequenceRNNOptionsT *UnPack( + const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo( + BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack( + flatbuffers::FlatBufferBuilder 
&_fbb, + const BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct BidirectionalSequenceRNNOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_time_major(bool time_major) { + fbb_.AddElement(BidirectionalSequenceRNNOptions::VT_TIME_MAJOR, + static_cast(time_major), 0); + } + void add_fused_activation_function( + ActivationFunctionType fused_activation_function) { + fbb_.AddElement( + BidirectionalSequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION, + static_cast(fused_activation_function), 0); + } + explicit BidirectionalSequenceRNNOptionsBuilder( + flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + BidirectionalSequenceRNNOptionsBuilder &operator=( + const BidirectionalSequenceRNNOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset +CreateBidirectionalSequenceRNNOptions( + flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false, + ActivationFunctionType fused_activation_function = + ActivationFunctionType_NONE) { + BidirectionalSequenceRNNOptionsBuilder builder_(_fbb); + builder_.add_fused_activation_function(fused_activation_function); + builder_.add_time_major(time_major); + return builder_.Finish(); +} + +flatbuffers::Offset +CreateBidirectionalSequenceRNNOptions( + flatbuffers::FlatBufferBuilder &_fbb, + const BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct FullyConnectedOptionsT : public flatbuffers::NativeTable { typedef FullyConnectedOptions TableType; ActivationFunctionType fused_activation_function; @@ -2541,21 +2626,14 @@ flatbuffers::Offset CreateLSTMOptions( struct ResizeBilinearOptionsT : public flatbuffers::NativeTable { typedef ResizeBilinearOptions TableType; - int32_t new_height; - int32_t new_width; - ResizeBilinearOptionsT() : new_height(0), new_width(0) {} + ResizeBilinearOptionsT() {} }; struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef ResizeBilinearOptionsT NativeTableType; - enum { VT_NEW_HEIGHT = 4, VT_NEW_WIDTH = 6 }; - int32_t new_height() const { return GetField(VT_NEW_HEIGHT, 0); } - int32_t new_width() const { return GetField(VT_NEW_WIDTH, 0); } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyField(verifier, VT_NEW_HEIGHT) && - VerifyField(verifier, VT_NEW_WIDTH) && verifier.EndTable(); + return VerifyTableStart(verifier) && verifier.EndTable(); } ResizeBilinearOptionsT *UnPack( const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -2570,13 +2648,6 @@ struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS struct ResizeBilinearOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_new_height(int32_t new_height) { - fbb_.AddElement(ResizeBilinearOptions::VT_NEW_HEIGHT, new_height, - 0); - } - void add_new_width(int32_t new_width) { - fbb_.AddElement(ResizeBilinearOptions::VT_NEW_WIDTH, new_width, 0); - } explicit ResizeBilinearOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -2590,11 +2661,8 @@ struct ResizeBilinearOptionsBuilder { }; inline flatbuffers::Offset CreateResizeBilinearOptions( - flatbuffers::FlatBufferBuilder &_fbb, int32_t new_height = 0, - int32_t new_width = 0) { + flatbuffers::FlatBufferBuilder &_fbb) { 
ResizeBilinearOptionsBuilder builder_(_fbb); - builder_.add_new_width(new_width); - builder_.add_new_height(new_height); return builder_.Finish(); } @@ -2766,33 +2834,14 @@ flatbuffers::Offset CreateReshapeOptions( struct SpaceToBatchNDOptionsT : public flatbuffers::NativeTable { typedef SpaceToBatchNDOptions TableType; - std::vector block_shape; - std::vector before_paddings; - std::vector after_paddings; SpaceToBatchNDOptionsT() {} }; struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef SpaceToBatchNDOptionsT NativeTableType; - enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_PADDINGS = 6, VT_AFTER_PADDINGS = 8 }; - const flatbuffers::Vector *block_shape() const { - return GetPointer *>(VT_BLOCK_SHAPE); - } - const flatbuffers::Vector *before_paddings() const { - return GetPointer *>(VT_BEFORE_PADDINGS); - } - const flatbuffers::Vector *after_paddings() const { - return GetPointer *>(VT_AFTER_PADDINGS); - } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyOffset(verifier, VT_BLOCK_SHAPE) && - verifier.Verify(block_shape()) && - VerifyOffset(verifier, VT_BEFORE_PADDINGS) && - verifier.Verify(before_paddings()) && - VerifyOffset(verifier, VT_AFTER_PADDINGS) && - verifier.Verify(after_paddings()) && verifier.EndTable(); + return VerifyTableStart(verifier) && verifier.EndTable(); } SpaceToBatchNDOptionsT *UnPack( const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -2807,18 +2856,6 @@ struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS struct SpaceToBatchNDOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_block_shape( - flatbuffers::Offset> block_shape) { - fbb_.AddOffset(SpaceToBatchNDOptions::VT_BLOCK_SHAPE, block_shape); - } - void add_before_paddings( - flatbuffers::Offset> before_paddings) { - fbb_.AddOffset(SpaceToBatchNDOptions::VT_BEFORE_PADDINGS, before_paddings); - } - void add_after_paddings( - flatbuffers::Offset> after_paddings) { - fbb_.AddOffset(SpaceToBatchNDOptions::VT_AFTER_PADDINGS, after_paddings); - } explicit SpaceToBatchNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -2832,62 +2869,25 @@ struct SpaceToBatchNDOptionsBuilder { }; inline flatbuffers::Offset CreateSpaceToBatchNDOptions( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> block_shape = 0, - flatbuffers::Offset> before_paddings = 0, - flatbuffers::Offset> after_paddings = 0) { + flatbuffers::FlatBufferBuilder &_fbb) { SpaceToBatchNDOptionsBuilder builder_(_fbb); - builder_.add_after_paddings(after_paddings); - builder_.add_before_paddings(before_paddings); - builder_.add_block_shape(block_shape); return builder_.Finish(); } -inline flatbuffers::Offset -CreateSpaceToBatchNDOptionsDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *block_shape = nullptr, - const std::vector *before_paddings = nullptr, - const std::vector *after_paddings = nullptr) { - return tflite::CreateSpaceToBatchNDOptions( - _fbb, block_shape ? _fbb.CreateVector(*block_shape) : 0, - before_paddings ? _fbb.CreateVector(*before_paddings) : 0, - after_paddings ? 
_fbb.CreateVector(*after_paddings) : 0); -} - flatbuffers::Offset CreateSpaceToBatchNDOptions( flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); struct BatchToSpaceNDOptionsT : public flatbuffers::NativeTable { typedef BatchToSpaceNDOptions TableType; - std::vector block_shape; - std::vector before_crops; - std::vector after_crops; BatchToSpaceNDOptionsT() {} }; struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef BatchToSpaceNDOptionsT NativeTableType; - enum { VT_BLOCK_SHAPE = 4, VT_BEFORE_CROPS = 6, VT_AFTER_CROPS = 8 }; - const flatbuffers::Vector *block_shape() const { - return GetPointer *>(VT_BLOCK_SHAPE); - } - const flatbuffers::Vector *before_crops() const { - return GetPointer *>(VT_BEFORE_CROPS); - } - const flatbuffers::Vector *after_crops() const { - return GetPointer *>(VT_AFTER_CROPS); - } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyOffset(verifier, VT_BLOCK_SHAPE) && - verifier.Verify(block_shape()) && - VerifyOffset(verifier, VT_BEFORE_CROPS) && - verifier.Verify(before_crops()) && - VerifyOffset(verifier, VT_AFTER_CROPS) && - verifier.Verify(after_crops()) && verifier.EndTable(); + return VerifyTableStart(verifier) && verifier.EndTable(); } BatchToSpaceNDOptionsT *UnPack( const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -2902,18 +2902,6 @@ struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS struct BatchToSpaceNDOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_block_shape( - flatbuffers::Offset> block_shape) { - fbb_.AddOffset(BatchToSpaceNDOptions::VT_BLOCK_SHAPE, block_shape); - } - void add_before_crops( - flatbuffers::Offset> before_crops) { - fbb_.AddOffset(BatchToSpaceNDOptions::VT_BEFORE_CROPS, before_crops); - } - void add_after_crops( - flatbuffers::Offset> after_crops) { - fbb_.AddOffset(BatchToSpaceNDOptions::VT_AFTER_CROPS, after_crops); - } explicit BatchToSpaceNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -2927,29 +2915,11 @@ struct BatchToSpaceNDOptionsBuilder { }; inline flatbuffers::Offset CreateBatchToSpaceNDOptions( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> block_shape = 0, - flatbuffers::Offset> before_crops = 0, - flatbuffers::Offset> after_crops = 0) { + flatbuffers::FlatBufferBuilder &_fbb) { BatchToSpaceNDOptionsBuilder builder_(_fbb); - builder_.add_after_crops(after_crops); - builder_.add_before_crops(before_crops); - builder_.add_block_shape(block_shape); return builder_.Finish(); } -inline flatbuffers::Offset -CreateBatchToSpaceNDOptionsDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *block_shape = nullptr, - const std::vector *before_crops = nullptr, - const std::vector *after_crops = nullptr) { - return tflite::CreateBatchToSpaceNDOptions( - _fbb, block_shape ? _fbb.CreateVector(*block_shape) : 0, - before_crops ? _fbb.CreateVector(*before_crops) : 0, - after_crops ? 
_fbb.CreateVector(*after_crops) : 0); -} - flatbuffers::Offset CreateBatchToSpaceNDOptions( flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -3324,19 +3294,13 @@ flatbuffers::Offset CreateGatherOptions( struct TransposeOptionsT : public flatbuffers::NativeTable { typedef TransposeOptions TableType; - std::vector perm; TransposeOptionsT() {} }; struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef TransposeOptionsT NativeTableType; - enum { VT_PERM = 4 }; - const flatbuffers::Vector *perm() const { - return GetPointer *>(VT_PERM); - } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_PERM) && - verifier.Verify(perm()) && verifier.EndTable(); + return VerifyTableStart(verifier) && verifier.EndTable(); } TransposeOptionsT *UnPack( const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -3351,9 +3315,6 @@ struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct TransposeOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_perm(flatbuffers::Offset> perm) { - fbb_.AddOffset(TransposeOptions::VT_PERM, perm); - } explicit TransposeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -3367,41 +3328,27 @@ struct TransposeOptionsBuilder { }; inline flatbuffers::Offset CreateTransposeOptions( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> perm = 0) { + flatbuffers::FlatBufferBuilder &_fbb) { TransposeOptionsBuilder builder_(_fbb); - builder_.add_perm(perm); return builder_.Finish(); } -inline flatbuffers::Offset CreateTransposeOptionsDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *perm = nullptr) { - return tflite::CreateTransposeOptions( - _fbb, perm ? 
_fbb.CreateVector(*perm) : 0); -} - flatbuffers::Offset CreateTransposeOptions( flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); struct MeanOptionsT : public flatbuffers::NativeTable { typedef MeanOptions TableType; - std::vector axis; bool keep_dims; MeanOptionsT() : keep_dims(false) {} }; struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { typedef MeanOptionsT NativeTableType; - enum { VT_AXIS = 4, VT_KEEP_DIMS = 6 }; - const flatbuffers::Vector *axis() const { - return GetPointer *>(VT_AXIS); - } + enum { VT_KEEP_DIMS = 4 }; bool keep_dims() const { return GetField(VT_KEEP_DIMS, 0) != 0; } bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_AXIS) && - verifier.Verify(axis()) && + return VerifyTableStart(verifier) && VerifyField(verifier, VT_KEEP_DIMS) && verifier.EndTable(); } MeanOptionsT *UnPack( @@ -3417,9 +3364,6 @@ struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct MeanOptionsBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; - void add_axis(flatbuffers::Offset> axis) { - fbb_.AddOffset(MeanOptions::VT_AXIS, axis); - } void add_keep_dims(bool keep_dims) { fbb_.AddElement(MeanOptions::VT_KEEP_DIMS, static_cast(keep_dims), 0); @@ -3437,22 +3381,12 @@ struct MeanOptionsBuilder { }; inline flatbuffers::Offset CreateMeanOptions( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> axis = 0, - bool keep_dims = false) { + flatbuffers::FlatBufferBuilder &_fbb, bool keep_dims = false) { MeanOptionsBuilder builder_(_fbb); - builder_.add_axis(axis); builder_.add_keep_dims(keep_dims); return builder_.Finish(); } -inline flatbuffers::Offset CreateMeanOptionsDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *axis = nullptr, bool keep_dims = false) { - return tflite::CreateMeanOptions( - _fbb, axis ? 
_fbb.CreateVector(*axis) : 0, keep_dims); -} - flatbuffers::Offset CreateMeanOptions( flatbuffers::FlatBufferBuilder &_fbb, const MeanOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -5098,6 +5032,56 @@ inline flatbuffers::Offset CreateSequenceRNNOptions( _fused_activation_function); } +inline BidirectionalSequenceRNNOptionsT * +BidirectionalSequenceRNNOptions::UnPack( + const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new BidirectionalSequenceRNNOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void BidirectionalSequenceRNNOptions::UnPackTo( + BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { + auto _e = time_major(); + _o->time_major = _e; + }; + { + auto _e = fused_activation_function(); + _o->fused_activation_function = _e; + }; +} + +inline flatbuffers::Offset +BidirectionalSequenceRNNOptions::Pack( + flatbuffers::FlatBufferBuilder &_fbb, + const BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::rehasher_function_t *_rehasher) { + return CreateBidirectionalSequenceRNNOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset +CreateBidirectionalSequenceRNNOptions( + flatbuffers::FlatBufferBuilder &_fbb, + const BidirectionalSequenceRNNOptionsT *_o, + const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { + flatbuffers::FlatBufferBuilder *__fbb; + const BidirectionalSequenceRNNOptionsT *__o; + const flatbuffers::rehasher_function_t *__rehasher; + } _va = {&_fbb, _o, _rehasher}; + (void)_va; + auto _time_major = _o->time_major; + auto _fused_activation_function = _o->fused_activation_function; + return tflite::CreateBidirectionalSequenceRNNOptions( + _fbb, _time_major, _fused_activation_function); +} + inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack( const flatbuffers::resolver_function_t *_resolver) const { auto _o = new FullyConnectedOptionsT(); @@ -5457,14 +5441,6 @@ inline void ResizeBilinearOptions::UnPackTo( const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = new_height(); - _o->new_height = _e; - }; - { - auto _e = new_width(); - _o->new_width = _e; - }; } inline flatbuffers::Offset ResizeBilinearOptions::Pack( @@ -5484,9 +5460,7 @@ inline flatbuffers::Offset CreateResizeBilinearOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _new_height = _o->new_height; - auto _new_width = _o->new_width; - return tflite::CreateResizeBilinearOptions(_fbb, _new_height, _new_width); + return tflite::CreateResizeBilinearOptions(_fbb); } inline CallOptionsT *CallOptions::UnPack( @@ -5616,33 +5590,6 @@ inline void SpaceToBatchNDOptions::UnPackTo( const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = block_shape(); - if (_e) { - _o->block_shape.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->block_shape[_i] = _e->Get(_i); - } - } - }; - { - auto _e = before_paddings(); - if (_e) { - _o->before_paddings.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->before_paddings[_i] = _e->Get(_i); - } - } - }; - { - auto _e = after_paddings(); - if (_e) { - _o->after_paddings.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->after_paddings[_i] = _e->Get(_i); - } - } - }; } inline flatbuffers::Offset 
SpaceToBatchNDOptions::Pack( @@ -5662,14 +5609,7 @@ inline flatbuffers::Offset CreateSpaceToBatchNDOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _block_shape = - _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0; - auto _before_paddings = - _o->before_paddings.size() ? _fbb.CreateVector(_o->before_paddings) : 0; - auto _after_paddings = - _o->after_paddings.size() ? _fbb.CreateVector(_o->after_paddings) : 0; - return tflite::CreateSpaceToBatchNDOptions(_fbb, _block_shape, - _before_paddings, _after_paddings); + return tflite::CreateSpaceToBatchNDOptions(_fbb); } inline BatchToSpaceNDOptionsT *BatchToSpaceNDOptions::UnPack( @@ -5684,33 +5624,6 @@ inline void BatchToSpaceNDOptions::UnPackTo( const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = block_shape(); - if (_e) { - _o->block_shape.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->block_shape[_i] = _e->Get(_i); - } - } - }; - { - auto _e = before_crops(); - if (_e) { - _o->before_crops.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->before_crops[_i] = _e->Get(_i); - } - } - }; - { - auto _e = after_crops(); - if (_e) { - _o->after_crops.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->after_crops[_i] = _e->Get(_i); - } - } - }; } inline flatbuffers::Offset BatchToSpaceNDOptions::Pack( @@ -5730,14 +5643,7 @@ inline flatbuffers::Offset CreateBatchToSpaceNDOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _block_shape = - _o->block_shape.size() ? _fbb.CreateVector(_o->block_shape) : 0; - auto _before_crops = - _o->before_crops.size() ? _fbb.CreateVector(_o->before_crops) : 0; - auto _after_crops = - _o->after_crops.size() ? _fbb.CreateVector(_o->after_crops) : 0; - return tflite::CreateBatchToSpaceNDOptions(_fbb, _block_shape, _before_crops, - _after_crops); + return tflite::CreateBatchToSpaceNDOptions(_fbb); } inline SkipGramOptionsT *SkipGramOptions::UnPack( @@ -5999,15 +5905,6 @@ inline void TransposeOptions::UnPackTo( const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = perm(); - if (_e) { - _o->perm.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->perm[_i] = _e->Get(_i); - } - } - }; } inline flatbuffers::Offset TransposeOptions::Pack( @@ -6027,8 +5924,7 @@ inline flatbuffers::Offset CreateTransposeOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _perm = _o->perm.size() ? _fbb.CreateVector(_o->perm) : 0; - return tflite::CreateTransposeOptions(_fbb, _perm); + return tflite::CreateTransposeOptions(_fbb); } inline MeanOptionsT *MeanOptions::UnPack( @@ -6042,15 +5938,6 @@ inline void MeanOptions::UnPackTo( MeanOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { (void)_o; (void)_resolver; - { - auto _e = axis(); - if (_e) { - _o->axis.resize(_e->size()); - for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { - _o->axis[_i] = _e->Get(_i); - } - } - }; { auto _e = keep_dims(); _o->keep_dims = _e; @@ -6074,9 +5961,8 @@ inline flatbuffers::Offset CreateMeanOptions( const flatbuffers::rehasher_function_t *__rehasher; } _va = {&_fbb, _o, _rehasher}; (void)_va; - auto _axis = _o->axis.size() ? 
_fbb.CreateVector(_o->axis) : 0; auto _keep_dims = _o->keep_dims; - return tflite::CreateMeanOptions(_fbb, _axis, _keep_dims); + return tflite::CreateMeanOptions(_fbb, _keep_dims); } inline SqueezeOptionsT *SqueezeOptions::UnPack( diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD index 50e8ca75f8e..b949045128f 100644 --- a/tensorflow/contrib/lite/testing/BUILD +++ b/tensorflow/contrib/lite/testing/BUILD @@ -197,7 +197,7 @@ cc_binary( tf_cc_test( name = "generated_examples_zip_test", - size = "medium", + size = "large", srcs = ["generated_examples_zip_test.cc"], args = [ "--zip_files_dir=tensorflow/contrib/lite/testing/optest", @@ -206,7 +206,7 @@ tf_cc_test( "--unzip_binary_path=/usr/bin/unzip", ], data = [":optest"], - shard_count = 10, + shard_count = 20, tags = ["no_oss"], deps = [ ":parse_testdata_lib", diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py index a6393516578..b2227a7c98f 100644 --- a/tensorflow/contrib/lite/testing/generate_examples.py +++ b/tensorflow/contrib/lite/testing/generate_examples.py @@ -94,7 +94,8 @@ KNOWN_BUGS = { r"softmax.*input_shape=\[1,3,4,3\]": "67749831", # SpaceToDepth only supports float32. r"space_to_depth.*(float16|int32|uint8|int64)": "68018134", - # BatchToSpaceND doesn't support cropping. + # BatchToSpaceND doesn't support cropping. This catches test cases with + # const tensors as crops. r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634", # BatchToSpaceND only supports 4D tensors. r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733", @@ -694,6 +695,7 @@ def make_mean_tests(zip_path): [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0], [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3] ], + "const_axis": [True, False], "keep_dims": [True, False], }, { "input_dtype": [tf.float32, tf.int32, tf.int64], @@ -704,6 +706,7 @@ def make_mean_tests(zip_path): -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2], [2, 2, 3], [-3, -3, -4], [-3, 2, 1] ], + "const_axis": [True, False], "keep_dims": [True, False], }] @@ -713,17 +716,31 @@ def make_mean_tests(zip_path): dtype=parameters["input_dtype"], name="input", shape=parameters["input_shape"]) + + # Get axis as either a placeholder or constants. + if parameters["const_axis"]: + axis = parameters["axis"] + input_tensors = [input_tensor] + else: + if isinstance(parameters["axis"], list): + shape = [len(parameters["axis"])] + else: + shape = [0] # shape for None or integers. 
+ axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape) + input_tensors = [input_tensor, axis] + out = tf.reduce_mean( - input_tensor, - axis=parameters["axis"], - keep_dims=parameters["keep_dims"]) - return [input_tensor], [out] + input_tensor, axis=axis, keep_dims=parameters["keep_dims"]) + return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(parameters["input_dtype"], - parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) + values = [ + create_tensor_data(parameters["input_dtype"], parameters["input_shape"]) + ] + if not parameters["const_axis"]: + if parameters["axis"]: + values.append(np.array(parameters["axis"])) + return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -1318,12 +1335,16 @@ def make_space_to_batch_nd_tests(zip_path): "input_shape": [[1, 2, 2, 3], [2, 2, 4, 1]], "block_shape": [[1, 3], [2, 2]], "paddings": [[[0, 0], [0, 0]], [[0, 0], [2, 0]], [[1, 1], [1, 1]]], + "constant_block_shape": [True, False], + "constant_paddings": [True, False], }, { "dtype": [tf.float32], "input_shape": [[2, 3, 7, 3]], "block_shape": [[1, 3], [2, 2]], "paddings": [[[0, 0], [2, 0]], [[1, 0], [1, 0]]], + "constant_block_shape": [True, False], + "constant_paddings": [True, False], }, # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others. { @@ -1331,23 +1352,47 @@ def make_space_to_batch_nd_tests(zip_path): "input_shape": [[1, 4, 4, 4, 1, 1]], "block_shape": [[2, 2, 2]], "paddings": [[[0, 0], [0, 0], [0, 0]]], + "constant_block_shape": [True, False], + "constant_paddings": [True, False], }, ] def build_graph(parameters): + """Build a space_to_batch graph given `parameters`.""" input_tensor = tf.placeholder( dtype=parameters["dtype"], name="input", shape=parameters["input_shape"]) - out = tf.space_to_batch_nd(input_tensor, parameters["block_shape"], - parameters["paddings"]) - return [input_tensor], [out] + input_tensors = [input_tensor] + + # Get block_shape either as a const or as a placeholder (tensor). + if parameters["constant_block_shape"]: + block_shape = parameters["block_shape"] + else: + shape = [len(parameters["block_shape"])] + block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape) + input_tensors.append(block_shape) + + # Get paddings either as a const or as a placeholder (tensor). 
+ if parameters["constant_paddings"]: + paddings = parameters["paddings"] + else: + shape = [len(parameters["paddings"]), 2] + paddings = tf.placeholder(dtype=tf.int32, name="paddings", shape=shape) + input_tensors.append(paddings) + + out = tf.space_to_batch_nd(input_tensor, block_shape, paddings) + return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(parameters["dtype"], - parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) + values = [ + create_tensor_data(parameters["dtype"], parameters["input_shape"]) + ] + if not parameters["constant_block_shape"]: + values.append(np.array(parameters["block_shape"])) + if not parameters["constant_paddings"]: + values.append(np.array(parameters["paddings"])) + return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -1361,6 +1406,8 @@ def make_batch_to_space_nd_tests(zip_path): "input_shape": [[12, 2, 2, 1]], "block_shape": [[1, 4], [2, 2], [3, 4]], "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]], + "constant_block_shape": [True, False], + "constant_crops": [True, False], }, # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others. { @@ -1368,23 +1415,47 @@ def make_batch_to_space_nd_tests(zip_path): "input_shape": [[8, 2, 2, 2, 1, 1]], "block_shape": [[2, 2, 2]], "crops": [[[0, 0], [0, 0], [0, 0]]], + "constant_block_shape": [True, False], + "constant_crops": [True, False], }, ] def build_graph(parameters): + """Build a batch_to_space graph given `parameters`.""" input_tensor = tf.placeholder( dtype=parameters["dtype"], name="input", shape=parameters["input_shape"]) - out = tf.batch_to_space_nd(input_tensor, parameters["block_shape"], - parameters["crops"]) - return [input_tensor], [out] + input_tensors = [input_tensor] + + # Get block_shape either as a const or as a placeholder (tensor). + if parameters["constant_block_shape"]: + block_shape = parameters["block_shape"] + else: + shape = [len(parameters["block_shape"])] + block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape) + input_tensors.append(block_shape) + + # Get crops either as a const or as a placeholder (tensor). 
+ if parameters["constant_crops"]: + crops = parameters["crops"] + else: + shape = [len(parameters["crops"]), 2] + crops = tf.placeholder(dtype=tf.int32, name="crops", shape=shape) + input_tensors.append(crops) + + out = tf.batch_to_space_nd(input_tensor, block_shape, crops) + return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(parameters["dtype"], - parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) + values = [ + create_tensor_data(parameters["dtype"], parameters["input_shape"]) + ] + if not parameters["constant_block_shape"]: + values.append(np.array(parameters["block_shape"])) + if not parameters["constant_crops"]: + values.append(np.array(parameters["crops"])) + return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -1397,29 +1468,44 @@ def make_transpose_tests(zip_path): "dtype": [tf.int32, tf.int64, tf.float32], "input_shape": [[2, 2, 3]], "perm": [[0, 1, 2], [0, 2, 1]], + "constant_perm": [True, False], }, { "dtype": [tf.float32], "input_shape": [[1, 2, 3, 4]], "perm": [[0, 1, 2, 3], [3, 0, 1, 2]], + "constant_perm": [True, False], }, { "dtype": [tf.float32], "input_shape": [[1, 2, 3, 4, 5]], "perm": [[0, 1, 2, 3, 4]], + "constant_perm": [True, False], }] def build_graph(parameters): + """Build a transpose graph given `parameters`.""" input_tensor = tf.placeholder( dtype=parameters["dtype"], name="input", shape=parameters["input_shape"]) - out = tf.transpose(input_tensor, perm=parameters["perm"]) - return [input_tensor], [out] + + if parameters["constant_perm"]: + perm = parameters["perm"] + input_tensors = [input_tensor] + else: + shape = [len(parameters["perm"]), 2] + perm = tf.placeholder(dtype=tf.int32, name="perm", shape=shape) + input_tensors = [input_tensor, perm] + + out = tf.transpose(input_tensor, perm=perm) + return input_tensors, [out] def build_inputs(parameters, sess, inputs, outputs): - input_values = create_tensor_data(parameters["dtype"], - parameters["input_shape"]) - return [input_values], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_values]))) + values = [ + create_tensor_data(parameters["dtype"], parameters["input_shape"]) + ] + if not parameters["constant_perm"]: + values.append(np.array(parameters["perm"])) + return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -1474,9 +1560,11 @@ def make_strided_slice_tests(zip_path): "input_shape": [[12, 2, 2, 5]], "begin": [[0, 0, 0, 0], [1, 0, 1, 0]], "end": [[8, 2, 2, 3], [12, 2, 2, 5]], - "strides": [None, [1, 1, 1, 1], [2, 1, 3, 1]], - "begin_mask": [None, 0, 1, 2, 8], - "end_mask": [None, 0, 1, 2, 8], + "strides": [None, [2, 1, 3, 1]], + "begin_mask": [None, 1, 8], + "end_mask": [None, 1, 8], + "shrink_axis_mask": [None, 1, 8, 11, 15, -1], + "constant_indices": [False, True], }, # 2-D { @@ -1485,9 +1573,11 @@ def make_strided_slice_tests(zip_path): "input_shape": [[2, 3]], "begin": [[0, 0], [1, 0]], "end": [[2, 3], [2, 2]], - "strides": [None, [1, 1], [2, 2]], - "begin_mask": [None, 0, 1, 2], - "end_mask": [None, 0, 1, 2], + "strides": [None, [2, 2]], + "begin_mask": [None, 1, 2], + "end_mask": [None, 1, 2], + "shrink_axis_mask": [None, 1, 2, 3, -1], + "constant_indices": [False, True], }, # Negative strides { @@ -1497,8 +1587,10 @@ def make_strided_slice_tests(zip_path): "begin": [[0, -1]], "end": 
[[2, -3]], "strides": [[1, -1]], - "begin_mask": [None, 0, 1, 2], - "end_mask": [None, 0, 1, 2], + "begin_mask": [None, 1, 2], + "end_mask": [None, 1, 2], + "shrink_axis_mask": [None, 1, 2, 3, -1], + "constant_indices": [False], }, ] @@ -1508,23 +1600,29 @@ def make_strided_slice_tests(zip_path): dtype=parameters["dtype"], name="input", shape=parameters["input_shape"]) - begin = tf.placeholder( - dtype=parameters["index_type"], - name="begin", - shape=[len(parameters["input_shape"])]) - end = tf.placeholder( - dtype=parameters["index_type"], - name="end", - shape=[len(parameters["input_shape"])]) - strides = ( - tf.placeholder( - dtype=parameters["index_type"], - name="strides", - shape=[len(parameters["input_shape"])]) - if parameters["strides"] is not None else None) - tensors = [input_tensor, begin, end] - if strides is not None: - tensors.append(strides) + if parameters["constant_indices"]: + begin = parameters["begin"] + end = parameters["end"] + strides = parameters["strides"] + tensors = [input_tensor] + else: + begin = tf.placeholder( + dtype=parameters["index_type"], + name="begin", + shape=[len(parameters["input_shape"])]) + end = tf.placeholder( + dtype=parameters["index_type"], + name="end", + shape=[len(parameters["input_shape"])]) + strides = ( + tf.placeholder( + dtype=parameters["index_type"], + name="strides", + shape=[len(parameters["input_shape"])]) + if parameters["strides"] is not None else None) + tensors = [input_tensor, begin, end] + if strides is not None: + tensors.append(strides) out = tf.strided_slice( input_tensor, begin, @@ -1539,14 +1637,17 @@ def make_strided_slice_tests(zip_path): input_values = create_tensor_data(parameters["dtype"], parameters["input_shape"]) index_type = _TF_TYPE_INFO[parameters["index_type"]][0] - begin_values = np.array(parameters["begin"]).astype(index_type) - end_values = np.array(parameters["end"]).astype(index_type) - stride_values = ( - np.array(parameters["strides"]).astype(index_type) - if parameters["strides"] is not None else None) - values = [input_values, begin_values, end_values] - if stride_values is not None: - values.append(stride_values) + values = [input_values] + if not parameters["constant_indices"]: + begin_values = np.array(parameters["begin"]).astype(index_type) + end_values = np.array(parameters["end"]).astype(index_type) + stride_values = ( + np.array(parameters["strides"]).astype(index_type) + if parameters["strides"] is not None else None) + values.append(begin_values) + values.append(end_values) + if stride_values is not None: + values.append(stride_values) return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc index 41652a07d21..e8b425a5925 100644 --- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc +++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc @@ -47,9 +47,7 @@ tensorflow::Env* env = tensorflow::Env::Default(); // Key is a substring of the test name and value is a bug number. // TODO(ahentz): make sure we clean this list up frequently. std::map kBrokenTests = { - // Add doesn't support broadcasting. - {R"(^\/adda.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"}, - {R"(^\/mula.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"}, + // Sub and Div don't support broadcasting. 
{R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"}, {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"}, @@ -67,7 +65,11 @@ std::map kBrokenTests = { // L2Norm only supports tensors with 4D or fewer. {R"(^\/l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"}, - // SpaceToBatch only supports 4D tensors. + // BatchToSpaceND doesn't support cropping. This catches test cases with + // non-const tensors as crops. + {R"(^\/batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\])", "70594634"}, + + // SpaceToBatchND only supports 4D tensors. {R"(^\/space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"}, // L2Norm only works for dim=-1. @@ -92,7 +94,7 @@ std::map kBrokenTests = { {R"(^\/resize_bilinearalign_corners=True,.*,size=\[5,6\])", "72401483"}, // Transpose only supports 1D-4D input tensors. - {R"(^\/transposedtype=.*,input_shape=\[.,.,.,.,.\],perm=.*)", "71545879"}, + {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"}, }; // Allows test data to be unzipped into a temporary directory and makes @@ -239,8 +241,7 @@ INSTANTIATE_TESTS(avg_pool) INSTANTIATE_TESTS(space_to_batch_nd) INSTANTIATE_TESTS(batch_to_space_nd) INSTANTIATE_TESTS(concat) -// TODO(b/71642435) re-enable this test -// INSTANTIATE_TESTS(constant) +INSTANTIATE_TESTS(constant) INSTANTIATE_TESTS(control_dep) INSTANTIATE_TESTS(conv) INSTANTIATE_TESTS(depthwiseconv) diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD index 6fc7e5e3fdd..20c156a9326 100644 --- a/tensorflow/contrib/lite/toco/BUILD +++ b/tensorflow/contrib/lite/toco/BUILD @@ -205,6 +205,7 @@ cc_library( "graph_transformations/remove_trivial_quantized_activation_func.cc", "graph_transformations/remove_trivial_reshape.cc", "graph_transformations/remove_unused_op.cc", + "graph_transformations/reorder_activation_functions.cc", "graph_transformations/resolve_batch_normalization.cc", "graph_transformations/resolve_batch_to_space_nd_attributes.cc", "graph_transformations/resolve_constant_binary.cc", diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc index 529df3cd2e5..4c70b01a9d0 100644 --- a/tensorflow/contrib/lite/toco/export_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc @@ -621,7 +621,8 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op, GraphDef* tensorflow_graph) { string softmax_input; Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]); - if (providing_op->type == OperatorType::kTensorFlowReshape) { + if (providing_op != nullptr && + providing_op->type == OperatorType::kTensorFlowReshape) { softmax_input = src_op.inputs[0]; } else { // Insert a reshape operator that reduces the dimensions down to the 2 that diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc index 88e59664ec4..ab943f72d1d 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc @@ -68,12 +68,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) { return false; } - // TODO(b/72172404): Great many ops don't support activation function - // fusing. Switch to a categorizing function instead. 
- if (op->type == OperatorType::kConcatenation || - op->type == OperatorType::kSlice || - op->type == OperatorType::kTensorFlowReshape || - op->type == OperatorType::kTensorFlowSplit) { + if (!OperatorSupportsFusedActivation(op->type)) { AddMessageF( "Not fusing activation function because the %s op doesn't support it", LogName(*op)); diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h index e11bebcd4e0..cf90ebe9969 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h +++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h @@ -144,6 +144,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator) DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays) DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays) DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax) +DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions) DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes) DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat) DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul) diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc index 082820fddcf..c363b93394f 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include -#include "absl/strings/string_view.h" #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" #include "tensorflow/contrib/lite/toco/model.h" #include "tensorflow/contrib/lite/toco/tooling_util.h" @@ -202,23 +201,6 @@ bool MatchOperatorInputs(const Operator& op, const Model& model, return true; } -absl::string_view FindLongestCommonPrefix(absl::string_view a, - absl::string_view b) { - if (a.empty() || b.empty()) return absl::string_view(); - - const char* pa = a.data(); - const char* pb = b.data(); - size_t count = 0; - const ssize_t limit = std::min(a.size(), b.size()); - while (count < limit && *pa == *pb) { - ++pa; - ++pb; - ++count; - } - - return absl::string_view(a.data(), count); -} - } // namespace bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) { diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc index 4fb3b6ae7a5..7f26884bc15 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -1120,7 +1120,8 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) { stop += input_array.shape().dims(i); } - int dim_size = (stop - start) / op->strides[i]; + int dim_size = ceil((stop - start) / static_cast(op->strides[i])); + dim_size = dim_size < 0 ? 0 : dim_size; if (op->shrink_axis_mask & mask) { CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when " "shrinking that axis"; diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc new file mode 100644 index 00000000000..cabbc4d313b --- /dev/null +++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc @@ -0,0 +1,85 @@ +/* Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" +#include "tensorflow/contrib/lite/toco/model.h" +#include "tensorflow/contrib/lite/toco/runtime/types.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" +#include "tensorflow/core/platform/logging.h" + +namespace toco { + +bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) { + const auto ac_it = model->operators.begin() + op_index; + std::unique_ptr& ac_op = *ac_it; + DCHECK(ac_op); + + if (ac_op->type != OperatorType::kRelu6 && + ac_op->type != OperatorType::kRelu1 && + ac_op->type != OperatorType::kRelu) { + return false; + } + + auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]); + if (exchange_it == model->operators.end()) return false; + // Find the op producing the array passed to this activation function + std::unique_ptr& exchange_op = *exchange_it; + DCHECK(exchange_op); + + if (exchange_op->type != OperatorType::kTensorFlowReshape) { + return false; + } + + DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]); + const auto& exchange_op_input = exchange_op->inputs[0]; + const auto& intermediate_array = exchange_op->outputs[0]; + const auto& ac_op_output = ac_op->outputs[0]; + + int count_ops_consuming_output = + CountOpsWithInput(*model, intermediate_array); + DCHECK_GE(count_ops_consuming_output, 1); + if (count_ops_consuming_output > 1) { + AddMessageF( + "Not exchanging activation function with %s because it is consumed by " + "more than 1 other operator", + LogName(*exchange_op)); + return false; + } + + // Rewire by changing inputs, including all consumers. + Operator* consumer = GetFirstOpWithInput(*model, ac_op_output); + while (consumer) { + for (int i = 0; i < consumer->inputs.size(); ++i) { + if (consumer->inputs[i] == ac_op_output) { + consumer->inputs[i] = intermediate_array; + } + } + consumer = GetFirstOpWithInput(*model, ac_op_output); + } + ac_op->inputs[0] = exchange_op_input; + exchange_op->inputs[0] = ac_op_output; + + // Finally, reorder operators. Note that this only works when there are no + // other direct descendents of the exchange_op. + ac_op.swap(exchange_op); + + return true; +} + +} // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc index 5ac449749ad..db68968bad1 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc @@ -73,7 +73,7 @@ void CopyTensorSegments(const std::vector& input_arrays, // Receives a series of input arrays of type Array and an integer showing the // axis on which those arrays will be concatenated. It returns the concatenated -// arrray. +// array. 
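The ReorderActivationFunctions pass added above hoists a Relu/Relu1/Relu6 over a preceding Reshape so the activation ends up adjacent to an operator that can actually absorb it (compare the OperatorSupportsFusedActivation helper introduced in tooling_util further down in this patch). The swap is only sound because these activations are elementwise and a reshape never changes element values, so the two operations commute. A minimal standalone sketch of that property, deliberately independent of toco's types:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Elementwise ReLU6: the same math regardless of the tensor's shape.
std::vector<float> Relu6(std::vector<float> values) {
  for (float& v : values) v = std::min(6.0f, std::max(0.0f, v));
  return values;
}

// A "reshape" only reinterprets the shape metadata; the flat buffer of
// element values is untouched.
std::vector<float> Reshape(const std::vector<float>& values) { return values; }

int main() {
  const std::vector<float> data = {-3.0f, 0.5f, 7.0f, 2.0f};
  // ReLU6 applied after the reshape equals ReLU6 applied before it, which is
  // what makes it legal for the pass to swap the two operators.
  assert(Relu6(Reshape(data)) == Reshape(Relu6(data)));
  return 0;
}
```

Note that the pass still refuses to swap when the intermediate array feeds more than one consumer, since rewiring it would silently change those other consumers' inputs.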
template void ConcatenateTensorBuffers(const std::vector& input_arrays, int concatenation_axis, diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index ca378af4c5c..9862dbe99d5 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -173,7 +173,8 @@ void ImportFloatArray(const TensorProto& input_tensor, Array* output_array) { } auto& output_float_data = output_array->GetMutableBuffer().data; - output_float_data.resize(input_flat_size); + output_float_data.resize(RequiredBufferSizeForShape(output_array->shape()), + 0.f); if (input_tensor.float_val_size() == 1) { for (int i = 0; i < input_flat_size; i++) { output_float_data[i] = input_tensor.float_val(0); @@ -203,7 +204,7 @@ void ImportQuint8Array(const TensorProto& input_tensor, Array* output_array) { } auto& output_int_data = output_array->GetMutableBuffer().data; - output_int_data.resize(input_flat_size); + output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0); if (input_tensor.int_val_size()) { for (int i = 0; i < input_tensor.int_val_size(); i++) { output_int_data[i] = input_tensor.int_val(i); @@ -229,7 +230,7 @@ void ImportInt32Array(const TensorProto& input_tensor, Array* output_array) { } auto& output_int_data = output_array->GetMutableBuffer().data; - output_int_data.resize(input_flat_size); + output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0); if (input_tensor.int_val_size()) { for (int i = 0; i < input_tensor.int_val_size(); i++) { output_int_data[i] = input_tensor.int_val(i); @@ -255,7 +256,7 @@ void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) { } auto& output_int_data = output_array->GetMutableBuffer().data; - output_int_data.resize(input_flat_size); + output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0); if (input_tensor.int64_val_size()) { for (int i = 0; i < input_tensor.int64_val_size(); i++) { output_int_data[i] = input_tensor.int64_val(i); @@ -281,7 +282,7 @@ void ImportStringArray(const TensorProto& input_tensor, Array* output_array) { } auto& output_string_data = output_array->GetMutableBuffer().data; - output_string_data.resize(input_flat_size); + output_string_data.resize(RequiredBufferSizeForShape(output_array->shape())); if (input_flat_size != input_tensor.string_val_size()) { LOG(FATAL) << "Input_content string_val doesn't have the right " "dimensions for this string tensor."; diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h index d1af371fd4c..6fba8f2629f 100644 --- a/tensorflow/contrib/lite/toco/model.h +++ b/tensorflow/contrib/lite/toco/model.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_ #define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_ +#include #include #include #include diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.cc b/tensorflow/contrib/lite/toco/tensorflow_util.cc index 82e2800ca2f..0e7e9c41a06 100644 --- a/tensorflow/contrib/lite/toco/tensorflow_util.cc +++ b/tensorflow/contrib/lite/toco/tensorflow_util.cc @@ -51,7 +51,8 @@ void LogDumpGraphDef(int log_level, const string& message, BEGIN DUMP OF TENSORFLOW GRAPHDEF (%s) There are %d nodes. 
There are %zu different op types: -)MSG", message, tf_graph.node_size(), ops.size()); +)MSG", + message, tf_graph.node_size(), ops.size()); for (const auto& op : ops) { toco::port::AppendF(&dump, " %s\n", op); } @@ -63,7 +64,8 @@ PROTO DUMP BEGIN NODE: name = %s op = %s inputs = [ -)MSG", node.name(), node.op()); +)MSG", + node.name(), node.op()); for (const auto& input : node.input()) { toco::port::AppendF(&dump, " %s\n", input); } diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD index 72c92665644..a2b8145a672 100644 --- a/tensorflow/contrib/lite/toco/tflite/BUILD +++ b/tensorflow/contrib/lite/toco/tflite/BUILD @@ -117,6 +117,7 @@ cc_library( ":types", "//tensorflow/contrib/lite/schema:schema_fbs", "//tensorflow/contrib/lite/toco:model", + "//tensorflow/contrib/lite/toco:tooling_util", "@flatbuffers", ], ) diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc index 391ef87029d..27719599708 100644 --- a/tensorflow/contrib/lite/toco/tflite/export.cc +++ b/tensorflow/contrib/lite/toco/tflite/export.cc @@ -26,6 +26,9 @@ namespace toco { namespace tflite { +using flatbuffers::FlatBufferBuilder; +using flatbuffers::Offset; +using flatbuffers::Vector; using ::tflite::Buffer; using ::tflite::BuiltinOperator; using ::tflite::BuiltinOperator_CUSTOM; @@ -39,9 +42,6 @@ using ::tflite::Operator; using ::tflite::OperatorCode; using ::tflite::SubGraph; using ::tflite::Tensor; -using flatbuffers::FlatBufferBuilder; -using flatbuffers::Offset; -using flatbuffers::Vector; namespace { diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc index bbf201fd288..5b1ab514b23 100644 --- a/tensorflow/contrib/lite/toco/tflite/import.cc +++ b/tensorflow/contrib/lite/toco/tflite/import.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/contrib/lite/schema/schema_generated.h" #include "tensorflow/contrib/lite/toco/tflite/operator.h" #include "tensorflow/contrib/lite/toco/tflite/types.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" namespace toco { @@ -119,8 +120,16 @@ void ImportOperators( auto inputs = input_op->inputs(); for (int i = 0; i < inputs->Length(); i++) { auto input_index = inputs->Get(i); - const string& input_name = tensors_table.at(input_index); - op->inputs.push_back(input_name); + // input_index == -1 indicates optional tensor. 
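In the TFLite flatbuffer an operator input index of -1 marks an omitted optional tensor; the old importer indexed `tensors_table` with it directly and would throw. The branch that follows instead creates a placeholder optional array under an unused name. A simplified, self-contained sketch of that pattern (ResolveInputs and its name probing are illustrative stand-ins rather than toco's actual API, though the suffix probing mirrors the AvailableArrayName helper amended later in this patch):

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Maps flatbuffer tensor indices to array names, treating -1 as "optional
// input omitted" and materialising a fresh placeholder name for it.
std::vector<std::string> ResolveInputs(const std::vector<int>& indices,
                                       const std::vector<std::string>& tensors,
                                       std::set<std::string>* known_arrays) {
  std::vector<std::string> names;
  for (int index : indices) {
    if (index != -1) {
      names.push_back(tensors.at(index));
      continue;
    }
    // Probe numeric suffixes until the name is unused.
    std::string name = "OptionalTensor";
    for (int suffix = 0; known_arrays->count(name); ++suffix) {
      name = "OptionalTensor_" + std::to_string(suffix);
    }
    known_arrays->insert(name);
    names.push_back(name);
  }
  return names;
}

int main() {
  std::set<std::string> known = {"weights", "OptionalTensor"};
  for (const auto& name : ResolveInputs({0, -1, -1}, {"weights"}, &known)) {
    std::cout << name << "\n";  // weights, OptionalTensor_0, OptionalTensor_1
  }
  return 0;
}
```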
+ if (input_index != -1) { + const string& input_name = tensors_table.at(input_index); + op->inputs.push_back(input_name); + } else { + const string& tensor_name = + toco::AvailableArrayName(*model, "OptionalTensor"); + model->CreateOptionalArray(tensor_name); + op->inputs.push_back(tensor_name); + } } auto outputs = input_op->outputs(); for (int i = 0; i < outputs->Length(); i++) { diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc index 298f49025f9..461494fd99e 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator.cc @@ -140,25 +140,11 @@ class SpaceToBatchND flatbuffers::Offset WriteOptions( const TocoOperator& op, flatbuffers::FlatBufferBuilder* builder) const override { - auto block_shape = builder->CreateVector(op.block_shape); - auto before_paddings = builder->CreateVector(op.before_paddings); - auto after_paddings = builder->CreateVector(op.after_paddings); - return ::tflite::CreateSpaceToBatchNDOptions( - *builder, block_shape, before_paddings, after_paddings); + return ::tflite::CreateSpaceToBatchNDOptions(*builder); } void ReadOptions(const TfLiteOptions& options, - TocoOperator* op) const override { - op->block_shape.insert(op->block_shape.end(), - options.block_shape()->begin(), - options.block_shape()->end()); - op->before_paddings.insert(op->before_paddings.end(), - options.before_paddings()->begin(), - options.before_paddings()->end()); - op->after_paddings.insert(op->after_paddings.end(), - options.after_paddings()->begin(), - options.after_paddings()->end()); - } + TocoOperator* op) const override {} }; class Sub : public BuiltinOperator WriteOptions( const TocoOperator& op, flatbuffers::FlatBufferBuilder* builder) const override { - auto block_shape = builder->CreateVector(op.block_shape); - auto before_crops = builder->CreateVector(op.before_crops); - auto after_crops = builder->CreateVector(op.after_crops); - return ::tflite::CreateBatchToSpaceNDOptions(*builder, block_shape, - before_crops, after_crops); + return ::tflite::CreateBatchToSpaceNDOptions(*builder); } void ReadOptions(const TfLiteOptions& options, - TocoOperator* op) const override { - op->block_shape.insert(op->block_shape.end(), - options.block_shape()->begin(), - options.block_shape()->end()); - op->before_crops.insert(op->before_crops.end(), - options.before_crops()->begin(), - options.before_crops()->end()); - op->after_crops.insert(op->after_crops.end(), - options.after_crops()->begin(), - options.after_crops()->end()); - } + TocoOperator* op) const override {} }; class Cast : public CustomOperator { @@ -478,8 +450,7 @@ class Pad : public BuiltinOperator WriteOptions( const TocoOperator& op, flatbuffers::FlatBufferBuilder* builder) const override { - return ::tflite::CreateTransposeOptions(*builder, - builder->CreateVector(op.perm)); + return ::tflite::CreateTransposeOptions(*builder); } void ReadOptions(const TfLiteOptions& options, - TocoOperator* op) const override { - op->perm.insert(op->perm.end(), options.perm()->begin(), - options.perm()->end()); - } + TocoOperator* op) const override {} }; class Mean : public BuiltinOperator WriteOptions( const TocoOperator& op, flatbuffers::FlatBufferBuilder* builder) const override { - auto axis = builder->CreateVector(op.axis); - return ::tflite::CreateMeanOptions(*builder, axis, op.keep_dims); + return ::tflite::CreateMeanOptions(*builder, op.keep_dims); } void ReadOptions(const TfLiteOptions& options, TocoOperator* op) const override { - 
op->axis.insert(op->axis.end(), options.axis()->begin(), - options.axis()->end()); op->keep_dims = options.keep_dims(); } }; diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc index 9036a16d1c9..6daa296282e 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc @@ -119,40 +119,12 @@ TEST_F(OperatorTest, BuiltinAdd) { output_toco_op->fused_activation_function); } -TEST_F(OperatorTest, BuiltinSpaceToBatchND) { - SpaceToBatchNDOperator op; - op.block_shape = {2, 2}; - op.before_paddings = {1, 2}; - op.after_paddings = {3, 4}; - - auto output_toco_op = SerializeAndDeserialize( - GetOperator("SPACE_TO_BATCH_ND", OperatorType::kSpaceToBatchND), op); - EXPECT_EQ(op.block_shape, output_toco_op->block_shape); - EXPECT_EQ(op.before_paddings, output_toco_op->before_paddings); - EXPECT_EQ(op.after_paddings, output_toco_op->after_paddings); -} - -TEST_F(OperatorTest, BuiltinBatchToSpaceND) { - BatchToSpaceNDOperator op; - op.block_shape = {2, 2}; - op.before_crops = {1, 2}; - op.after_crops = {3, 4}; - - auto output_toco_op = SerializeAndDeserialize( - GetOperator("BATCH_TO_SPACE_ND", OperatorType::kBatchToSpaceND), op); - EXPECT_EQ(op.block_shape, output_toco_op->block_shape); - EXPECT_EQ(op.before_crops, output_toco_op->before_crops); - EXPECT_EQ(op.after_crops, output_toco_op->after_crops); -} - TEST_F(OperatorTest, BuiltinMean) { MeanOperator op; - op.axis = {1, 2}; op.keep_dims = false; auto output_toco_op = SerializeAndDeserialize(GetOperator("MEAN", OperatorType::kMean), op); - EXPECT_EQ(op.axis, output_toco_op->axis); EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims); } @@ -370,15 +342,6 @@ TEST_F(OperatorTest, Svdf) { EXPECT_EQ(op.rank, output_toco_op->rank); } -TEST_F(OperatorTest, Transpose) { - TransposeOperator op; - op.perm = {0, 1, 2, 3}; - - auto output_toco_op = SerializeAndDeserialize( - GetOperator("TRANSPOSE", OperatorType::kTranspose), op); - EXPECT_EQ(op.perm, output_toco_op->perm); -} - TEST_F(OperatorTest, Squeeze) { SqueezeOperator op; op.squeeze_dims = {-2, -3, 4, 1, 4}; diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc index f8281f3a572..c5a62fdb620 100644 --- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc +++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc @@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags( "For Protobuf formats, the binary format will be used."), Flag("input_format", parsed_flags.input_format.bind(), parsed_flags.input_format.default_value(), - "Input file format. One of: tensorflow_graphdef, "), + "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."), Flag("output_format", parsed_flags.output_format.bind(), - parsed_flags.output_format.default_value(), "Output file format."), + parsed_flags.output_format.default_value(), + "Output file format. " + "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."), Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(), parsed_flags.default_ranges_min.default_value(), "If defined, will be used as the default value for the min bound " @@ -58,11 +60,13 @@ bool ParseTocoFlagsFromCommandLineFlags( Flag("inference_type", parsed_flags.inference_type.bind(), parsed_flags.inference_type.default_value(), "Target data type of arrays in the output file (for input_arrays, " - "this may be overridden by inference_input_type)."), + "this may be overridden by inference_input_type). 
" + "One of FLOAT, QUANTIZED_UINT8."), Flag("inference_input_type", parsed_flags.inference_input_type.bind(), parsed_flags.inference_input_type.default_value(), - "Target data type of input arrays. If not specified, inference_type " - "is used."), + "Target data type of input arrays. " + "If not specified, inference_type is used. " + "One of FLOAT, QUANTIZED_UINT8."), Flag("input_type", parsed_flags.input_type.bind(), parsed_flags.input_type.default_value(), "Deprecated ambiguous flag that set both --input_data_types and " @@ -76,35 +80,31 @@ bool ParseTocoFlagsFromCommandLineFlags( Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(), parsed_flags.drop_fake_quant.default_value(), - "Ignore and discard FakeQuant nodes. For instance, that can be used " - "to " + "Ignore and discard FakeQuant nodes. For instance, to " "generate plain float code without fake-quantization from a " - "quantized " - "graph."), + "quantized graph."), Flag( "reorder_across_fake_quant", parsed_flags.reorder_across_fake_quant.bind(), parsed_flags.reorder_across_fake_quant.default_value(), "Normally, FakeQuant nodes must be strict boundaries for graph " "transformations, in order to ensure that quantized inference has " - "the " - "exact same arithmetic behavior as quantized training --- which is " - "the " - "whole point of quantized training and of FakeQuant nodes in the " - "first " - "place. However, that entails subtle requirements on where exactly " + "the exact same arithmetic behavior as quantized training --- which " + "is the whole point of quantized training and of FakeQuant nodes in " + "the first place. " + "However, that entails subtle requirements on where exactly " "FakeQuant nodes must be placed in the graph. Some quantized graphs " "have FakeQuant nodes at unexpected locations, that prevent graph " "transformations that are necessary in order to generate inference " "code for these graphs. 
Such graphs should be fixed, but as a "
          "temporary work-around, setting this reorder_across_fake_quant flag "
-         "allows toco to perform necessary graph transformaitons on them, "
+         "allows TOCO to perform necessary graph transformations on them, "
          "at the cost of no longer faithfully matching inference and training "
          "arithmetic."),
      Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
           parsed_flags.allow_custom_ops.default_value(),
-          "If true, allow TOCO to create TF Lite Custom operators for all the"
-          "unsupported Tensorflow ops."),
+          "If true, allow TOCO to create TF Lite Custom operators for all the "
+          "unsupported TensorFlow ops."),
      Flag(
          "drop_control_dependency",
          parsed_flags.drop_control_dependency.bind(),
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 727df1cc76a..b715881774b 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -68,6 +68,7 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
+  transformations->Add(new ReorderActivationFunctions);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 187c426a5b3..d0bf66c542b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -33,6 +33,24 @@ limitations under the License.
 namespace toco {
+// Find the longest common prefix of two strings.
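The FindLongestCommonPrefix helper deleted from identify_lstm.cc earlier in this patch reappears here in tooling_util, with a matching declaration added to tooling_util.h below, so other code can reuse it; its loop bound also becomes an unsigned size_t. The definition continues immediately after this note. As a quick standalone restatement of its contract on sample inputs:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <string>

// Same contract as the helper below: the result is the run of leading
// characters shared by both strings (empty if either input is empty).
std::string LongestCommonPrefix(const std::string& a, const std::string& b) {
  std::size_t count = 0;
  const std::size_t limit = std::min(a.size(), b.size());
  while (count < limit && a[count] == b[count]) ++count;
  return a.substr(0, count);
}

int main() {
  assert(LongestCommonPrefix("lstm_1/kernel", "lstm_1/bias") == "lstm_1/");
  assert(LongestCommonPrefix("conv", "pool").empty());
  assert(LongestCommonPrefix("", "anything").empty());
  return 0;
}
```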
+absl::string_view FindLongestCommonPrefix(absl::string_view a, + absl::string_view b) { + if (a.empty() || b.empty()) return absl::string_view(); + + const char* pa = a.data(); + const char* pb = b.data(); + size_t count = 0; + const size_t limit = std::min(a.size(), b.size()); + while (count < limit && *pa == *pb) { + ++pa; + ++pb; + ++count; + } + + return absl::string_view(a.data(), count); +} + string LogName(const Operator& op) { const string& opname = HelpfulOperatorTypeName(op); if (op.outputs.empty()) { @@ -286,6 +304,19 @@ string HelpfulOperatorTypeName(const Operator& op) { return OperatorTypeName(op.type); } +bool OperatorSupportsFusedActivation(OperatorType type) { + switch (type) { + case OperatorType::kConcatenation: + case OperatorType::kSlice: + case OperatorType::kSqueeze: + case OperatorType::kTensorFlowReshape: + case OperatorType::kTensorFlowSplit: + return false; + default: + return true; + } +} + void LogSummary(int log_level, const Model& model) { VLOG(log_level) << "Operators summary (" << model.operators.size() << " operators):"; @@ -1317,13 +1348,14 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) { } string AvailableArrayName(const Model& model, const string& name) { - if (!model.HasArray(name) && !model.optional_arrays.count(name)) { + if (!model.HasArray(name) && !model.IsOptionalArray(name)) { return name; } const int kNumSuffixesToTry = 1000; for (int i = 0; i < kNumSuffixesToTry; i++) { const string& name_with_suffix = toco::port::StringF("%s_%d", name, i); - if (!model.HasArray(name_with_suffix)) { + if (!model.HasArray(name_with_suffix) && + !model.IsOptionalArray(name_with_suffix)) { return name_with_suffix; } } diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h index 2ac51c7e5bb..a7e77a02eb1 100644 --- a/tensorflow/contrib/lite/toco/tooling_util.h +++ b/tensorflow/contrib/lite/toco/tooling_util.h @@ -23,6 +23,7 @@ limitations under the License. 
#include #include +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/logging.h" #if TOCO_SUPPORT_PORTABLE_PROTOS #include "third_party/protobuf/src/google/protobuf/text_format.h" @@ -49,6 +50,8 @@ namespace toco { constexpr int kLogLevelModelChanged = 1; constexpr int kLogLevelModelUnchanged = 2; +absl::string_view FindLongestCommonPrefix(absl::string_view a, + absl::string_view b); string LogName(const Operator& op); bool IsInputArray(const Model& model, const string& name); @@ -79,6 +82,8 @@ std::vector>::iterator FindOp(Model& model, const char* OperatorTypeName(OperatorType type); string HelpfulOperatorTypeName(const Operator& op); +bool OperatorSupportsFusedActivation(OperatorType type); + void DumpGraphvizVideoFrame(const Model& model); void LogDump(int log_level, const string& message, const Model& model); void LogSummary(int log_level, const string& message, const Model& model); diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD index 1bffcfb9873..6786b161845 100644 --- a/tensorflow/contrib/lite/tools/BUILD +++ b/tensorflow/contrib/lite/tools/BUILD @@ -99,8 +99,11 @@ cc_library( srcs = ["verifier.cc"], hdrs = ["verifier.h"], deps = [ + "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:schema_fbs_version", + "//tensorflow/contrib/lite:string_util", "//tensorflow/contrib/lite/schema:schema_fbs", + "@com_google_absl//absl/base:core_headers", ], ) @@ -112,8 +115,10 @@ cc_test( ":verifier", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:schema_fbs_version", + "//tensorflow/contrib/lite:string_util", "//tensorflow/contrib/lite/schema:schema_fbs", "//tensorflow/contrib/lite/testing:util", + "//tensorflow/core:framework_lite", "@com_google_googletest//:gtest", "@flatbuffers", ], diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc index 95a08953798..726e2aaa316 100644 --- a/tensorflow/contrib/lite/tools/verifier.cc +++ b/tensorflow/contrib/lite/tools/verifier.cc @@ -14,13 +14,32 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/tools/verifier.h" +#include #include "tensorflow/contrib/lite/schema/schema_generated.h" +#include "tensorflow/contrib/lite/string_util.h" #include "tensorflow/contrib/lite/version.h" namespace tflite { namespace { +// Reports error message when the reporter is set. +void ReportError(ErrorReporter* error_reporter, const char* format, ...) { + if (error_reporter) { + va_list args; + va_start(args, format); + error_reporter->Report(format, args); + va_end(args); + } +} + +// Returns the int32_t value pointed by ptr. +const uint32_t* GetIntPtr(const char* ptr) { + return reinterpret_cast(ptr); +} + +// Verifies flatbuffer format of the model contents and returns the in-memory +// model. 
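verifier.cc now threads an ErrorReporter through every check so callers can learn why a model was rejected, and each helper stays silent when no reporter is supplied (one of the updated tests below passes a null reporter explicitly). VerifyFlatbufferAndGetModel, introduced by the comment just above, still performs the raw flatbuffer check directly below this note. A generic, self-contained sketch of the optional-reporter pattern (MiniReporter and StderrReporter are invented stand-ins, not TFLite classes; the real ErrorReporter::Report takes a printf-style format and arguments rather than a preformatted string):

```cpp
#include <cstdarg>
#include <cstdio>

// Hypothetical minimal reporter interface standing in for an error callback.
class MiniReporter {
 public:
  virtual ~MiniReporter() = default;
  virtual void Report(const char* message) = 0;
};

class StderrReporter : public MiniReporter {
 public:
  void Report(const char* message) override {
    std::fprintf(stderr, "verifier: %s\n", message);
  }
};

// Formats the message and forwards it only when a reporter was supplied,
// mirroring the null check in the new ReportError helper above.
void ReportError(MiniReporter* reporter, const char* format, ...) {
  if (reporter == nullptr) return;
  char buffer[256];
  va_list args;
  va_start(args, format);
  std::vsnprintf(buffer, sizeof(buffer), format, args);
  va_end(args);
  reporter->Report(buffer);
}

int main() {
  StderrReporter reporter;
  ReportError(&reporter, "Invalid tensor buffer index: %d", 2);
  ReportError(nullptr, "dropped silently");  // no reporter, no output
  return 0;
}
```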
const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) { ::flatbuffers::Verifier verifier(static_cast(buf), len); if (VerifyModelBuffer(verifier)) { @@ -30,14 +49,159 @@ const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) { } } -} // namespace +const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2; -bool Verify(const void* buf, size_t len) { - const Model* model = VerifyFlatbufferAndGetModel(buf, len); - if (model == nullptr) { +// Verifies string tensor has legit buffer contents that follow the schema +// defined in lite/string_util.h +bool VerifyStringTensorBuffer(const Buffer& buffer, + ErrorReporter* error_reporter) { + uint32_t buffer_size = buffer.data()->size(); + const char* buffer_ptr = reinterpret_cast(buffer.data()->data()); + + uint32_t num_strings = *GetIntPtr(buffer_ptr); + if (num_strings > kMaxNumString) { + ReportError(error_reporter, + "String tensor has invalid num of string set: %d", num_strings); + return false; + } + uint32_t header_offsets = + static_cast(num_strings + 2) * sizeof(int32_t); + + if (buffer_size < header_offsets) { + ReportError(error_reporter, + "String tensor buffer requires at least %d bytes, but is " + "allocated with %d bytes", + header_offsets, buffer_size); return false; } - return model->version() == TFLITE_SCHEMA_VERSION; + uint32_t prev_ptr = header_offsets; + uint32_t offset = sizeof(int32_t); + + if (*GetIntPtr(buffer_ptr + offset) != header_offsets) { + ReportError(error_reporter, + "String tensor buffer initial offset must be: %d", + header_offsets); + return false; + } + offset += sizeof(int32_t); + for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) { + int string_offset = *GetIntPtr(buffer_ptr + offset); + if (string_offset < prev_ptr || string_offset > buffer_size) { + ReportError(error_reporter, "String tensor buffer is invalid: index %d", + i); + return false; + } + } + if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) { + ReportError(error_reporter, "String tensor buffer last offset must be %d", + buffer_size); + return false; + } + return true; +} + +// Verifies numeric tensor has legit buffer. +bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer, + ErrorReporter* error_reporter) { + uint64_t bytes_required = 1; + for (int dim : *tensor.shape()) { + bytes_required *= dim; + if (bytes_required > UINT_MAX) { + ReportError(error_reporter, "Tensor dimension overflow"); + return false; + } + } + switch (tensor.type()) { + case TensorType_FLOAT32: + bytes_required *= sizeof(float); + break; + case TensorType_INT32: + bytes_required *= sizeof(int32_t); + break; + case TensorType_UINT8: + bytes_required *= sizeof(uint8_t); + break; + case TensorType_INT64: + bytes_required *= sizeof(int64_t); + break; + case TensorType_FLOAT16: + // FALLTHROUGH_INTENDED; + default: + ReportError(error_reporter, "Invalid tensor type: %d", tensor.type()); + return false; + } + if (bytes_required > UINT_MAX) { + ReportError(error_reporter, "Tensor dimension overflow"); + return false; + } + + if (bytes_required != buffer.data()->size()) { + ReportError( + error_reporter, + "Tensor requires %d bytes, but is allocated with %d bytes buffer", + bytes_required, buffer.data()->size()); + return false; + } + return true; + + // TODO(yichengfan): verify quantized tensors. +} + +// Verifies tensors have valid properties and legit buffer if set. 
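VerifyStringTensorBuffer above checks the string-tensor wire format from lite/string_util.h: a 32-bit string count, then count + 1 offsets whose first entry equals the header size and whose last entry equals the total buffer size, followed by the concatenated string bytes. A small sketch that packs strings into that layout (PackStringTensor is my own name; the byte values shown assume a little-endian host):

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Packs strings into: [num_strings][offset_0 = header size]...[offset_n =
// total size][string bytes...], all counts and offsets as 32-bit integers.
std::vector<uint8_t> PackStringTensor(const std::vector<std::string>& strings) {
  const uint32_t num_strings = strings.size();
  const uint32_t header_size = (num_strings + 2) * sizeof(int32_t);
  std::vector<uint32_t> header = {num_strings, header_size};
  uint32_t end = header_size;
  for (const auto& s : strings) {
    end += s.size();
    header.push_back(end);
  }
  std::vector<uint8_t> buffer(end);
  std::memcpy(buffer.data(), header.data(), header_size);
  uint32_t pos = header_size;
  for (const auto& s : strings) {
    std::memcpy(buffer.data() + pos, s.data(), s.size());
    pos += s.size();
  }
  return buffer;
}

int main() {
  // PackStringTensor({"A", "BC"}) yields 2,0,0,0, 16,0,0,0, 17,0,0,0,
  // 19,0,0,0, 'A','B','C' -- the same bytes as the "data" tensor that
  // TestSimpleModel feeds the verifier later in this patch.
  for (uint8_t byte : PackStringTensor({"A", "BC"})) {
    std::cout << static_cast<int>(byte) << " ";
  }
  std::cout << "\n";
  return 0;
}
```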
+bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) { + if (!model.subgraphs()) { + return true; + } + for (const auto& subgraph : *model.subgraphs()) { + if (!subgraph->tensors()) { + return true; + } + for (const auto& tensor : *subgraph->tensors()) { + if (!tensor->buffer()) { + return true; + } + if (tensor->buffer() >= model.buffers()->size()) { + ReportError(error_reporter, "Invalid tensor buffer index: %d", + tensor->buffer()); + return false; + } + auto* buffer = model.buffers()->Get(tensor->buffer()); + if (!buffer || !buffer->data()) { + ReportError(error_reporter, "Tensor buffer %d not set", + tensor->buffer()); + return false; + } + + if (tensor->type() == TensorType_STRING) { + if (!VerifyStringTensorBuffer(*buffer, error_reporter)) { + return false; + } + } else { + if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) { + return false; + } + } + } + } + return true; +} + +} // namespace + +bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter) { + const Model* model = VerifyFlatbufferAndGetModel(buf, len); + if (model == nullptr) { + ReportError(error_reporter, "Invalid flatbuffer format"); + return false; + } + if (model->version() != TFLITE_SCHEMA_VERSION) { + ReportError(error_reporter, "Invalid model version %d", model->version()); + return false; + } + if (!VerifyTensors(*model, error_reporter)) { + return false; + } + return true; } } // namespace tflite diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h index 03e1f22b7e8..d2bf3c91d54 100644 --- a/tensorflow/contrib/lite/tools/verifier.h +++ b/tensorflow/contrib/lite/tools/verifier.h @@ -18,13 +18,15 @@ limitations under the License. #include +#include "tensorflow/contrib/lite/error_reporter.h" + namespace tflite { // Verifies the integrity of a Tensorflow Lite flatbuffer model file. // Currently, it verifies: // * The file is following a legit flatbuffer schema. // * The model is in supported version. -bool Verify(const void* buf, size_t len); +bool Verify(const void* buf, size_t len, ErrorReporter* error_reporter); } // namespace tflite diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc index 0481a55a78e..87f6854e9e6 100644 --- a/tensorflow/contrib/lite/tools/verifier_test.cc +++ b/tensorflow/contrib/lite/tools/verifier_test.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/contrib/lite/tools/verifier.h" +#include +#include + #include "flatbuffers/flatbuffers.h" #include "flatbuffers/util.h" #include @@ -20,7 +22,9 @@ limitations under the License. #include "tensorflow/contrib/lite/error_reporter.h" #include "tensorflow/contrib/lite/schema/schema_generated.h" #include "tensorflow/contrib/lite/testing/util.h" +#include "tensorflow/contrib/lite/tools/verifier.h" #include "tensorflow/contrib/lite/version.h" +#include "tensorflow/core/framework/numeric_types.h" namespace tflite { @@ -28,31 +32,62 @@ using flatbuffers::FlatBufferBuilder; using flatbuffers::Offset; using flatbuffers::Vector; -// Class that abstracts the list of buffers at the end of the TF Lite structure -class DeferredBufferWriter { +// Build single subgraph model. 
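Several of the new negative tests added below in verifier_test.cc exercise the byte-count arithmetic in VerifyNumericTensorBuffer: the product of the dimensions times the element size must both stay under the 32-bit limit and match the buffer length exactly. A standalone restatement of that arithmetic (RequiredBytes is an illustrative name; the verifier itself compares against UINT_MAX, for which the sketch uses UINT32_MAX):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Multiplies out the shape and element size, rejecting anything that would
// overflow a 32-bit byte count, as the verifier does.
bool RequiredBytes(const std::vector<int>& shape, uint64_t element_size,
                   uint64_t* bytes_required) {
  uint64_t bytes = 1;
  for (int dim : shape) {
    bytes *= dim;
    if (bytes > UINT32_MAX) return false;  // "Tensor dimension overflow"
  }
  bytes *= element_size;
  if (bytes > UINT32_MAX) return false;
  *bytes_required = bytes;
  return true;
}

int main() {
  uint64_t bytes = 0;
  // A {2, 3} UINT8 tensor needs 6 bytes, so the 4-byte buffer in
  // TestIntTensorShapeIsGreaterThanBuffer is rejected as too small.
  assert(RequiredBytes({2, 3}, sizeof(uint8_t), &bytes) && bytes == 6);
  // {1024, 2048, 4096} elements already exceeds the 32-bit guard, matching
  // TestIntTensorShapeOverflow.
  assert(!RequiredBytes({1024, 2048, 4096}, sizeof(uint8_t), &bytes));
  return 0;
}
```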
+class TfLiteFlatbufferModelBuilder { public: - DeferredBufferWriter() { - data_.push_back({}); // sentinel empty buffer. + TfLiteFlatbufferModelBuilder() { + buffers_.push_back( + CreateBuffer(builder_, builder_.CreateVector(std::vector{}))); } - Offset>> BuildBuffers(FlatBufferBuilder *builder) { - std::vector> buffer_vector; - for (const auto &vec : data_) { - auto data_buffer = builder->CreateVector(vec.data(), vec.size()); - buffer_vector.push_back(tflite::CreateBuffer(*builder, data_buffer)); + void AddTensor(const std::vector& shape, tflite::TensorType type, + const std::vector& buffer, const char* name) { + int buffer_index = 0; + if (!buffer.empty()) { + buffer_index = buffers_.size(); + buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer))); } - return builder->CreateVector(buffer_vector); + tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index, + name, /*quantization=*/0)); } - // Registers a buffer index and takes ownership of the data to write to it. - int Record(std::vector data) { - int buffer_index = data_.size(); - data_.emplace_back(std::move(data)); - return buffer_index; + void AddOperator(const std::vector& inputs, + const std::vector& outputs, + tflite::BuiltinOperator builtin_op, const char* custom_op) { + operator_codes_.push_back( + CreateOperatorCodeDirect(builder_, builtin_op, custom_op)); + operators_.push_back(CreateOperator( + builder_, operator_codes_.size() - 1, builder_.CreateVector(inputs), + builder_.CreateVector(outputs), BuiltinOptions_NONE, + /*builtin_options=*/0, + /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS)); + } + + void FinishModel(const std::vector& inputs, + const std::vector& outputs) { + auto subgraph = std::vector>({CreateSubGraph( + builder_, builder_.CreateVector(tensors_), + builder_.CreateVector(inputs), builder_.CreateVector(outputs), + builder_.CreateVector(operators_), + builder_.CreateString("test_subgraph"))}); + auto result = CreateModel( + builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(operator_codes_), + builder_.CreateVector(subgraph), builder_.CreateString("test_model"), + builder_.CreateVector(buffers_)); + tflite::FinishModelBuffer(builder_, result); + } + + bool Verify() { + return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(), + DefaultErrorReporter()); } private: - std::vector> data_; + FlatBufferBuilder builder_; + std::vector> operators_; + std::vector> operator_codes_; + std::vector> tensors_; + std::vector> buffers_; }; TEST(VerifyModel, TestEmptyModel) { @@ -62,43 +97,26 @@ TEST(VerifyModel, TestEmptyModel) { /*description=*/0, /*buffers=*/0); ::tflite::FinishModelBuffer(builder, model); - ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize())); + ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(), + DefaultErrorReporter())); } TEST(VerifyModel, TestSimpleModel) { - FlatBufferBuilder builder; - auto inputs = builder.CreateVector({0}); - auto outputs = builder.CreateVector({1}); - auto operator_codes = builder.CreateVector(std::vector>{ - CreateOperatorCodeDirect(builder, BuiltinOperator_CUSTOM, "test")}); - auto operators = - builder.CreateVector(std::vector>{CreateOperator( - builder, /*opcode_index=*/0, - /*inputs=*/builder.CreateVector({0}), - /*outputs=*/builder.CreateVector({1}), BuiltinOptions_NONE, - /*builtin_options=*/0, - /*custom_options=*/0, ::tflite::CustomOptionsFormat_FLEXBUFFERS)}); - std::vector shape; - auto tensors = builder.CreateVector(std::vector>{ - CreateTensorDirect(builder, &shape, 
TensorType_INT32, /*buffer=*/0, - "input", /*quantization=*/0), - CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/0, - "output", /*quantization=*/0)}); - auto subgraph = std::vector>( - {CreateSubGraph(builder, tensors, inputs, outputs, operators, - builder.CreateString("Main"))}); - - auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, operator_codes, - builder.CreateVector(subgraph), - builder.CreateString("SmartReply"), /*buffers=*/0); - - ::tflite::FinishModelBuffer(builder, model); - ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize())); + TfLiteFlatbufferModelBuilder builder; + builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test"); + builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input"); + builder.AddTensor( + {2}, TensorType_STRING, + {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'}, + "data"); + builder.AddTensor({2, 3}, TensorType_INT32, {}, "output"); + builder.FinishModel({0, 1}, {2}); + ASSERT_TRUE(builder.Verify()); } TEST(VerifyModel, TestCorruptedData) { - string model = "123"; - ASSERT_FALSE(Verify(model.data(), model.size())); + std::string model = "123"; + ASSERT_FALSE(Verify(model.data(), model.size(), /*error_reporter=*/nullptr)); } TEST(VerifyModel, TestUnsupportedVersion) { @@ -106,7 +124,8 @@ TEST(VerifyModel, TestUnsupportedVersion) { auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0, /*subgraphs=*/0, /*description=*/0, /*buffers=*/0); ::tflite::FinishModelBuffer(builder, model); - ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize())); + ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(), + DefaultErrorReporter())); } TEST(VerifyModel, TestRandomModificationIsNotAllowed) { @@ -116,20 +135,105 @@ TEST(VerifyModel, TestRandomModificationIsNotAllowed) { /*subgraphs=*/0, /*description=*/0, /*buffers=*/0); ::tflite::FinishModelBuffer(builder, model); - string model_content(reinterpret_cast(builder.GetBufferPointer()), - builder.GetSize()); + std::string model_content(reinterpret_cast(builder.GetBufferPointer()), + builder.GetSize()); for (int i = 0; i < model_content.size(); i++) { model_content[i] = (model_content[i] + 137) % 255; - EXPECT_FALSE(Verify(model_content.data(), model_content.size())) + EXPECT_FALSE(Verify(model_content.data(), model_content.size(), + DefaultErrorReporter())) << "Fail at position: " << i; } } +TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, TestIntTensorShapeOverflow) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor({1024, 2048, 4096}, TensorType_UINT8, {1, 2, 3, 4}, + "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, TensorBufferIsNotValid) { + FlatBufferBuilder builder; + std::vector shape = {2, 3}; + auto tensors = builder.CreateVector(std::vector>{ + CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/2, + "input", /*quantization=*/0)}); + auto subgraph = std::vector>( + {CreateSubGraph(builder, tensors, /*inputs=*/0, /*outputs=*/0, + /*operators=*/0, builder.CreateString("Main"))}); + + auto 
buffers = builder.CreateVector(std::vector>{ + CreateBuffer(builder, + builder.CreateVector(std::vector{1, 2, 3, 4, 5, 6})), + }); + + auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, /*operator_codes=*/0, + builder.CreateVector(subgraph), + builder.CreateString("SmartReply"), buffers); + + ::tflite::FinishModelBuffer(builder, model); + ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(), + DefaultErrorReporter())); +} + +TEST(VerifyModel, StringTensorHasInvalidNumString) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor( + {2}, TensorType_STRING, + {0x00, 0x00, 0x00, 0x20, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, + "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, StringTensorOffsetTooSmall) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor( + {2}, TensorType_STRING, + {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, StringTensorOffsetOutOfRange) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor( + {2}, TensorType_STRING, + {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + +TEST(VerifyModel, StringTensorIsLargerThanRequired) { + TfLiteFlatbufferModelBuilder builder; + builder.AddTensor( + {2}, TensorType_STRING, + {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B', 'C'}, + "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); +} + // TODO(yichengfan): make up malicious files to test with. } // namespace tflite -int main(int argc, char **argv) { +int main(int argc, char** argv) { ::tflite::LogToStderr(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/tensorflow/contrib/lite/tools/visualize.py b/tensorflow/contrib/lite/tools/visualize.py index d0d78e3afab..f571dd59da0 100644 --- a/tensorflow/contrib/lite/tools/visualize.py +++ b/tensorflow/contrib/lite/tools/visualize.py @@ -198,10 +198,13 @@ class TensorMapper(object): def GenerateGraph(subgraph_idx, g, opcode_mapper): """Produces the HTML required to have a d3 visualization of the dag.""" + def TensorName(idx): - return "t%d"%idx + return "t%d" % idx + def OpName(idx): - return "o%d"%idx + return "o%d" % idx + edges = [] nodes = [] first = {} @@ -210,27 +213,35 @@ def GenerateGraph(subgraph_idx, g, opcode_mapper): for tensor_input_position, tensor_index in enumerate(op["inputs"]): if tensor_index not in first: first[tensor_index] = ( - op_index*pixel_mult, - tensor_input_position*pixel_mult - pixel_mult/2) - edges.append( - {"source": TensorName(tensor_index), "target": OpName(op_index)}) + op_index * pixel_mult, + tensor_input_position * pixel_mult - pixel_mult / 2) + edges.append({ + "source": TensorName(tensor_index), + "target": OpName(op_index) + }) for tensor_index in op["outputs"]: - edges.append( - {"target": TensorName(tensor_index), "source": OpName(op_index)}) - nodes.append({"id": OpName(op_index), - "name": opcode_mapper(op["opcode_index"]), - "group": 2, - "x": pixel_mult, - "y": op_index * pixel_mult}) + edges.append({ + "target": TensorName(tensor_index), + "source": OpName(op_index) + }) + nodes.append({ + "id": OpName(op_index), + "name": opcode_mapper(op["opcode_index"]), + "group": 2, + "x": pixel_mult, + "y": op_index * pixel_mult + }) for tensor_index, tensor in enumerate(g["tensors"]): - initial_y = (first[tensor_index] if tensor_index in 
first - else len(g["operators"])) + initial_y = ( + first[tensor_index] if tensor_index in first else len(g["operators"])) - nodes.append({"id": TensorName(tensor_index), - "name": "%s (%d)" % (tensor["name"], tensor_index), - "group": 1, - "x": 2, - "y": initial_y}) + nodes.append({ + "id": TensorName(tensor_index), + "name": "%s (%d)" % (tensor["name"], tensor_index), + "group": 1, + "x": 2, + "y": initial_y + }) graph_str = json.dumps({"nodes": nodes, "edges": edges}) html = _D3_HTML_TEMPLATE % (graph_str, subgraph_idx) @@ -267,7 +278,7 @@ def GenerateTableHtml(items, keys_to_print, display_index=True): for h, mapper in keys_to_print: val = tensor[h] if h in tensor else None val = val if mapper is None else mapper(val) - html += "%s\n"%val + html += "%s\n" % val html += "\n" html += "\n" @@ -279,18 +290,19 @@ def CreateHtmlFile(tflite_input, html_output): # Convert the model into a JSON flatbuffer using flatc (build if doesn't # exist. - if not os.path.exists(tflite_input): + if not os.path.exists(tflite_input): raise RuntimeError("Invalid filename %r" % tflite_input) if tflite_input.endswith(".tflite") or tflite_input.endswith(".bin"): # Run convert - cmd = (_BINARY + " -t " - "--strict-json --defaults-json -o /tmp {schema} -- {input}".format( - input=tflite_input, schema=_SCHEMA)) + cmd = ( + _BINARY + " -t " + "--strict-json --defaults-json -o /tmp {schema} -- {input}".format( + input=tflite_input, schema=_SCHEMA)) print(cmd) os.system(cmd) - real_output = ("/tmp/"+ os.path.splitext(os.path.split(tflite_input)[-1])[0] - + ".json") + real_output = ("/tmp/" + os.path.splitext( + os.path.split(tflite_input)[-1])[0] + ".json") data = json.load(open(real_output)) elif tflite_input.endswith(".json"): @@ -302,12 +314,13 @@ def CreateHtmlFile(tflite_input, html_output): html += "

TensorFlow Lite Model

" data["filename"] = tflite_input # Avoid special case - toplevel_stuff = [("filename", None), ("version", None), - ("description", None)] + toplevel_stuff = [("filename", None), ("version", None), ("description", + None)] html += "\n" for key, mapping in toplevel_stuff: - if not mapping: mapping = lambda x: x + if not mapping: + mapping = lambda x: x html += "\n" % (key, mapping(data[key])) html += "
%s%s
\n" @@ -320,22 +333,22 @@ def CreateHtmlFile(tflite_input, html_output): html += "
" tensor_mapper = TensorMapper(g) opcode_mapper = OpCodeMapper(data) - op_keys_to_display = [ - ("inputs", tensor_mapper), ("outputs", tensor_mapper), - ("builtin_options", None), ("opcode_index", opcode_mapper)] - tensor_keys_to_display = [ - ("name", None), ("type", None), ("shape", None), ("buffer", None), - ("quantization", None)] + op_keys_to_display = [("inputs", tensor_mapper), ("outputs", tensor_mapper), + ("builtin_options", None), ("opcode_index", + opcode_mapper)] + tensor_keys_to_display = [("name", None), ("type", None), ("shape", None), + ("buffer", None), ("quantization", None)] html += "

Subgraph %d

\n" % subgraph_idx # Inputs and outputs. html += "

Inputs/Outputs

\n" - html += GenerateTableHtml([{"inputs": g["inputs"], - "outputs": g["outputs"]}], - [("inputs", tensor_mapper), - ("outputs", tensor_mapper)], - display_index=False) + html += GenerateTableHtml( + [{ + "inputs": g["inputs"], + "outputs": g["outputs"] + }], [("inputs", tensor_mapper), ("outputs", tensor_mapper)], + display_index=False) # Print the tensors. html += "

Tensors

\n" @@ -357,8 +370,7 @@ def CreateHtmlFile(tflite_input, html_output): # Operator codes html += "

Operator Codes

\n" - html += GenerateTableHtml(data["operator_codes"], - operator_keys_to_display) + html += GenerateTableHtml(data["operator_codes"], operator_keys_to_display) html += "\n" @@ -370,10 +382,10 @@ def main(argv): tflite_input = argv[1] html_output = argv[2] except IndexError: - print ("Usage: %s " % (argv[0])) + print("Usage: %s " % (argv[0])) else: CreateHtmlFile(tflite_input, html_output) + if __name__ == "__main__": main(sys.argv) - diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py index 7c523ad4926..8c3a8afe7a0 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops.py @@ -30,20 +30,13 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.util.deprecation import deprecated from tensorflow.python.util.deprecation import deprecated_args -__all__ = ["absolute_difference", - "add_loss", - "cosine_distance", - "compute_weighted_loss", - "get_losses", - "get_regularization_losses", - "get_total_loss", - "hinge_loss", - "log_loss", - "mean_pairwise_squared_error", - "mean_squared_error", - "sigmoid_cross_entropy", - "softmax_cross_entropy", - "sparse_softmax_cross_entropy"] +__all__ = [ + "absolute_difference", "add_loss", "cosine_distance", + "compute_weighted_loss", "get_losses", "get_regularization_losses", + "get_total_loss", "hinge_loss", "log_loss", "mean_pairwise_squared_error", + "mean_squared_error", "sigmoid_cross_entropy", "softmax_cross_entropy", + "sparse_softmax_cross_entropy" +] def _scale_losses(losses, weights): @@ -66,8 +59,8 @@ def _scale_losses(losses, weights): # First, compute the sum of the losses over all elements: start_index = max(0, weights.get_shape().ndims) reduction_indices = list(range(start_index, losses.get_shape().ndims)) - reduced_losses = math_ops.reduce_sum(losses, - reduction_indices=reduction_indices) + reduced_losses = math_ops.reduce_sum( + losses, reduction_indices=reduction_indices) reduced_losses = math_ops.multiply(reduced_losses, weights) return math_ops.reduce_sum(reduced_losses) @@ -90,9 +83,10 @@ def _safe_div(numerator, denominator, name="value"): """ return array_ops.where( math_ops.greater(denominator, 0), - math_ops.div(numerator, array_ops.where( - math_ops.equal(denominator, 0), - array_ops.ones_like(denominator), denominator)), + math_ops.div(numerator, + array_ops.where( + math_ops.equal(denominator, 0), + array_ops.ones_like(denominator), denominator)), array_ops.zeros_like(numerator), name=name) @@ -176,14 +170,15 @@ def _num_present(losses, weights, per_batch=False): """ # If weights is a scalar, its easy to compute: if weights.get_shape().ndims == 0: - batch_size = array_ops.reshape(array_ops.slice(array_ops.shape(losses), - [0], [1]), []) - num_per_batch = math_ops.div(math_ops.to_float(array_ops.size(losses)), - math_ops.to_float(batch_size)) - num_per_batch = array_ops.where(math_ops.equal(weights, 0), - 0.0, num_per_batch) - num_per_batch = math_ops.multiply(array_ops.ones( - array_ops.reshape(batch_size, [1])), num_per_batch) + batch_size = array_ops.reshape( + array_ops.slice(array_ops.shape(losses), [0], [1]), []) + num_per_batch = math_ops.div( + math_ops.to_float(array_ops.size(losses)), + math_ops.to_float(batch_size)) + num_per_batch = array_ops.where( + math_ops.equal(weights, 0), 0.0, num_per_batch) + num_per_batch = math_ops.multiply( + array_ops.ones(array_ops.reshape(batch_size, [1])), num_per_batch) return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch) 
# First, count the number of nonzero weights: @@ -194,8 +189,8 @@ def _num_present(losses, weights, per_batch=False): reduction_indices=reduction_indices) # Next, determine the number of elements that weights would broadcast to: - broadcast_dims = array_ops.slice(array_ops.shape(losses), - [weights.get_shape().ndims], [-1]) + broadcast_dims = array_ops.slice( + array_ops.shape(losses), [weights.get_shape().ndims], [-1]) num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims)) num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast) @@ -303,8 +298,11 @@ def absolute_difference(predictions, labels=None, weights=1.0, scope=None): @deprecated("2016-12-30", "Use tf.losses.sigmoid_cross_entropy instead. Note that the order " "of the predictions and labels arguments has been changed.") -def sigmoid_cross_entropy( - logits, multi_class_labels, weights=1.0, label_smoothing=0, scope=None): +def sigmoid_cross_entropy(logits, + multi_class_labels, + weights=1.0, + label_smoothing=0, + scope=None): """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits. `weights` acts as a coefficient for the loss. If a scalar is provided, @@ -340,20 +338,22 @@ def sigmoid_cross_entropy( multi_class_labels = math_ops.cast(multi_class_labels, logits.dtype) if label_smoothing > 0: - multi_class_labels = (multi_class_labels * (1 - label_smoothing) + - 0.5 * label_smoothing) + multi_class_labels = ( + multi_class_labels * (1 - label_smoothing) + 0.5 * label_smoothing) - losses = nn.sigmoid_cross_entropy_with_logits(labels=multi_class_labels, - logits=logits, - name="xentropy") + losses = nn.sigmoid_cross_entropy_with_logits( + labels=multi_class_labels, logits=logits, name="xentropy") return compute_weighted_loss(losses, weights, scope=scope) @deprecated("2016-12-30", "Use tf.losses.softmax_cross_entropy instead. Note that the order " "of the logits and labels arguments has been changed.") -def softmax_cross_entropy( - logits, onehot_labels, weights=1.0, label_smoothing=0, scope=None): +def softmax_cross_entropy(logits, + onehot_labels, + weights=1.0, + label_smoothing=0, + scope=None): """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits. `weights` acts as a coefficient for the loss. 
If a scalar is provided, @@ -393,9 +393,8 @@ def softmax_cross_entropy( smooth_negatives = label_smoothing / num_classes onehot_labels = onehot_labels * smooth_positives + smooth_negatives - losses = nn.softmax_cross_entropy_with_logits(labels=onehot_labels, - logits=logits, - name="xentropy") + losses = nn.softmax_cross_entropy_with_logits( + labels=onehot_labels, logits=logits, name="xentropy") return compute_weighted_loss(losses, weights, scope=scope) @@ -429,9 +428,8 @@ def sparse_softmax_cross_entropy(logits, labels, weights=1.0, scope=None): [logits, labels, weights]) as scope: labels = array_ops.reshape(labels, shape=[array_ops.shape(labels)[0]]) - losses = nn.sparse_softmax_cross_entropy_with_logits(labels=labels, - logits=logits, - name="xentropy") + losses = nn.sparse_softmax_cross_entropy_with_logits( + labels=labels, logits=logits, name="xentropy") return compute_weighted_loss(losses, weights, scope=scope) @@ -470,8 +468,7 @@ def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None): predictions = math_ops.to_float(predictions) labels = math_ops.to_float(labels) losses = -math_ops.multiply( - labels, - math_ops.log(predictions + epsilon)) - math_ops.multiply( + labels, math_ops.log(predictions + epsilon)) - math_ops.multiply( (1 - labels), math_ops.log(1 - predictions + epsilon)) return compute_weighted_loss(losses, weights, scope=scope) @@ -490,7 +487,8 @@ def hinge_loss(logits, labels=None, scope=None): scope: The scope for the operations performed in computing the loss. Returns: - An unweighted `Tensor` of same shape as `logits` and `labels` representing the + An unweighted `Tensor` of same shape as `logits` and `labels` representing + the loss values across the batch. Raises: @@ -544,8 +542,10 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None): @deprecated("2016-12-30", "Use tf.losses.mean_pairwise_squared_error instead. Note that the " "order of the predictions and labels arguments has been changed.") -def mean_pairwise_squared_error( - predictions, labels=None, weights=1.0, scope=None): +def mean_pairwise_squared_error(predictions, + labels=None, + weights=1.0, + scope=None): """Adds a pairwise-errors-squared loss to the training procedure. 
Unlike `mean_squared_error`, which is a measure of the differences between @@ -602,31 +602,34 @@ def mean_pairwise_squared_error( reduction_indices = list(range(1, diffs.get_shape().ndims)) sum_squares_diff_per_batch = math_ops.reduce_sum( - math_ops.square(diffs), - reduction_indices=reduction_indices) + math_ops.square(diffs), reduction_indices=reduction_indices) num_present_per_batch = _num_present(diffs, weights, per_batch=True) - term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, - num_present_per_batch) + term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, num_present_per_batch) sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices) - term2 = 2.0 * _safe_div(math_ops.square(sum_diff), - math_ops.square(num_present_per_batch)) + term2 = 2.0 * _safe_div( + math_ops.square(sum_diff), math_ops.square(num_present_per_batch)) loss = _scale_losses(term1 - term2, weights) - mean_loss = array_ops.where(math_ops.reduce_sum(num_present_per_batch) > 0, - loss, - array_ops.zeros_like(loss), - name="value") + mean_loss = array_ops.where( + math_ops.reduce_sum(num_present_per_batch) > 0, + loss, + array_ops.zeros_like(loss), + name="value") add_loss(mean_loss) return mean_loss @deprecated("2016-12-30", "Use tf.losses.cosine_distance instead.") @deprecated_args(None, "dim is deprecated, use axis instead", "dim") -def cosine_distance( - predictions, labels=None, axis=None, weights=1.0, scope=None, dim=None): +def cosine_distance(predictions, + labels=None, + axis=None, + weights=1.0, + scope=None, + dim=None): """Adds a cosine-distance loss to the training procedure. Note that the function assumes that `predictions` and `labels` are already @@ -662,5 +665,8 @@ def cosine_distance( labels = math_ops.to_float(labels) radial_diffs = math_ops.multiply(predictions, labels) - losses = 1 - math_ops.reduce_sum(radial_diffs, reduction_indices=[axis,]) + losses = 1 - math_ops.reduce_sum( + radial_diffs, reduction_indices=[ + axis, + ]) return compute_weighted_loss(losses, weights, scope=scope) diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py index 9d0f95e6f3e..1417772e049 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops @@ -274,6 +275,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase): self.assertAlmostEqual(np.average(weights) * 10.0, loss, 3) +@test_util.with_c_api class SparseSoftmaxCrossEntropyLossTest(test.TestCase): def testNoneWeightRaisesValueError(self): @@ -471,7 +473,11 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase): labels = constant_op.constant([[0, 1], [2, 3]]) weights = constant_op.constant([1.2, 3.4, 5.6, 7.8]) - with self.assertRaises(errors_impl.InvalidArgumentError): + if ops._USE_C_API: + error_type = ValueError + else: + error_type = errors_impl.InvalidArgumentError + with self.assertRaises(error_type): loss_ops.sparse_softmax_cross_entropy( logits, labels, weights=weights).eval() diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index c50f8ceec0a..81327407d44 100644 
--- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -407,7 +407,7 @@ $(MARCH_OPTION) \ -I$(JETPACK)/cuda/extras/CUPTI/include - LIBS += \ + CUDA_LIBS := \ -ltfcuda \ -lcudart_static \ -lcudnn \ @@ -420,10 +420,10 @@ $(MARCH_OPTION) \ -lculibos \ -lcurand_static - OBJDIR := $(OBJDIR)Tegra/ - LIBDIR := $(LIBDIR)Tegra/ - BINDIR := $(BINDIR)Tegra/ - DEPDIR := $(DEPDIR)Tegra/ + OBJDIR := $(OBJDIR)android_arm64-v8a/ + LIBDIR := $(LIBDIR)android_arm64-v8a/ + BINDIR := $(BINDIR)android_arm64-v8a/ + DEPDIR := $(DEPDIR)android_arm64-v8a/ TEGRA_LIBS := \ -L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib \ @@ -606,7 +606,8 @@ $(wildcard tensorflow/core/util/*/*.cc) \ tensorflow/core/util/version_info.cc # Remove duplicates (for version_info.cc) CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS)) -CORE_CC_EXCLUDE_SRCS := \ + +CORE_CC_EXCLUDE_SRCS_NON_GPU := \ $(wildcard tensorflow/core/*/*test.cc) \ $(wildcard tensorflow/core/*/*testutil*) \ $(wildcard tensorflow/core/*/*testlib*) \ @@ -626,49 +627,31 @@ $(wildcard tensorflow/core/lib/jpeg/*) \ $(wildcard tensorflow/core/lib/png/*) \ $(wildcard tensorflow/core/util/events_writer.*) \ $(wildcard tensorflow/core/util/reporter.*) \ -$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \ -$(wildcard tensorflow/core/platform/default/stream_executor.*) \ $(wildcard tensorflow/core/platform/default/test_benchmark.*) \ -$(wildcard tensorflow/core/platform/cuda.h) \ -$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \ $(wildcard tensorflow/core/platform/cloud/*) \ $(wildcard tensorflow/core/platform/google/*) \ $(wildcard tensorflow/core/platform/google/*/*) \ $(wildcard tensorflow/core/platform/jpeg.*) \ $(wildcard tensorflow/core/platform/png.*) \ $(wildcard tensorflow/core/platform/s3/*) \ -$(wildcard tensorflow/core/platform/stream_executor.*) \ $(wildcard tensorflow/core/platform/windows/*) \ -$(wildcard tensorflow/core/user_ops/*.cu.cc) \ -$(wildcard tensorflow/core/common_runtime/gpu/*) \ -$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*) \ $(wildcard tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.*) \ $(wildcard tensorflow/core/grappler/inputs/file_input_yielder.*) \ -$(wildcard tensorflow/core/grappler/clusters/single_machine.*) +$(wildcard tensorflow/core/grappler/clusters/single_machine.*) \ +tensorflow/core/util/cuda_kernel_helper_test.cu.cc + +CORE_CC_EXCLUDE_SRCS := \ +$(CORE_CC_EXCLUDE_SRCS_NON_GPU) \ +$(wildcard tensorflow/core/platform/stream_executor.*) \ +$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \ +$(wildcard tensorflow/core/platform/cuda.h) \ +$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \ +$(wildcard tensorflow/core/user_ops/*.cu.cc) \ +$(wildcard tensorflow/core/common_runtime/gpu/*) \ +$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*) ifeq ($(BUILD_FOR_TEGRA),1) -CORE_CC_ALL_SRCS := \ -$(wildcard tensorflow/core/*.cc) \ -$(wildcard tensorflow/core/common_runtime/*.cc) \ -$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \ -$(wildcard tensorflow/core/framework/*.cc) \ -$(wildcard tensorflow/core/graph/*.cc) \ -$(wildcard tensorflow/core/platform/*.cc) \ -$(wildcard tensorflow/core/platform/*/*.cc) \ -$(wildcard tensorflow/core/platform/*/*/*.cc) \ -$(wildcard tensorflow/core/util/*.cc) \ -$(wildcard tensorflow/core/util/*/*.cc) \ -$(wildcard tensorflow/cc/training/*.cc) \ -$(wildcard tensorflow/stream_executor/*.cc) \ -$(wildcard tensorflow/stream_executor/*/*.cc) \ -$(wildcard 
tensorflow/core/grappler/optimizers/*.cc) \ -$(wildcard tensorflow/core/grappler/*.cc) \ -$(wildcard tensorflow/core/grappler/costs/*.cc) \ -$(wildcard tensorflow/core/grappler/clusters/*.cc) \ -$(wildcard tensorflow/core/grappler/utils/*.cc) \ -$(wildcard tensorflow/core/lib/core/*.cc) \ -$(wildcard tensorflow/core/lib/*/*.cc) \ -tensorflow/core/grappler/inputs/utils.cc \ +CORE_CC_ALL_SRCS := $(CORE_CC_ALL_SRCS) \ tensorflow/core/kernels/concat_lib_gpu.cc \ tensorflow/core/kernels/cuda_solvers.cc \ tensorflow/core/kernels/cudnn_pooling_gpu.cc \ @@ -677,28 +660,14 @@ tensorflow/core/kernels/fractional_avg_pool_op.cc \ tensorflow/core/kernels/fractional_max_pool_op.cc \ tensorflow/core/kernels/fractional_pool_common.cc \ tensorflow/core/kernels/pooling_ops_3d.cc \ -tensorflow/core/kernels/sparse_fill_empty_rows_op.cc +tensorflow/core/kernels/sparse_fill_empty_rows_op.cc \ +tensorflow/core/kernels/list_kernels.cc \ +$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \ +$(wildcard tensorflow/stream_executor/*.cc) \ +$(wildcard tensorflow/stream_executor/*/*.cc) CORE_CC_EXCLUDE_SRCS := \ -$(wildcard tensorflow/core/*/*test.cc) \ -$(wildcard tensorflow/core/*/*testutil*) \ -$(wildcard tensorflow/core/*/*testlib*) \ -$(wildcard tensorflow/core/*/*/*test.cc) \ -$(wildcard tensorflow/core/*/*/*testutil*) \ -$(wildcard tensorflow/core/framework/op_gen_lib.cc) \ -$(wildcard tensorflow/core/lib/gif/*) \ -$(wildcard tensorflow/core/lib/jpeg/*) \ -$(wildcard tensorflow/core/lib/png/*) \ -$(wildcard tensorflow/core/lib/db/*) \ -$(wildcard tensorflow/core/platform/jpeg.*) \ -$(wildcard tensorflow/core/platform/png.*) \ -$(wildcard tensorflow/core/platform/cloud/*) \ -$(wildcard tensorflow/core/platform/s3/*) \ -$(wildcard tensorflow/core/platform/windows/*) \ -$(wildcard tensorflow/core/*/*/*testlib*) \ -$(wildcard tensorflow/cc/training/*test.cc) \ -tensorflow/core/lib/io/record_reader.cc \ -tensorflow/core/util/cuda_kernel_helper_test.cu.cc +$(CORE_CC_EXCLUDE_SRCS_NON_GPU) CUDA_CC_SRCS := $(wildcard tensorflow/core/kernels/*.cu.cc) CUDA_CC_OBJS := $(addprefix $(OBJDIR), $(CUDA_CC_SRCS:.cc=.o)) @@ -760,7 +729,7 @@ $(BENCHMARK_NAME): $(BENCHMARK_OBJS) $(LIB_PATH) $(CUDA_LIB_DEPS) @mkdir -p $(dir $@) $(CXX) $(CXXFLAGS) $(INCLUDES) \ -o $(BENCHMARK_NAME) $(BENCHMARK_OBJS) \ - $(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS) + $(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS) $(CUDA_LIBS) # NVCC compilation rules for Tegra ifeq ($(BUILD_FOR_TEGRA),1) diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh index 281c4653c62..f67c5161861 100755 --- a/tensorflow/contrib/makefile/build_all_android.sh +++ b/tensorflow/contrib/makefile/build_all_android.sh @@ -37,7 +37,7 @@ fi ARCH=armeabi-v7a -while getopts "Es:t:Tx:a" opt_name; do +while getopts "Es:t:Tx:a:" opt_name; do case "$opt_name" in E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";; s) SUB_MAKEFILES="${OPTARG}";; diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh index a18df256f97..2d997918397 100755 --- a/tensorflow/contrib/makefile/build_all_ios.sh +++ b/tensorflow/contrib/makefile/build_all_ios.sh @@ -96,7 +96,7 @@ if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then if [[ -z "${BUILD_ARCH}" ]]; then # Compile protobuf for the target iOS device architectures. 
- tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${DEFAULT_ARCH} + tensorflow/contrib/makefile/compile_ios_protobuf.sh else # Compile protobuf for the target iOS device architectures. tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${BUILD_ARCH} diff --git a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in index d9277ed60cb..3081084ee76 100644 --- a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in +++ b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in @@ -54,7 +54,7 @@ $(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS) $(CUDA_LIB_DEPS) -o $@ $(INFERENCE_OBJS) $(LIB_OBJS) $(TEGRA_LIBS) \ $(LIBFLAGS) $(LDFLAGS) \ -shared -Wl,-soname,$(INFERENCE_SO_NAME) \ - $(LIBS) + $(LIBS) $(CUDA_LIBS) $(INFERENCE_SO_NAME): $(INFERENCE_SO_PATH) diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index 5f275663986..5a812af4e95 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -91,6 +91,7 @@ tensorflow/core/kernels/reduction_ops_max.cc tensorflow/core/kernels/reduction_ops_common.cc tensorflow/core/kernels/reduction_ops_any.cc tensorflow/core/kernels/reduction_ops_all.cc +tensorflow/core/kernels/roll_op.cc tensorflow/core/kernels/queue_ops.cc tensorflow/core/kernels/queue_base.cc tensorflow/core/kernels/pooling_ops_common.cc @@ -270,6 +271,7 @@ tensorflow/core/ops/parsing_ops.cc tensorflow/core/ops/no_op.cc tensorflow/core/ops/nn_ops.cc tensorflow/core/ops/nn_grad.cc +tensorflow/core/ops/manip_ops.cc tensorflow/core/ops/math_ops.cc tensorflow/core/ops/math_grad.cc tensorflow/core/ops/logging_ops.cc @@ -291,3 +293,4 @@ tensorflow/core/kernels/batchtospace_op.cc tensorflow/core/kernels/warn_about_ints.cc tensorflow/core/kernels/segment_reduction_ops.cc tensorflow/core/kernels/batch_util.cc +tensorflow/core/ops/audio_ops.cc diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc index 39c0d5af45b..974fb537499 100644 --- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc +++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc @@ -80,9 +80,9 @@ REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_GPU).HostMemory("out"), BytesLimitOp); #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"), - BytesLimitOp); -#endif // TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"), BytesLimitOp); +#endif // TENSORFLOW_USE_SYCL // Op that measures the peak memory in bytes. 
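The BytesLimit and MaxBytesInUse kernel registrations being reflowed above are exposed through Python wrappers in the same contrib package; a hedged usage sketch, assuming tensorflow.contrib.memory_stats exports these ops and a GPU device is present:

import tensorflow as tf
from tensorflow.contrib.memory_stats import BytesLimit, MaxBytesInUse

with tf.device('/device:GPU:0'):
    limit = BytesLimit()      # allocator byte limit for this device
    peak = MaxBytesInUse()    # peak bytes allocated so far on this device

with tf.Session() as sess:
    limit_val, peak_val = sess.run([limit, peak])
    print('limit: %d bytes, peak: %d bytes' % (limit_val, peak_val))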
class MaxBytesInUseOp : public MemoryStatsOp { @@ -107,6 +107,6 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("MaxBytesInUse").Device(DEVICE_SYCL).HostMemory("out"), MaxBytesInUseOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py index 2932ae1c8df..ff88b4fa841 100644 --- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py +++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py @@ -171,7 +171,14 @@ def _clean_save_and_restore(graph_def, op, removed_op_names): shape_op_value_tensor.tensor_shape.dim[0].size = len(shapes) op.attr['dtypes'].list.type[:] = dtypes + if not name_op.attr['_output_shapes'].list.shape: + name_op.attr['_output_shapes'].list.shape.add() + name_op.attr['_output_shapes'].list.shape[0].dim.add() name_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(names) + + if not shape_op.attr['_output_shapes'].list.shape: + shape_op.attr['_output_shapes'].list.shape.add() + shape_op.attr['_output_shapes'].list.shape[0].dim.add() shape_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(shapes) diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 55946c128b1..c2340d03776 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -739,7 +739,7 @@ def _streaming_confusion_matrix_at_thresholds(predictions, else: for include in includes: if include not in all_includes: - raise ValueError('Invaild key: %s.' % include) + raise ValueError('Invalid key: %s.' % include) predictions, labels, weights = metrics_impl._remove_squeezable_dimensions( # pylint: disable=protected-access predictions, labels, weights) diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py index d07fece4bc6..6a3b535eb44 100644 --- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py +++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py @@ -58,6 +58,7 @@ def read_cifar10(filename_queue): class CIFAR10Record(object): pass + result = CIFAR10Record() # Dimensions of the images in the CIFAR-10 dataset. @@ -147,8 +148,9 @@ def distorted_inputs(data_dir, batch_size): images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size. labels: Labels. 1D tensor of [batch_size] size. """ - filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) - for i in xrange(1, 6)] + filenames = [ + os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6) + ] for f in filenames: if not tf.gfile.Exists(f): raise ValueError('Failed to find file: ' + f) @@ -174,10 +176,9 @@ def distorted_inputs(data_dir, batch_size): # Because these operations are not commutative, consider randomizing # the order their operation. - distorted_image = tf.image.random_brightness(distorted_image, - max_delta=63) - distorted_image = tf.image.random_contrast(distorted_image, - lower=0.2, upper=1.8) + distorted_image = tf.image.random_brightness(distorted_image, max_delta=63) + distorted_image = tf.image.random_contrast( + distorted_image, lower=0.2, upper=1.8) # Subtract off the mean and divide by the variance of the pixels. 
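The cifar10_input.py reflow above does not change the training-time distortion pipeline; condensed into one sketch for a single image (standard tf.image ops, mirroring the example's preprocessing):

import tensorflow as tf

def distort_for_training(image, height=24, width=24):
    # Random crop, horizontal flip, and photometric jitter, followed by
    # per-image standardization, as in the CIFAR-10 example.
    distorted = tf.random_crop(image, [height, width, 3])
    distorted = tf.image.random_flip_left_right(distorted)
    distorted = tf.image.random_brightness(distorted, max_delta=63)
    distorted = tf.image.random_contrast(distorted, lower=0.2, upper=1.8)
    return tf.image.per_image_standardization(distorted)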
float_image = tf.image.per_image_standardization(distorted_image) @@ -188,15 +189,18 @@ def distorted_inputs(data_dir, batch_size): # Ensure that the random shuffling has good mixing properties. min_fraction_of_examples_in_queue = 0.4 - min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * - min_fraction_of_examples_in_queue) - print ('Filling queue with %d CIFAR images before starting to train. ' - 'This will take a few minutes.' % min_queue_examples) + min_queue_examples = int( + NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * min_fraction_of_examples_in_queue) + print('Filling queue with %d CIFAR images before starting to train. ' + 'This will take a few minutes.' % min_queue_examples) # Generate a batch of images and labels by building up a queue of examples. - return _generate_image_and_label_batch(float_image, read_input.label, - min_queue_examples, batch_size, - shuffle=True) + return _generate_image_and_label_batch( + float_image, + read_input.label, + min_queue_examples, + batch_size, + shuffle=True) def inputs(eval_data, data_dir, batch_size): @@ -212,8 +216,9 @@ def inputs(eval_data, data_dir, batch_size): labels: Labels. 1D tensor of [batch_size] size. """ if not eval_data: - filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) - for i in xrange(1, 6)] + filenames = [ + os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6) + ] num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN else: filenames = [os.path.join(data_dir, 'test_batch.bin')] @@ -235,8 +240,8 @@ def inputs(eval_data, data_dir, batch_size): # Image processing for evaluation. # Crop the central [height, width] of the image. - resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image, - width, height) + resized_image = tf.image.resize_image_with_crop_or_pad( + reshaped_image, width, height) # Subtract off the mean and divide by the variance of the pixels. float_image = tf.image.per_image_standardization(resized_image) @@ -247,10 +252,13 @@ def inputs(eval_data, data_dir, batch_size): # Ensure that the random shuffling has good mixing properties. min_fraction_of_examples_in_queue = 0.4 - min_queue_examples = int(num_examples_per_epoch * - min_fraction_of_examples_in_queue) + min_queue_examples = int( + num_examples_per_epoch * min_fraction_of_examples_in_queue) # Generate a batch of images and labels by building up a queue of examples. - return _generate_image_and_label_batch(float_image, read_input.label, - min_queue_examples, batch_size, - shuffle=False) + return _generate_image_and_label_batch( + float_image, + read_input.label, + min_queue_examples, + batch_size, + shuffle=False) diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py index 73dd56398c4..660f0168b10 100644 --- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py +++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py @@ -48,7 +48,7 @@ from tensorflow.contrib.model_pruning.python import pruning # Global constants describing the CIFAR-10 data set. 
IMAGE_SIZE = cifar10_input.IMAGE_SIZE NUM_CLASSES = cifar10_input.NUM_CLASSES -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN +NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN # pylint: disable=line-too-long NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL BATCH_SIZE = 128 DATA_DIR = '/tmp/cifar10_data' diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc index 8d14a3ef040..6a7f5efecdb 100644 --- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc +++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc @@ -24,11 +24,11 @@ limitations under the License. #include #include -#include "tensorflow/core/distributed_runtime/tensor_coding.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/gpu/gpu_util.h" #include "tensorflow/core/distributed_runtime/session_mgr.h" +#include "tensorflow/core/distributed_runtime/tensor_coding.h" namespace tensorflow { @@ -62,7 +62,6 @@ BaseRemoteRendezvous* MPIRendezvousMgr::Create(int64 step_id, void MPIRemoteRendezvous::RecvFromRemoteAsync( const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args, DoneCallback done) { - Status s = Status::OK(); MPIRequestTensorCall* rendezvous_call = new MPIRequestTensorCall(); @@ -103,37 +102,37 @@ void MPIRemoteRendezvous::RecvFromRemoteAsync( // Create the function which is called when the Tensor is send by remote const int64 temp1 = step_id_; rendezvous_call->recv_call_ = - [this, parsed, recv_args, done, dst, temp1, rendezvous_call]( - MPIRecvTensorResponse mpi_response) { - Status s; - Device* dst_device; - if (s.ok()) { - s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device); - CHECK(s.ok()) << "Device lookup failed"; - } + [this, parsed, recv_args, done, dst, temp1, + rendezvous_call](MPIRecvTensorResponse mpi_response) { + Status s; + Device* dst_device; + if (s.ok()) { + s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device); + CHECK(s.ok()) << "Device lookup failed"; + } - VLOG(3) << "MPI Received tensor " << parsed.FullKey() - << " @ step: " << temp1 - << " single-send: " << mpi_response.singlesend(); + VLOG(3) << "MPI Received tensor " << parsed.FullKey() + << " @ step: " << temp1 + << " single-send: " << mpi_response.singlesend(); - Tensor val; - if (mpi_response.singlesend()) { - dst_device->MakeTensorFromProto(mpi_response.response().tensor(), - recv_args.alloc_attrs, &val); - } else { - TensorResponse tr; - tr.InitAlloc(dst_device, recv_args.alloc_attrs); - tr.InitPartial(mpi_response.response()); - const size_t nBytes = tr.tensor().TotalBytes(); - void* data = const_cast(DMAHelper::base(&tr.tensor())); - MPI_Status status; - MPI_CHECK(MPI_Recv(data, static_cast(nBytes), MPI_BYTE, dst, - TAG_SENDTENSOR2, MPI_COMM_WORLD, &status)); - val = std::move(tr.tensor()); - } + Tensor val; + if (mpi_response.singlesend()) { + dst_device->MakeTensorFromProto(mpi_response.response().tensor(), + recv_args.alloc_attrs, &val); + } else { + TensorResponse tr; + tr.InitAlloc(dst_device, recv_args.alloc_attrs); + tr.InitPartial(mpi_response.response()); + const size_t nBytes = tr.tensor().TotalBytes(); + void* data = const_cast(DMAHelper::base(&tr.tensor())); + MPI_Status status; + MPI_CHECK(MPI_Recv(data, static_cast(nBytes), MPI_BYTE, dst, + TAG_SENDTENSOR2, MPI_COMM_WORLD, &status)); + val = std::move(tr.tensor()); + } - done(s, Args(), recv_args, val, 
mpi_response.response().is_dead()); - }; + done(s, Args(), recv_args, val, mpi_response.response().is_dead()); + }; MPIRendezvousMgr* mgr = reinterpret_cast(this->rendezvous_mgr_); @@ -152,16 +151,18 @@ MPIRemoteRendezvous::~MPIRemoteRendezvous() {} void MPIRendezvousMgr::AddRequest(RecvTensorRequest request, const int mpi_dst) { TF_CHECK_OK(recv_tensor_recent_request_ids_.TrackUnique( - req.request_id(), "RecvTensor (MPIRendezvousMgr)", req)); + request.request_id(), "RecvTensor (MPIRendezvousMgr)", request)); const int64 step_id = request.step_id(); const std::string& key = request.rendezvous_key(); Rendezvous::ParsedKey parsed; TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); MPIRecvTensorCallBack send_cb = [this, mpi_dst, parsed]( - const Status& status, const Rendezvous::Args& send_args, - const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead, - MPISendTensorCall* mpi_send_call) { + const Status& status, + const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, + const Tensor& val, bool is_dead, + MPISendTensorCall* mpi_send_call) { // TODO(jbedorf) this should be a loop over max size CHECK(mpi_send_call->mRes_.ByteSize() < INT_MAX) << "Buffer too large for single transfer"; @@ -194,74 +195,78 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request, }; // Wrapper around the read callback to place the callback on our queue - Rendezvous::DoneCallback done_cb = [this, parsed, step_id, send_cb]( - const Status& status, const Rendezvous::Args& send_args, - const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) { - if (!status.ok()) { - CHECK(status.ok()) << "RecvLocalAsync was not ok, key: " - << parsed.FullKey() << " step: " << step_id - << " error message: " << status.error_message(); - return; - } + Rendezvous::DoneCallback done_cb = + [this, parsed, step_id, send_cb]( + const Status& status, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) { + if (!status.ok()) { + CHECK(status.ok()) + << "RecvLocalAsync was not ok, key: " << parsed.FullKey() + << " step: " << step_id + << " error message: " << status.error_message(); + return; + } - VLOG(3) << "MPI Sending tensor " << parsed.FullKey() - << " @ step: " << step_id << std::endl; + VLOG(3) << "MPI Sending tensor " << parsed.FullKey() + << " @ step: " << step_id << std::endl; - auto mpi_send_call = new MPISendTensorCall(); - mpi_send_call->Init(parsed, step_id, is_dead); + auto mpi_send_call = new MPISendTensorCall(); + mpi_send_call->Init(parsed, step_id, is_dead); - Device* src_dev = nullptr; - Status s = this->worker_env_2->device_mgr->LookupDevice(parsed.src_device, - &src_dev); - CHECK(s.ok()) << "src device not found"; + Device* src_dev = nullptr; + Status s = this->worker_env_2->device_mgr->LookupDevice( + parsed.src_device, &src_dev); + CHECK(s.ok()) << "src device not found"; - // Control if shape and data should be send together or if we can optimize - // it in two different transfers, thereby reducing memory copies - bool doOptimalTransfer = true; - if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false; - if (val.TotalBytes() < 1024) doOptimalTransfer = false; + // Control if shape and data should be send together or if we can + // optimize it in two different transfers, thereby reducing memory + // copies + bool doOptimalTransfer = true; + if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false; + if (val.TotalBytes() < 1024) doOptimalTransfer = false; - doOptimalTransfer = doOptimalTransfer && 
use_optimal_transfer_; + doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_; - if (doOptimalTransfer) { - // First send the Tensor description and in a follow up transfer the data - mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype( - val.dtype()); - val.shape().AsProto(mpi_send_call->mRes_.mutable_response() - ->mutable_tensor() - ->mutable_tensor_shape()); - mpi_send_call->mRes_.set_singlesend(false); - } else { - // Send the Tensor description and data in a single transfer - if (src_dev->tensorflow_gpu_device_info() && - (!send_args.alloc_attrs.on_host())) { - Notification n; - GPUUtil::SetProtoFromGPU( - val, src_dev, send_args.device_context, - mpi_send_call->mRes_.mutable_response()->mutable_tensor(), is_dead, - [&n, &s](const Status& s_) { - s = s_; - n.Notify(); - }); - n.WaitForNotification(); - } else { - val.AsProtoTensorContent( - mpi_send_call->mRes_.mutable_response()->mutable_tensor()); - } - } + if (doOptimalTransfer) { + // First send the Tensor description and in a follow up transfer the + // data + mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype( + val.dtype()); + val.shape().AsProto(mpi_send_call->mRes_.mutable_response() + ->mutable_tensor() + ->mutable_tensor_shape()); + mpi_send_call->mRes_.set_singlesend(false); + } else { + // Send the Tensor description and data in a single transfer + if (src_dev->tensorflow_gpu_device_info() && + (!send_args.alloc_attrs.on_host())) { + Notification n; + GPUUtil::SetProtoFromGPU( + val, src_dev, send_args.device_context, + mpi_send_call->mRes_.mutable_response()->mutable_tensor(), + is_dead, [&n, &s](const Status& s_) { + s = s_; + n.Notify(); + }); + n.WaitForNotification(); + } else { + val.AsProtoTensorContent( + mpi_send_call->mRes_.mutable_response()->mutable_tensor()); + } + } - std::function res = std::bind( - send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call); + std::function res = std::bind( + send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call); - SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res)); + SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res)); - this->QueueSendRequest(req); + this->QueueSendRequest(req); - // Wait for the notification that indicates the tensor has been - // successfully transmitted to the remote process. Only needed if we - // have not parsed the tensor to proto - if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification(); - }; // done_cb + // Wait for the notification that indicates the tensor has been + // successfully transmitted to the remote process. Only needed if we + // have not parsed the tensor to proto + if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification(); + }; // done_cb worker_env_2->compute_pool->Schedule([this, step_id, parsed, done_cb]() { this->RecvLocalAsync(step_id, parsed, done_cb); @@ -293,9 +298,8 @@ void MPIRendezvousMgr::MPIBackgroundThread() { } // Remove sends that have been completed - active_sends.remove_if([](std::unique_ptr& i) { - return i->IsFinished(); - }); + active_sends.remove_if( + [](std::unique_ptr& i) { return i->IsFinished(); }); // send a Tensor request RequestQueueEntry req; diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h index ca42ee2f6d2..5596601ddb9 100644 --- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h +++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h @@ -18,12 +18,12 @@ limitations under the License. 
#ifdef TENSORFLOW_USE_MPI -#include -#include #include -#include -#include #include +#include +#include +#include +#include #include #include #include @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/contrib/mpi/mpi_msg.pb.h" #include "tensorflow/contrib/mpi/mpi_utils.h" #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/recent_request_ids.h" #include "tensorflow/core/distributed_runtime/request_id.h" #include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/protobuf/worker.pb.h" @@ -160,7 +161,8 @@ class MPIRendezvousMgr : public BaseRendezvousMgr { private: typedef std::function MPIRecvTensorCallBack; + const Tensor&, const bool, MPISendTensorCall*)> + MPIRecvTensorCallBack; typedef std::pair> RequestQueueEntry; typedef std::pair> diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc index d585c0565eb..a31fa9ce0b3 100644 --- a/tensorflow/contrib/mpi/mpi_server_lib.cc +++ b/tensorflow/contrib/mpi/mpi_server_lib.cc @@ -22,8 +22,8 @@ limitations under the License. #include "grpc/support/alloc.h" -#include "tensorflow/core/distributed_runtime/server_lib.h" #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env.h" diff --git a/tensorflow/contrib/mpi/mpi_utils.h b/tensorflow/contrib/mpi/mpi_utils.h index 45e21f2b25a..fa297c28cb4 100644 --- a/tensorflow/contrib/mpi/mpi_utils.h +++ b/tensorflow/contrib/mpi/mpi_utils.h @@ -18,8 +18,8 @@ limitations under the License. #ifdef TENSORFLOW_USE_MPI -#include #include +#include #include #include "tensorflow/core/lib/strings/str_util.h" diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc index 2d5b98022c3..8dca90a1e34 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc +++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc @@ -35,8 +35,8 @@ limitations under the License. #define OMPI_SKIP_MPICXX #include "third_party/mpi/mpi.h" -#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h" #include "tensorflow/contrib/mpi_collectives/kernels/ring.h" +#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h" /* * MPI Allreduce and Allgather Ops for TensorFlow. diff --git a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py index f0a116239d6..2fbefef0d36 100644 --- a/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py +++ b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py @@ -26,7 +26,8 @@ from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader _mpi_ops_so = loader.load_op_library( - resource_loader.get_path_to_datafile("_mpi_ops.so")) + resource_loader.get_path_to_datafile('_mpi_ops.so')) + def size(name=None): """An op which returns the number of MPI processes. 
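A heavily hedged sketch of how the size/allgather wrappers in mpi_ops.py above are intended to be used; the import path and the process-launch details (mpirun, MPI initialization) are assumptions, not shown by this diff:

import tensorflow as tf
from tensorflow.contrib import mpi_collectives as mpi  # assumed export path

# Each process contributes a [2, 3] tensor; allgather concatenates the
# per-process tensors along dimension 0.
local = tf.ones([2, 3])
gathered = mpi.allgather(local)

with tf.Session() as sess:
    print(sess.run(mpi.size()))   # number of MPI processes
    print(sess.run(gathered))     # shape [2 * num_processes, 3]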
@@ -120,15 +121,14 @@ def allgather(tensor, name=None): """ # Specify that first allgather is to collect the tensor gather sizes, # indicated by passing in a scalar (0-D tensor) of value 0 - sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const") - my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice") + sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const') + my_size = tf.slice( + tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice') if name is None: - name = "allgather" - sizing_name = "{}_sizing".format(name) + name = 'allgather' + sizing_name = '{}_sizing'.format(name) sizes = gen_mpi_ops.mpi_allgather(my_size, sizes_flag, name=sizing_name) return gen_mpi_ops.mpi_allgather(tensor, sizes, name=name) ops.NotDifferentiable('MPIAllgather') - - diff --git a/tensorflow/contrib/ndlstm/__init__.py b/tensorflow/contrib/ndlstm/__init__.py index 52e83069cb0..da89bb4ab60 100644 --- a/tensorflow/contrib/ndlstm/__init__.py +++ b/tensorflow/contrib/ndlstm/__init__.py @@ -16,3 +16,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + +from tensorflow.contrib.ndlstm.python import lstm2d +from tensorflow.contrib.ndlstm.python import lstm1d diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py index b24e332e4ae..2e2e9086c00 100644 --- a/tensorflow/contrib/ndlstm/python/lstm1d.py +++ b/tensorflow/contrib/ndlstm/python/lstm1d.py @@ -88,7 +88,7 @@ def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False): if reverse: inputs = array_ops.reverse_v2(inputs, [0]) outputs, _ = rnn.dynamic_rnn( - lstm_cell, inputs, time_major=True, dtype=inputs.dtype) + lstm_cell, inputs, time_major=True, dtype=inputs.dtype) if reverse: outputs = array_ops.reverse_v2(outputs, [0]) return outputs diff --git a/tensorflow/contrib/nearest_neighbor/kernels/heap.h b/tensorflow/contrib/nearest_neighbor/kernels/heap.h index 32925569a82..a2dbb8052bf 100644 --- a/tensorflow/contrib/nearest_neighbor/kernels/heap.h +++ b/tensorflow/contrib/nearest_neighbor/kernels/heap.h @@ -56,7 +56,7 @@ class HeapBase { // This method adds an element at the end of the internal array without // "heapifying" the array afterwards. This is useful for setting up a heap - // where a single call to heapify at the end of the inital insertion + // where a single call to heapify at the end of the initial insertion // operations suffices. void InsertUnsorted(const KeyType& key, const DataType& data) { if (v_.size() == static_cast(num_elements_)) { diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc index 2b412fac9a6..13db6f62f52 100644 --- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc +++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc @@ -75,7 +75,8 @@ class HyperplaneLSHProbesOp : public OpKernel { num_hyperplanes_per_table, ".")); OP_REQUIRES(context, num_hyperplanes_per_table <= 30, InvalidArgument("Need num_hyperplanes_per_table <= 30, got ", - num_hyperplanes_per_table, ". " + num_hyperplanes_per_table, + ". 
" "If you need more hyperplanes, change this Op" " to work for larger integer types (int64).")); @@ -88,12 +89,13 @@ class HyperplaneLSHProbesOp : public OpKernel { InvalidArgument("num_probes must be at least 1.")); int expected_num_hyperplanes = num_tables * num_hyperplanes_per_table; - OP_REQUIRES( - context, products_tensor.dim_size(1) == expected_num_hyperplanes, - InvalidArgument("Expected number of hyperplanes is ", - expected_num_hyperplanes, " but received ", - products_tensor.dim_size(1), " inner products per " - "point.")); + OP_REQUIRES(context, + products_tensor.dim_size(1) == expected_num_hyperplanes, + InvalidArgument("Expected number of hyperplanes is ", + expected_num_hyperplanes, " but received ", + products_tensor.dim_size(1), + " inner products per " + "point.")); auto products_eigen_tensor = products_tensor.matrix(); ConstMatrixMap products_matrix(products_eigen_tensor.data(), @@ -116,13 +118,11 @@ class HyperplaneLSHProbesOp : public OpKernel { // lschmidt's workstation. int64 cost_per_unit = 21 * num_hyperplanes_per_table * num_tables; if (num_probes > num_tables) { - cost_per_unit += 110 * num_hyperplanes_per_table - * (num_probes - num_tables); + cost_per_unit += + 110 * num_hyperplanes_per_table * (num_probes - num_tables); } context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor( - batch_size, - cost_per_unit, - [&](int64 start, int64 end) { + batch_size, cost_per_unit, [&](int64 start, int64 end) { HyperplaneMultiprobe multiprobe( num_hyperplanes_per_table, num_tables); diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py index f243317f1df..82ebca7f203 100644 --- a/tensorflow/contrib/opt/python/training/external_optimizer.py +++ b/tensorflow/contrib/opt/python/training/external_optimizer.py @@ -397,10 +397,6 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface): 'automatically and cannot be injected manually'.format(kwarg)) minimize_kwargs.update(optimizer_kwargs) - if method == 'SLSQP': - # SLSQP doesn't support step callbacks. Obviate associated warning - # message. - del minimize_kwargs['callback'] import scipy.optimize # pylint: disable=g-import-not-at-top result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs) diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py index 0f597d0a246..953586ee70c 100644 --- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py @@ -299,6 +299,45 @@ class ScipyOptimizerInterfaceTest(TestCase): method = optimizer.optimizer_kwargs.get('method') self.assertEqual('SLSQP', method) + def test_callbacks(self): + vector_val = np.array([7., -2.], dtype=np.float32) + vector = variables.Variable(vector_val, 'vector') + + minimum_location_val = np.arange(2) + minimum_location = constant_op.constant( + minimum_location_val, dtype=dtypes.float32) + + loss = math_ops.reduce_sum(math_ops.square(vector - minimum_location)) / 2. + loss_val_first = ((vector_val - minimum_location_val)**2).sum() / 2. 
+ + optimizer = external_optimizer.ScipyOptimizerInterface(loss, method='SLSQP') + + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + + initial_vector_val = sess.run(vector) + + extra_fetches = [loss] + + step_callback = test.mock.Mock() + loss_callback = test.mock.Mock() + + optimizer.minimize( + sess, + fetches=extra_fetches, + loss_callback=loss_callback, + step_callback=step_callback) + + loss_val_last = sess.run(loss) + + call_first = test.mock.call(loss_val_first) + call_last = test.mock.call(loss_val_last) + loss_calls = [call_first, call_last] + loss_callback.assert_has_calls(loss_calls, any_order=True) + + args, _ = step_callback.call_args + self.assertAllClose(minimum_location_val, args[0]) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc index 9cee405cef2..e18923c8aae 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc @@ -14,13 +14,12 @@ // limitations under the License. // ============================================================================= -#include "tensorflow/core/framework/register_types.h" #include "tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h" +#include "tensorflow/core/framework/register_types.h" namespace tensorflow { -REGISTER_KERNEL_BUILDER(Name("PeriodicResample") - .Device(DEVICE_CPU), +REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU), PeriodicResampleOp); } // namespace tensorflow diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h index ba410f025d4..3ab588c4588 100644 --- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h +++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h @@ -118,9 +118,9 @@ template #include -#include #include #include #include #include #include #include -#include -#include #include +#include +#include +#include #include #include "tensorflow/core/framework/graph.pb.h" @@ -46,10 +46,10 @@ limitations under the License. // These are all common classes it's handy to reference with no namespace. using tensorflow::Flag; -using tensorflow::Tensor; +using tensorflow::int32; using tensorflow::Status; using tensorflow::string; -using tensorflow::int32; +using tensorflow::Tensor; // Used to store the memory-mapped buffers we use for capture. struct CameraBuffer { diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc index 0b18045789f..c6935a093f7 100644 --- a/tensorflow/contrib/pi_examples/label_image/label_image.cc +++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc @@ -23,9 +23,9 @@ limitations under the License. // // Full build instructions are at tensorflow/contrib/pi_examples/README.md. -#include #include #include +#include #include #include @@ -46,10 +46,10 @@ limitations under the License. // These are all common classes it's handy to reference with no namespace. using tensorflow::Flag; -using tensorflow::Tensor; +using tensorflow::int32; using tensorflow::Status; using tensorflow::string; -using tensorflow::int32; +using tensorflow::Tensor; // Takes a file name, and loads a list of labels from it, one per line, and // returns a vector of the strings. 
It pads with empty strings so the length @@ -77,23 +77,22 @@ Status ReadLabelsFile(string file_name, std::vector* result, // Error handling for JPEG decoding. void CatchError(j_common_ptr cinfo) { (*cinfo->err->output_message)(cinfo); - jmp_buf *jpeg_jmpbuf = reinterpret_cast(cinfo->client_data); + jmp_buf* jpeg_jmpbuf = reinterpret_cast(cinfo->client_data); jpeg_destroy(cinfo); longjmp(*jpeg_jmpbuf, 1); } // Decompresses a JPEG file from disk. Status LoadJpegFile(string file_name, std::vector* data, - int* width, int* height, int* channels) { + int* width, int* height, int* channels) { struct jpeg_decompress_struct cinfo; - FILE * infile; + FILE* infile; JSAMPARRAY buffer; int row_stride; if ((infile = fopen(file_name.c_str(), "rb")) == NULL) { LOG(ERROR) << "Can't open " << file_name; - return tensorflow::errors::NotFound("JPEG file ", file_name, - " not found"); + return tensorflow::errors::NotFound("JPEG file ", file_name, " not found"); } struct jpeg_error_mgr jerr; @@ -116,10 +115,11 @@ Status LoadJpegFile(string file_name, std::vector* data, data->resize((*height) * (*width) * (*channels)); row_stride = cinfo.output_width * cinfo.output_components; - buffer = (*cinfo.mem->alloc_sarray) - ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1); + buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE, + row_stride, 1); while (cinfo.output_scanline < cinfo.output_height) { - tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]); + tensorflow::uint8* row_address = + &((*data)[cinfo.output_scanline * row_stride]); jpeg_read_scanlines(&cinfo, buffer, 1); memcpy(row_address, buffer[0], row_stride); } @@ -141,24 +141,25 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height, int image_height; int image_channels; TF_RETURN_IF_ERROR(LoadJpegFile(file_name, &image_data, &image_width, - &image_height, &image_channels)); - LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height - << "x" << image_channels; + &image_height, &image_channels)); + LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height << "x" + << image_channels; const int wanted_channels = 3; if (image_channels < wanted_channels) { - return tensorflow::errors::FailedPrecondition("Image needs to have at least ", - wanted_channels, " but only has ", - image_channels); + return tensorflow::errors::FailedPrecondition( + "Image needs to have at least ", wanted_channels, " but only has ", + image_channels); } - // In these loops, we convert the eight-bit data in the image into float, resize - // it using bilinear filtering, and scale it numerically to the float range that - // the model expects (given by input_mean and input_std). + // In these loops, we convert the eight-bit data in the image into float, + // resize it using bilinear filtering, and scale it numerically to the float + // range that the model expects (given by input_mean and input_std). 
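The loop being reformatted here scales each neighbouring pixel by (value - input_mean) / input_std and then interpolates bilinearly; an equivalent NumPy sketch of the per-pixel arithmetic (the function name and signature are illustrative):

import numpy as np

def resize_and_normalize(image, wanted_height, wanted_width, input_mean, input_std):
    # image: uint8 array of shape [height, width, channels].
    height, width, channels = image.shape
    out = np.zeros((wanted_height, wanted_width, channels), np.float32)
    height_scale = height / float(wanted_height)
    width_scale = width / float(wanted_width)
    for y in range(wanted_height):
        in_y = y * height_scale
        top, bottom = int(np.floor(in_y)), min(int(np.ceil(in_y)), height - 1)
        y_lerp = in_y - top
        for x in range(wanted_width):
            in_x = x * width_scale
            left, right = int(np.floor(in_x)), min(int(np.ceil(in_x)), width - 1)
            x_lerp = in_x - left
            # Normalize the four neighbours, then lerp horizontally and vertically.
            tl = (image[top, left] - input_mean) / input_std
            tr = (image[top, right] - input_mean) / input_std
            bl = (image[bottom, left] - input_mean) / input_std
            br = (image[bottom, right] - input_mean) / input_std
            top_row = tl + (tr - tl) * x_lerp
            bottom_row = bl + (br - bl) * x_lerp
            out[y, x] = top_row + (bottom_row - top_row) * y_lerp
    return out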
tensorflow::Tensor image_tensor( - tensorflow::DT_FLOAT, tensorflow::TensorShape( - {1, wanted_height, wanted_width, wanted_channels})); + tensorflow::DT_FLOAT, + tensorflow::TensorShape( + {1, wanted_height, wanted_width, wanted_channels})); auto image_tensor_mapped = image_tensor.tensor(); tensorflow::uint8* in = image_data.data(); - float *out = image_tensor_mapped.data(); + float* out = image_tensor_mapped.data(); const size_t image_rowlen = image_width * image_channels; const float width_scale = static_cast(image_width) / wanted_width; const float height_scale = static_cast(image_height) / wanted_height; @@ -166,35 +167,37 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height, const float in_y = y * height_scale; const int top_y_index = static_cast(floorf(in_y)); const int bottom_y_index = - std::min(static_cast(ceilf(in_y)), (image_height - 1)); + std::min(static_cast(ceilf(in_y)), (image_height - 1)); const float y_lerp = in_y - top_y_index; tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen); tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen); - float *out_row = out + (y * wanted_width * wanted_channels); + float* out_row = out + (y * wanted_width * wanted_channels); for (int x = 0; x < wanted_width; ++x) { const float in_x = x * width_scale; const int left_x_index = static_cast(floorf(in_x)); const int right_x_index = - std::min(static_cast(ceilf(in_x)), (image_width - 1)); + std::min(static_cast(ceilf(in_x)), (image_width - 1)); tensorflow::uint8* in_top_left_pixel = - in_top_row + (left_x_index * wanted_channels); + in_top_row + (left_x_index * wanted_channels); tensorflow::uint8* in_top_right_pixel = - in_top_row + (right_x_index * wanted_channels); + in_top_row + (right_x_index * wanted_channels); tensorflow::uint8* in_bottom_left_pixel = - in_bottom_row + (left_x_index * wanted_channels); + in_bottom_row + (left_x_index * wanted_channels); tensorflow::uint8* in_bottom_right_pixel = - in_bottom_row + (right_x_index * wanted_channels); + in_bottom_row + (right_x_index * wanted_channels); const float x_lerp = in_x - left_x_index; - float *out_pixel = out_row + (x * wanted_channels); + float* out_pixel = out_row + (x * wanted_channels); for (int c = 0; c < wanted_channels; ++c) { - const float top_left((in_top_left_pixel[c] - input_mean) / input_std); - const float top_right((in_top_right_pixel[c] - input_mean) / input_std); - const float bottom_left((in_bottom_left_pixel[c] - input_mean) / input_std); - const float bottom_right((in_bottom_right_pixel[c] - input_mean) / input_std); - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = - bottom_left + (bottom_right - bottom_left) * x_lerp; - out_pixel[c] = top + (bottom - top) * y_lerp; + const float top_left((in_top_left_pixel[c] - input_mean) / input_std); + const float top_right((in_top_right_pixel[c] - input_mean) / input_std); + const float bottom_left((in_bottom_left_pixel[c] - input_mean) / + input_std); + const float bottom_right((in_bottom_right_pixel[c] - input_mean) / + input_std); + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = + bottom_left + (bottom_right - bottom_left) * x_lerp; + out_pixel[c] = top + (bottom - top) * y_lerp; } } } @@ -233,10 +236,10 @@ Status GetTopLabels(const std::vector& outputs, int how_many_labels, scores.push_back(std::pair({i, unsorted_scores_flat(i)})); } std::sort(scores.begin(), scores.end(), - [](const std::pair &left, - const std::pair &right) { - return 
left.second > right.second; - }); + [](const std::pair& left, + const std::pair& right) { + return left.second > right.second; + }); scores.resize(how_many_labels); Tensor sorted_indices(tensorflow::DT_INT32, {scores.size()}); Tensor sorted_scores(tensorflow::DT_FLOAT, {scores.size()}); diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD index d395de986d2..479ea9becae 100644 --- a/tensorflow/contrib/py2tf/BUILD +++ b/tensorflow/contrib/py2tf/BUILD @@ -18,66 +18,13 @@ py_library( name = "py2tf", srcs = [ "__init__.py", - "api.py", - "config.py", - "conversion.py", - "naming.py", ], srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - "//tensorflow/contrib/py2tf/converters", - "//tensorflow/contrib/py2tf/pyct", - "//tensorflow/contrib/py2tf/pyct/static_analysis", + "//tensorflow/contrib/py2tf/impl", + "//tensorflow/contrib/py2tf/utils", "@gast_archive//:gast", "@six_archive//:six", ], ) - -# Separate target that allows access to internal symbols for testing. -py_library( - name = "py2tf_internal", - srcs = [ - "api.py", - "config.py", - "conversion.py", - "naming.py", - ], - srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], - deps = [ - "//tensorflow/contrib/py2tf/converters", - "//tensorflow/contrib/py2tf/pyct", - "//tensorflow/contrib/py2tf/pyct/static_analysis", - "@gast_archive//:gast", - "@six_archive//:six", - ], -) - -py_test( - name = "api_test", - srcs = ["api_test.py"], - deps = [ - ":py2tf_internal", - "//tensorflow/python:client_testlib", - ], -) - -py_test( - name = "conversion_test", - srcs = ["conversion_test.py"], - deps = [ - ":py2tf_internal", - "//tensorflow/python:client_testlib", - "@gast_archive//:gast", - ], -) - -py_test( - name = "naming_test", - srcs = ["naming_test.py"], - deps = [ - ":py2tf_internal", - "//tensorflow/python:client_testlib", - ], -) diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/py2tf/__init__.py index d187da99e06..0d51bf0bf24 100644 --- a/tensorflow/contrib/py2tf/__init__.py +++ b/tensorflow/contrib/py2tf/__init__.py @@ -21,11 +21,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.py2tf.api import to_code -from tensorflow.contrib.py2tf.api import to_graph +from tensorflow.contrib.py2tf import utils +from tensorflow.contrib.py2tf.impl.api import convert +from tensorflow.contrib.py2tf.impl.api import graph_ready +from tensorflow.contrib.py2tf.impl.api import to_code +from tensorflow.contrib.py2tf.impl.api import to_graph from tensorflow.python.util.all_util import remove_undocumented - -_allowed_symbols = ['to_graph', 'to_code'] +_allowed_symbols = ['to_graph', 'to_code', 'convert', 'graph_ready', 'utils'] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD index 2b0a1234e69..03ded49022c 100644 --- a/tensorflow/contrib/py2tf/converters/BUILD +++ b/tensorflow/contrib/py2tf/converters/BUILD @@ -17,6 +17,7 @@ filegroup( py_library( name = "converters", srcs = [ + "asserts.py", "break_canonicalization.py", "builtin_functions.py", "call_trees.py", @@ -45,13 +46,26 @@ py_library( deps = [ ":converters", "//tensorflow/contrib/py2tf/pyct/static_analysis", + "//tensorflow/contrib/py2tf/utils", "@gast_archive//:gast", ], ) +py_test( + name = "asserts_test", + srcs = ["asserts_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":test_lib", + "//tensorflow/contrib/py2tf/pyct", + 
"//tensorflow/python:client_testlib", + ], +) + py_test( name = "break_canonicalization_test", srcs = ["break_canonicalization_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -62,6 +76,18 @@ py_test( py_test( name = "call_trees_test", srcs = ["call_trees_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":test_lib", + "//tensorflow/contrib/py2tf/pyct", + "//tensorflow/python:client_testlib", + ], +) + +py_test( + name = "decorators_test", + srcs = ["decorators_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -72,6 +98,7 @@ py_test( py_test( name = "continue_canonicalization_test", srcs = ["continue_canonicalization_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -82,6 +109,7 @@ py_test( py_test( name = "control_flow_test", srcs = ["control_flow_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -92,6 +120,7 @@ py_test( py_test( name = "builtin_functions_test", srcs = ["builtin_functions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -102,6 +131,8 @@ py_test( py_test( name = "for_canonicalization_test", srcs = ["for_canonicalization_test.py"], + srcs_version = "PY2AND3", + tags = ["nomac"], deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -112,6 +143,7 @@ py_test( py_test( name = "logical_expressions_test", srcs = ["logical_expressions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -122,6 +154,7 @@ py_test( py_test( name = "print_functions_test", srcs = ["print_functions_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", @@ -133,6 +166,7 @@ py_test( py_test( name = "side_effect_guards_test", srcs = ["side_effect_guards_test.py"], + srcs_version = "PY2AND3", deps = [ ":test_lib", "//tensorflow/contrib/py2tf/pyct", diff --git a/tensorflow/contrib/py2tf/converters/asserts.py b/tensorflow/contrib/py2tf/converters/asserts.py new file mode 100644 index 00000000000..2d6ee1d0982 --- /dev/null +++ b/tensorflow/contrib/py2tf/converters/asserts.py @@ -0,0 +1,53 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Converts Assert statements to their corresponding TF calls.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast + +from tensorflow.contrib.py2tf.pyct import templates +from tensorflow.contrib.py2tf.pyct import transformer + + +class AssertsTransformer(transformer.Base): + """Transforms Print nodes to Call so they can be handled as functions.""" + + # pylint:disable=invalid-name + + def visit_Assert(self, node): + self.generic_visit(node) + + # Note: The lone tf.Assert call will be wrapped with control_dependencies + # by side_effect_guards. + template = """ + tf.Assert(test, [tf.constant(msg)]) + """ + + if node.msg is None: + return templates.replace( + template, test=node.test, msg=gast.Str('Assertion error')) + elif isinstance(node.msg, gast.Str): + return templates.replace(template, test=node.test, msg=node.msg) + else: + raise NotImplementedError('Can only convert string messages for now.') + + # pylint:enable=invalid-name + + +def transform(node, context): + return AssertsTransformer(context).visit(node) diff --git a/tensorflow/contrib/py2tf/converters/asserts_test.py b/tensorflow/contrib/py2tf/converters/asserts_test.py new file mode 100644 index 00000000000..6611f2777a9 --- /dev/null +++ b/tensorflow/contrib/py2tf/converters/asserts_test.py @@ -0,0 +1,42 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for asserts module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast + +from tensorflow.contrib.py2tf.converters import asserts +from tensorflow.contrib.py2tf.converters import converter_test_base +from tensorflow.python.platform import test + + +class AssertsTest(converter_test_base.TestCase): + + def test_transform(self): + + def test_fn(a): + assert a > 0 + + node = self.parse_and_analyze(test_fn, {}) + node = asserts.transform(node, self.ctx) + + self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py index 0aae030450a..4c238b7fb9e 100644 --- a/tensorflow/contrib/py2tf/converters/call_trees.py +++ b/tensorflow/contrib/py2tf/converters/call_trees.py @@ -29,46 +29,46 @@ import gast from tensorflow.contrib.py2tf.pyct import anno from tensorflow.contrib.py2tf.pyct import parser from tensorflow.contrib.py2tf.pyct import templates +from tensorflow.contrib.py2tf.pyct import transformer +from tensorflow.python.util import tf_inspect class FunctionNamer(object): """Describes the interface for CallTreeTransformer's namer.""" def compiled_function_name(self, - original_name, - live_object=None, + original_fqn, + live_entity=None, owner_type=None): """Generate the name corresponding to the compiled version of a function. Args: - original_name: String - live_object: Callable, the actual target function, if known. + original_fqn: string or tuple(string) + live_entity: Callable, the actual target function, if known. owner_type: Optional object. If present, it indicates that the function is a member of the given type. Returns: - String. + string, bool """ raise NotImplementedError() - def compiled_class_name(self, original_name, live_object=None): + def compiled_class_name(self, original_fqn, live_entity=None): """Generate the name corresponding to the compiled version of a class. Args: - original_name: String - live_object: The actual target class, if known. + original_fqn: string or tuple(string) + live_entity: The actual target class, if known. Returns: - String. 
+ string """ raise NotImplementedError() -class CallTreeTransformer(gast.NodeTransformer): +class CallTreeTransformer(transformer.Base): """Transforms the call tree by renaming transformed symbols.""" - def __init__(self, namer, namespace, uncompiled_modules, - nocompile_decorators): - self.namer = namer - self.namespace = namespace + def __init__(self, context, uncompiled_modules, nocompile_decorators): + super(CallTreeTransformer, self).__init__(context) self.uncompiled_modules = uncompiled_modules self.nocompile_decorators = nocompile_decorators @@ -78,7 +78,7 @@ class CallTreeTransformer(gast.NodeTransformer): if isinstance(node, gast.Call): return self._resolve_name(node.func) if isinstance(node, gast.Name): - return self.namespace.get(node.id) + return self.context.namespace.get(node.id) if isinstance(node, gast.Attribute): parent = self._resolve_name(node.value) if parent is not None: @@ -91,8 +91,12 @@ class CallTreeTransformer(gast.NodeTransformer): if anno.hasanno(node, 'live_val'): return anno.getanno(node, 'live_val') if isinstance(node, gast.Attribute) and anno.hasanno(node, 'type'): - member = getattr(anno.getanno(node, 'type'), node.attr) - return member + owner_type = anno.getanno(node, 'type') + if hasattr(owner_type, node.attr): + return getattr(owner_type, node.attr) + else: + raise ValueError('Type "%s" has not attribute "%s". Is it dynamic?' % + (owner_type, node.attr)) return None def _should_compile(self, node, fqn): @@ -106,14 +110,14 @@ class CallTreeTransformer(gast.NodeTransformer): # The decorators themselves are not to be converted. # If present, the decorators should appear as static functions. - target_obj = self._try_resolve_target(node.func) - if target_obj is not None: + target_entity = self._try_resolve_target(node.func) + if target_entity is not None: # This attribute is set by the decorator itself. # TODO(mdan): This may not play nicely with other wrapping decorators. - if hasattr(target_obj, '__pyct_is_compile_decorator'): + if hasattr(target_entity, '__pyct_is_compile_decorator'): return False - if target_obj in self.nocompile_decorators: + if target_entity in self.nocompile_decorators: return False # Inspect the target function decorators. If any include a @convert @@ -122,7 +126,8 @@ class CallTreeTransformer(gast.NodeTransformer): # To parse and re-analize each function for every call site could be quite # wasteful. Maybe we could cache the parsed AST? try: - target_node = parser.parse_object(target_obj).body[0] + target_node, _ = parser.parse_entity(target_entity) + target_node = target_node.body[0] except TypeError: # Functions whose source we cannot access are compilable (e.g. wrapped # to py_func). @@ -136,48 +141,57 @@ class CallTreeTransformer(gast.NodeTransformer): return True + def _determine_function_owner(self, m): + # TODO(mdan): The parent type should be known at analysis. Use that instead. + if hasattr(m, 'im_class'): # Python 2 + return m.im_class + if hasattr(m, '__qualname__'): # Python 3 + # Object attributes: should be bound to "self". + if hasattr(m, '__self__'): + return type(m.__self__) + + # Class attributes: should have the owner name in their namespace. + qn = m.__qualname__.split('.') + if len(qn) < 2: + return None + owner_name, func_name = qn[-2:] + if func_name != m.__name__: + raise ValueError('Inconsistent names detected ' + '(__qualname__[1] = "%s", __name__ = "%s") for %s.' 
% + (func_name, m.__name__, m)) + if owner_name == '': + return None + if owner_name not in self.context.namespace: + raise ValueError( + 'Could not resolve name "%s" while analyzing %s. Namespace:\n%s' % + (owner_name, m, self.context.namespace)) + return self.context.namespace[owner_name] + return None + def _rename_compilable_function(self, node): assert anno.hasanno(node.func, 'live_val') assert anno.hasanno(node.func, 'fqn') - target_obj = anno.getanno(node.func, 'live_val') + target_entity = anno.getanno(node.func, 'live_val') target_fqn = anno.getanno(node.func, 'fqn') if not self._should_compile(node, target_fqn): return node if anno.hasanno(node, 'is_constructor'): - new_name = self.namer.compiled_class_name( - '__'.join(target_fqn), live_object=target_obj) + new_name = self.context.namer.compiled_class_name( + target_fqn, live_entity=target_entity) + do_rename = True else: - new_name = self.namer.compiled_function_name( - '__'.join(target_fqn), live_object=target_obj) - node.func = gast.Name(new_name, gast.Load(), None) - return node + owner_type = self._determine_function_owner(target_entity) + new_name, do_rename = self.context.namer.compiled_function_name( + target_fqn, live_entity=target_entity, owner_type=owner_type) - def _rename_member_function_of_known_type(self, node): - assert isinstance(node.func, gast.Attribute) - - type_fqn = anno.getanno(node.func, 'type_fqn') - assert anno.hasanno(node.func, 'type') - target_type = anno.getanno(node.func, 'type') - - if not self._should_compile(node, type_fqn): - return node - - # TODO(mdan): We should not assume that the namer only needs the - # member function name. - method_name = node.func.attr - method_object = getattr(target_type, method_name) - new_name = self.namer.compiled_function_name( - method_name, live_object=method_object, owner_type=target_type) - if new_name != node.func.attr: - # If a member function call is renamed, then the new function is no - # longer bound to the target object. We then refactor the call from: - # foo.bar(...) - # to: - # renamed_foo(bar, ...) - # TODO(mdan): This risks causing duplication, if target_type is renamed. - node.args = [node.func.value] + node.args + if do_rename: + if target_entity is not None: + if tf_inspect.ismethod(target_entity): + # The renaming process will transform it into a regular function. + # TODO(mdan): Is this complete? How does it work with nested members? + node.args = [node.func.value] + node.args node.func = gast.Name(new_name, gast.Load(), None) return node @@ -193,7 +207,7 @@ class CallTreeTransformer(gast.NodeTransformer): wrapper_def, call_expr = templates.replace( template, call=node.func, - wrapper=self.namer.compiled_function_name(node.func.id), + wrapper=self.context.namer.compiled_function_name(node.func.id)[0], args=tuple(gast.Name(n, gast.Load(), None) for n in args_scope.used)) anno.setanno(call_expr.value, 'args_scope', args_scope) # TODO(mdan): Rename this annotation to 'graph_ready' @@ -201,15 +215,15 @@ class CallTreeTransformer(gast.NodeTransformer): return (wrapper_def, call_expr) - def _function_is_compilable(self, target_obj): + def _function_is_compilable(self, target_entity): # TODO(mdan): This is just a placeholder. Implement. 
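# An illustrative, stdlib-only version of the owner lookup that
# _determine_function_owner performs above: bound methods expose their owner via
# type(__self__), while functions retrieved from a class are resolved by parsing
# __qualname__ against a namespace dict (standing in for context.namespace).
def owner_of(func, namespace):
    if hasattr(func, '__self__'):          # bound method: the owner is the receiver's type
        return type(func.__self__)
    qn = getattr(func, '__qualname__', '').split('.')
    if len(qn) >= 2:                       # e.g. 'TestClass.test_fn' -> look up 'TestClass'
        return namespace.get(qn[-2])
    return None

class TestClass(object):

    def test_fn(self):
        pass

print(owner_of(TestClass.test_fn, {'TestClass': TestClass}))  # the TestClass class
print(owner_of(TestClass().test_fn, {}))                      # also TestClass, via __self__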
- return not isinstance(target_obj, types.BuiltinFunctionType) + return not isinstance(target_entity, types.BuiltinFunctionType) def visit_Expr(self, node): if isinstance(node.value, gast.Call): if anno.hasanno(node.value.func, 'live_val'): - target_obj = anno.getanno(node.value.func, 'live_val') - if not self._function_is_compilable(target_obj): + target_entity = anno.getanno(node.value.func, 'live_val') + if not self._function_is_compilable(target_entity): if anno.hasanno(node.value.func, 'fqn'): target_fqn = anno.getanno(node.value.func, 'fqn') if not self._should_compile(node.value, target_fqn): @@ -227,8 +241,8 @@ class CallTreeTransformer(gast.NodeTransformer): # If the function is wrapped by one of the marker decorators, # consider it graph ready. if anno.hasanno(node.func, 'live_val'): - target_obj = anno.getanno(node.func, 'live_val') - if target_obj in self.nocompile_decorators: + target_entity = anno.getanno(node.func, 'live_val') + if target_entity in self.nocompile_decorators: if len(node.args) < 1: raise ValueError( 'Found call to decorator function "%s", but it had no arguments. ' @@ -237,28 +251,28 @@ class CallTreeTransformer(gast.NodeTransformer): self.generic_visit(node) if anno.hasanno(node.func, 'live_val'): - target_obj = anno.getanno(node.func, 'live_val') - if self._function_is_compilable(target_obj): + target_entity = anno.getanno(node.func, 'live_val') + if self._function_is_compilable(target_entity): node = self._rename_compilable_function(node) else: raise NotImplementedError('py_func with return values') - elif anno.hasanno(node.func, 'type_fqn'): - node = self._rename_member_function_of_known_type(node) else: - raise NotImplementedError( - 'Member function call (of unknown type): %s.' % node.func.id) + if self.context.recursive: + raise NotImplementedError('Could not resolve target function.') + else: + # TODO(mdan): Double check. Is this reachable code? + pass return node # pylint:enable=invalid-name -def transform(node, namer, namespace, uncompiled_modules, nocompile_decorators): +def transform(node, context, uncompiled_modules, nocompile_decorators): """Transform function call to the compiled counterparts. Args: node: AST to transform. - namer: FunctionNamer-like. - namespace: Dict mapping symbol names to their corresponding live objects. + context: An EntityContext object. uncompiled_modules: set of string tuples, each tuple represents the fully qualified name of a package containing functions that will not be compiled. 
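# A sketch of the namer contract that the refactored call_trees code relies on:
# compiled_function_name takes a fully qualified name (string or tuple) and
# returns a (new_name, do_rename) pair, declining to rename members of classes
# that are converted as a whole. Names below are illustrative only.
class SketchNamer(object):

    def __init__(self, global_namespace):
        self.global_namespace = set(global_namespace)

    def compiled_function_name(self, original_fqn, live_entity=None, owner_type=None):
        if owner_type is not None:
            return None, False             # members are renamed with their class instead
        if isinstance(original_fqn, tuple):
            original_fqn = '__'.join(original_fqn)
        new_name, n = 'tf__%s' % original_fqn, 0
        while new_name in self.global_namespace:   # avoid clobbering existing globals
            n += 1
            new_name = 'tf__%s_%d' % (original_fqn, n)
        self.global_namespace.add(new_name)
        return new_name, True

namer = SketchNamer({'tf__foo'})
print(namer.compiled_function_name(('foo',)))                   # ('tf__foo_1', True)
print(namer.compiled_function_name('bar', owner_type=object))   # (None, False)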
@@ -269,7 +283,6 @@ def transform(node, namer, namespace, uncompiled_modules, nocompile_decorators): node: The transformed AST new_names: set(string), containing any newly-generated names """ - transformer = CallTreeTransformer(namer, namespace, uncompiled_modules, - nocompile_decorators) - node = transformer.visit(node) + t = CallTreeTransformer(context, uncompiled_modules, nocompile_decorators) + node = t.visit(node) return node diff --git a/tensorflow/contrib/py2tf/converters/call_trees_test.py b/tensorflow/contrib/py2tf/converters/call_trees_test.py index 8cb8d7be0f1..e63c10de0fe 100644 --- a/tensorflow/contrib/py2tf/converters/call_trees_test.py +++ b/tensorflow/contrib/py2tf/converters/call_trees_test.py @@ -28,8 +28,13 @@ from tensorflow.python.platform import test class TestNamer(call_trees.FunctionNamer): - def compiled_function_name(self, original_name, live_object=None): - return 'renamed_%s' % original_name + def compiled_function_name(self, + original_fqn, + live_entity=None, + owner_type=None): + if owner_type is not None: + return None, False + return ('renamed_%s' % '_'.join(original_fqn)), True class CallTreesTest(converter_test_base.TestCase): @@ -45,14 +50,35 @@ class CallTreesTest(converter_test_base.TestCase): def test_fn_2(a): return test_fn_1(a) + 1 - node = self.parse_and_analyze(test_fn_2, {'test_fn_1': test_fn_1}) - node = call_trees.transform(node, TestNamer(), {}, (), ()) + node = self.parse_and_analyze( + test_fn_2, {'test_fn_1': test_fn_1}, namer=TestNamer()) + node = call_trees.transform(node, self.ctx, (), ()) result = compiler.ast_to_object(node) # Only test_fn_2 is transformed, so we'll insert renamed_test_fn_1 manually. setattr(result, 'renamed_test_fn_1', renamed_test_fn_1) self.assertEquals(3, result.test_fn_2(1)) + def test_simple_methods(self): + + class TestClass(object): + + def test_fn_1(self, a): + return a + 1 + + def test_fn_2(self, a): + return self.test_fn_1(a) + 1 + + node = self.parse_and_analyze( + TestClass.test_fn_2, {'TestClass': TestClass}, + namer=TestNamer(), + arg_types={'self': (TestClass.__name__, TestClass)}) + node = call_trees.transform(node, self.ctx, (), ()) + result = compiler.ast_to_object(node) + + tc = TestClass() + self.assertEquals(3, result.test_fn_2(tc, 1)) + def test_uncompiled_modules(self): def test_fn(a): @@ -60,11 +86,13 @@ class CallTreesTest(converter_test_base.TestCase): a = math_ops.add(a, constant_op.constant(1)) return a - node = self.parse_and_analyze(test_fn, { - 'math_ops': math_ops, - 'constant_op': constant_op - }) - node = call_trees.transform(node, TestNamer(), {}, + node = self.parse_and_analyze( + test_fn, { + 'math_ops': math_ops, + 'constant_op': constant_op + }, + namer=TestNamer()) + node = call_trees.transform(node, self.ctx, set(((math_ops.__name__,), (constant_op.__name__,))), ()) result = compiler.ast_to_object(node) diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py index ed006bad6d8..6bfa55443c4 100644 --- a/tensorflow/contrib/py2tf/converters/converter_test_base.py +++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py @@ -31,18 +31,23 @@ class TestCase(test.TestCase): def parse_and_analyze(self, test_fn, namespace, + namer=None, arg_types=None, - include_type_analysis=True): + include_type_analysis=True, + recursive=True): + node, source = parser.parse_entity(test_fn) ctx = context.EntityContext( - namer=None, - source_code=None, + namer=namer, + source_code=source, source_file=None, 
namespace=namespace, arg_values=None, - arg_types=arg_types) - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, namespace, {}) + arg_types=arg_types, + recursive=recursive) + node = access.resolve(node, ctx) + node = live_values.resolve(node, ctx, {}) if include_type_analysis: node = type_info.resolve(node, ctx) + node = live_values.resolve(node, ctx, {}) + self.ctx = ctx return node diff --git a/tensorflow/contrib/py2tf/converters/decorators.py b/tensorflow/contrib/py2tf/converters/decorators.py index a4313bfa510..3f620c1cd2d 100644 --- a/tensorflow/contrib/py2tf/converters/decorators.py +++ b/tensorflow/contrib/py2tf/converters/decorators.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Handles decorators.""" +"""Handles decorators. + +Note: this module only deals with functions whose decorators are still recorded +in the AST. This does not always happen. See the unit test for an example. +""" from __future__ import absolute_import from __future__ import division @@ -34,17 +38,19 @@ class DecoratorsTransformer(gast.NodeTransformer): def visit_FunctionDef(self, node): self.generic_visit(node) + kept_decorators = [] for dec in node.decorator_list: if isinstance(dec, gast.Call): - dec = dec.func - if not anno.hasanno(dec, 'live_val'): + dec_func = dec.func + else: + dec_func = dec + if not anno.hasanno(dec_func, 'live_val'): raise ValueError( - 'Could not resolve decorator: %s' % pretty_printer.fmt(dec)) - dec_value = anno.getanno(dec, 'live_val') - if dec_value in self.remove_decorators: - continue - raise ValueError('Dont know how to convert decorators for now.') - node.decorator_list = [] + 'Could not resolve decorator: %s' % pretty_printer.fmt(dec_func)) + dec_value = anno.getanno(dec_func, 'live_val') + if dec_value not in self.remove_decorators: + kept_decorators.append(dec) + node.decorator_list = kept_decorators return node # pylint:enable=invalid-name diff --git a/tensorflow/contrib/py2tf/converters/decorators_test.py b/tensorflow/contrib/py2tf/converters/decorators_test.py new file mode 100644 index 00000000000..f50d593043a --- /dev/null +++ b/tensorflow/contrib/py2tf/converters/decorators_test.py @@ -0,0 +1,96 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
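# A standalone illustration of the decorator handling introduced above: rather
# than stripping every decorator, only those on a removal list are dropped and
# the rest stay on the FunctionDef. The real converter matches decorators by
# their resolved live values; this sketch matches by name for brevity.
import ast

class FilterDecorators(ast.NodeTransformer):

    def __init__(self, remove_names):
        self.remove_names = set(remove_names)

    def visit_FunctionDef(self, node):
        self.generic_visit(node)
        kept = []
        for dec in node.decorator_list:
            # A decorator applied with arguments appears as a Call node.
            func = dec.func if isinstance(dec, ast.Call) else dec
            name = func.id if isinstance(func, ast.Name) else None
            if name not in self.remove_names:
                kept.append(dec)
        node.decorator_list = kept
        return node

src = "@convert()\n@other_decorator\ndef f(x):\n    return x\n"
tree = FilterDecorators({'convert'}).visit(ast.parse(src))
print(len(tree.body[0].decorator_list))  # 1: only other_decorator is kept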
+# ============================================================================== +"""Tests for decorators module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import textwrap + +from tensorflow.contrib.py2tf.converters import converter_test_base +from tensorflow.contrib.py2tf.converters import decorators +from tensorflow.contrib.py2tf.pyct import compiler +from tensorflow.python.platform import test +from tensorflow.python.util import tf_inspect + + +class DecoratorsTest(converter_test_base.TestCase): + + def test_function_decorator(self): + + def function_decorator(): + + def decorator(f): + return lambda a: f(a) + 1 + + return decorator + + # The Python parser does capture decorators into the AST. + # However, the interpreter desugars them on load, and refering to the + # decorated function at runtime usually loses any trace of the decorator. + # Below is an example when that doesn't happen. + def static_wrapper(): + + @function_decorator() + def test_fn(a): # pylint:disable=unused-variable + return a + + node = self.parse_and_analyze(static_wrapper, + {'function_decorator': function_decorator}) + node = node.body[0].body[0] + + node = decorators.transform(node, remove_decorators=()) + result = compiler.ast_to_object( + node, + source_prefix=textwrap.dedent(tf_inspect.getsource(function_decorator))) + self.assertEqual(2, result.test_fn(1)) + + node = decorators.transform(node, remove_decorators=(function_decorator,)) + result = compiler.ast_to_object(node) + self.assertEqual(1, result.test_fn(1)) + + def test_simple_decorator(self): + + def simple_decorator(f): + return lambda a: f(a) + 1 + + # The Python parser does capture decorators into the AST. + # However, the interpreter desugars them upon load, and refering to the + # decorated function at runtime usually loses any trace of the decorator. + # Below is an example when that doesn't happen. + def static_wrapper(): + + @simple_decorator + def test_fn(a): # pylint:disable=unused-variable + return a + + node = self.parse_and_analyze(static_wrapper, + {'simple_decorator': simple_decorator}) + node = node.body[0].body[0] + + node = decorators.transform(node, remove_decorators=()) + result = compiler.ast_to_object( + node, + source_prefix=textwrap.dedent(tf_inspect.getsource(simple_decorator))) + self.assertEqual(2, result.test_fn(1)) + + node = decorators.transform(node, remove_decorators=(simple_decorator,)) + result = compiler.ast_to_object(node) + self.assertEqual(1, result.test_fn(1)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py index 4df723989d4..1eda8ae630a 100644 --- a/tensorflow/contrib/py2tf/converters/side_effect_guards.py +++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py @@ -96,6 +96,7 @@ class SideEffectGuardTransformer(gast.NodeTransformer): return node def _gate_symbols(self, guard_statement, guarded_args): + # TODO(mdan): This won't work for variables. template = """ (args,) = (tf.identity(a) for a in (args,)) """ @@ -110,33 +111,22 @@ class SideEffectGuardTransformer(gast.NodeTransformer): # opt.minimize(loss) # or: # tf.py_func(...) - args_scope = anno.getanno(node.value, 'args_scope') - temp_name = self.namer.new_symbol('temp', args_scope.parent.referenced) - # TODO(mdan): Unsafe reference modification! 
- args_scope.mark_write(temp_name) template = """ - temp_result = call - if temp_result is not None: - if not isinstance(temp_result, (list, tuple)): - temp_result = (temp_result,) - ctx = tf.control_dependencies(temp_result) - else: - ctx = contextmanager(lambda: (yield))() - with ctx: - # TODO(mdan): Also insert ops to re-fetch if variables are involved. + with py2tf_utils.control_dependency_on_returns(tf, call): + # TODO(mdan): Also insert ops to re-fetch if variables are involved? pass # Will be removed below. """ # TODO(mdan): This is brittle. Reorganize the mechanism. - statements = templates.replace( - template, call=node.value, temp_result=temp_name) + statements = templates.replace(template, call=node.value) control_deps_guard = statements[-1] control_deps_guard.body = [] # First, attempt to gate future evaluation of args. If that's not # possible, gate all remaining statements (and that may fail too, see # _visit_and_reindent. - guarded_args = tuple( - n for n in args_scope.used if n in args_scope.parent.modified) + args_scope = anno.getanno(node.value, 'args_scope') + guarded_args = tuple(args_scope.used & (args_scope.parent.modified + | args_scope.parent.returned)) if guarded_args: node = tuple(statements[:-1]) + ( self._gate_symbols(control_deps_guard, guarded_args),) diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py index 5c56973dc2a..452d7ab2be8 100644 --- a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py +++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.py2tf import utils from tensorflow.contrib.py2tf.converters import converter_test_base from tensorflow.contrib.py2tf.converters import side_effect_guards from tensorflow.contrib.py2tf.pyct import compiler @@ -46,6 +47,7 @@ class SideEffectGuardsTest(converter_test_base.TestCase): node = side_effect_guards.transform(node, TestNamer()) result = compiler.ast_to_object(node) setattr(result, 'state_ops', state_ops) + setattr(result, 'py2tf_utils', utils) # TODO(mdan): Configure the namespaces instead of doing these hacks. 
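# The template above leans on a py2tf_utils helper for control dependencies; the
# underlying TF 1.x pattern it expands to looks roughly like this: run the
# side-effecting op, then re-read the gated symbols through tf.identity so that
# downstream ops are forced to wait for the side effect.
import tensorflow as tf  # TF 1.x graph-mode API assumed

v = tf.Variable(0)
assign_op = tf.assign(v, 5)           # the guarded, side-effecting call
with tf.control_dependencies([assign_op]):
    gated_v = tf.identity(v)          # gating the symbol, as _gate_symbols does

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(gated_v))          # 5: the read happens after the assignment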
ops.identity = array_ops.identity diff --git a/tensorflow/contrib/py2tf/impl/BUILD b/tensorflow/contrib/py2tf/impl/BUILD new file mode 100644 index 00000000000..22f0c25cabc --- /dev/null +++ b/tensorflow/contrib/py2tf/impl/BUILD @@ -0,0 +1,65 @@ +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow:tensorflow.bzl", "py_test") + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +py_library( + name = "impl", + srcs = [ + "api.py", + "config.py", + "conversion.py", + "naming.py", + ], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:__subpackages__"], + deps = [ + "//tensorflow/contrib/py2tf/converters", + "//tensorflow/contrib/py2tf/pyct", + "//tensorflow/contrib/py2tf/pyct/static_analysis", + "@gast_archive//:gast", + "@six_archive//:six", + ], +) + +py_test( + name = "api_test", + srcs = ["api_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":impl", + "//tensorflow/python:client_testlib", + ], +) + +py_test( + name = "conversion_test", + srcs = ["conversion_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":impl", + "//tensorflow/python:client_testlib", + "@gast_archive//:gast", + ], +) + +py_test( + name = "naming_test", + srcs = ["naming_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":impl", + "//tensorflow/python:client_testlib", + ], +) diff --git a/tensorflow/contrib/py2tf/api.py b/tensorflow/contrib/py2tf/impl/api.py similarity index 97% rename from tensorflow/contrib/py2tf/api.py rename to tensorflow/contrib/py2tf/impl/api.py index ca1f4e2645e..85d40f31580 100644 --- a/tensorflow/contrib/py2tf/api.py +++ b/tensorflow/contrib/py2tf/impl/api.py @@ -23,8 +23,8 @@ from functools import wraps import gast import six -from tensorflow.contrib.py2tf import config -from tensorflow.contrib.py2tf import conversion +from tensorflow.contrib.py2tf.impl import config +from tensorflow.contrib.py2tf.impl import conversion from tensorflow.contrib.py2tf.pyct import compiler from tensorflow.contrib.py2tf.pyct import parser from tensorflow.python.util import tf_inspect @@ -86,8 +86,8 @@ def convert_inline(f, *args, **kwargs): def convert(recursive=False, arg_types=None): """Decorator that compiles a function to graph mode. - The decorator is dynamic - invoking compilation whenever the decorated function - is called. This means the parameter values are known at compilation. + The decorator is dynamic - invoking compilation whenever the decorated + function is called. This means the parameter values are known at compilation. 
Args: recursive: Whether to recusrively convert any functions that the decorator diff --git a/tensorflow/contrib/py2tf/api_test.py b/tensorflow/contrib/py2tf/impl/api_test.py similarity index 98% rename from tensorflow/contrib/py2tf/api_test.py rename to tensorflow/contrib/py2tf/impl/api_test.py index 2384447708d..dbd079a3ca6 100644 --- a/tensorflow/contrib/py2tf/api_test.py +++ b/tensorflow/contrib/py2tf/impl/api_test.py @@ -18,8 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.py2tf import api -from tensorflow.contrib.py2tf import config +from tensorflow.contrib.py2tf.impl import api +from tensorflow.contrib.py2tf.impl import config from tensorflow.contrib.py2tf.pyct import parser from tensorflow.python.framework import constant_op from tensorflow.python.ops import math_ops diff --git a/tensorflow/contrib/py2tf/config.py b/tensorflow/contrib/py2tf/impl/config.py similarity index 85% rename from tensorflow/contrib/py2tf/config.py rename to tensorflow/contrib/py2tf/impl/config.py index 8c502a7a9e5..6525806a093 100644 --- a/tensorflow/contrib/py2tf/config.py +++ b/tensorflow/contrib/py2tf/impl/config.py @@ -32,7 +32,9 @@ DEFAULT_UNCOMPILED_MODULES = set(( NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',)) # TODO(mdan): Also allow controlling the generated names (for testability). +# TODO(mdan): Verify that these names are not hidden by generated code. +# TODO(mdan): Make sure copybara renames the reference below. COMPILED_IMPORT_STATEMENTS = ( - 'from contextlib import contextmanager', 'import tensorflow as tf', -) + 'from tensorflow.contrib.py2tf import utils as ' + 'py2tf_utils') diff --git a/tensorflow/contrib/py2tf/conversion.py b/tensorflow/contrib/py2tf/impl/conversion.py similarity index 91% rename from tensorflow/contrib/py2tf/conversion.py rename to tensorflow/contrib/py2tf/impl/conversion.py index b484eebbd58..ed71ff5c063 100644 --- a/tensorflow/contrib/py2tf/conversion.py +++ b/tensorflow/contrib/py2tf/impl/conversion.py @@ -21,8 +21,7 @@ from __future__ import print_function import gast import six -from tensorflow.contrib.py2tf import config -from tensorflow.contrib.py2tf import naming +from tensorflow.contrib.py2tf.converters import asserts from tensorflow.contrib.py2tf.converters import break_canonicalization from tensorflow.contrib.py2tf.converters import builtin_functions from tensorflow.contrib.py2tf.converters import call_trees @@ -33,6 +32,8 @@ from tensorflow.contrib.py2tf.converters import for_canonicalization from tensorflow.contrib.py2tf.converters import logical_expressions from tensorflow.contrib.py2tf.converters import print_functions from tensorflow.contrib.py2tf.converters import side_effect_guards +from tensorflow.contrib.py2tf.impl import config +from tensorflow.contrib.py2tf.impl import naming from tensorflow.contrib.py2tf.pyct import context from tensorflow.contrib.py2tf.pyct import parser from tensorflow.contrib.py2tf.pyct.static_analysis import access @@ -171,7 +172,8 @@ def class_to_graph(c, conversion_map): def function_to_graph(f, conversion_map, arg_values, arg_types, owner_type=None): """Specialization of `entity_to_graph` for callable functions.""" - node = parser.parse_object(f).body[0] + node, source = parser.parse_entity(f) + node = node.body[0] namespace = six.get_function_globals(f) # This is needed for non-global functions. 
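# A minimal, stdlib-only sketch of what function_to_graph needs from a live
# function: its source (for parsing) and its globals (for name resolution).
# py2tf wraps the same idea behind parse_entity, tf_inspect and EntityContext.
import ast
import inspect
import textwrap

def entity_to_ast(f):
    source = textwrap.dedent(inspect.getsource(f))
    node = ast.parse(source)      # a Module whose body[0] is the FunctionDef
    namespace = f.__globals__     # what six.get_function_globals(f) returns
    return node.body[0], source, namespace

def example(x):
    return x + 1

fn_node, src, ns = entity_to_ast(example)
print(fn_node.name)           # 'example'
print('example' in ns)        # True: module globals are visible to the converter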
@@ -185,28 +187,29 @@ def function_to_graph(f, conversion_map, arg_values, arg_types, namer = conversion_map.new_namer(namespace) ctx = context.EntityContext( namer=namer, - source_code=tf_inspect.getsource(f), - source_file=tf_inspect.getfile(f), + source_code=source, + source_file='', namespace=namespace, arg_values=arg_values, - arg_types=arg_types) + arg_types=arg_types, + recursive=conversion_map.recursive) node = node_to_graph(node, ctx, conversion_map.nocompile_decorators) - # Simulate a rename to ensure the top level is in the name map. This is needed - # for top level functions, and it also helps the consistency verification made - # by update_name_map. - if owner_type is not None: - new_name = namer.compiled_function_name(f.__name__, f, owner_type) - else: - new_name = namer.compiled_function_name(f.__name__, f) + # TODO(mdan): This somewhat duplicates the call rename logic in call_treest.py + new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type) + if not did_rename: + new_name = f.__name__ + if node.name != f.__name__: + raise NotImplementedError('Strange corner case. Send us offending code!') + node.name = new_name conversion_map.update_name_map(namer) - return node, conversion_map.name_map[f] + return node, new_name def _static_analysis_pass(node, ctx): - node = access.resolve(node) - node = live_values.resolve(node, ctx.namespace, config.PYTHON_LITERALS) + node = access.resolve(node, ctx) + node = live_values.resolve(node, ctx, config.PYTHON_LITERALS) node = type_info.resolve(node, ctx) return node @@ -243,6 +246,7 @@ def node_to_graph(node, ctx, nocompile_decorators): node = _static_analysis_pass(node, ctx) node = decorators.transform(node, nocompile_decorators) node = break_canonicalization.transform(node, ctx.namer) + node = asserts.transform(node, ctx) # Note: sequencing continue canonicalization before for loop one avoids # dealing with the extra loop increment operation that the for @@ -259,8 +263,7 @@ def node_to_graph(node, ctx, nocompile_decorators): node = _static_analysis_pass(node, ctx) node = print_functions.transform(node) - node = call_trees.transform(node, ctx.namer, ctx.namespace, - config.DEFAULT_UNCOMPILED_MODULES, + node = call_trees.transform(node, ctx, config.DEFAULT_UNCOMPILED_MODULES, nocompile_decorators) node = control_flow.transform(node, ctx.namer) node = logical_expressions.transform(node) diff --git a/tensorflow/contrib/py2tf/conversion_test.py b/tensorflow/contrib/py2tf/impl/conversion_test.py similarity index 97% rename from tensorflow/contrib/py2tf/conversion_test.py rename to tensorflow/contrib/py2tf/impl/conversion_test.py index 26f915f4f46..3888958f19b 100644 --- a/tensorflow/contrib/py2tf/conversion_test.py +++ b/tensorflow/contrib/py2tf/impl/conversion_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import gast -from tensorflow.contrib.py2tf import conversion +from tensorflow.contrib.py2tf.impl import conversion from tensorflow.python.platform import test diff --git a/tensorflow/contrib/py2tf/naming.py b/tensorflow/contrib/py2tf/impl/naming.py similarity index 66% rename from tensorflow/contrib/py2tf/naming.py rename to tensorflow/contrib/py2tf/impl/naming.py index a90758962b8..5c7e4c5f95a 100644 --- a/tensorflow/contrib/py2tf/naming.py +++ b/tensorflow/contrib/py2tf/impl/naming.py @@ -18,8 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.util import tf_inspect - class Namer(object): """Implementation of 
the namer interfaces required by various converters. @@ -45,10 +43,15 @@ class Namer(object): self.generated_names = set() - def compiled_class_name(self, original_name, live_object=None): + def compiled_class_name(self, original_fqn, live_entity=None): """See call_trees.FunctionNamer.compiled_class_name.""" - if live_object is not None and live_object in self.renamed_calls: - return self.renamed_calls[live_object] + if live_entity is not None and live_entity in self.renamed_calls: + return self.renamed_calls[live_entity] + + if isinstance(original_fqn, tuple): + original_name = '__'.join(original_fqn) + else: + original_name = original_fqn new_name_root = 'Tf%s' % original_name new_name = new_name_root @@ -57,41 +60,46 @@ class Namer(object): n += 1 new_name = '%s_%d' % (new_name_root, n) - if live_object is not None: - self.renamed_calls[live_object] = new_name + if live_entity is not None: + self.renamed_calls[live_entity] = new_name self.generated_names.add(new_name) + if live_entity is not None: + self.renamed_calls[live_entity] = new_name return new_name def compiled_function_name(self, - original_name, - live_object=None, + original_fqn, + live_entity=None, owner_type=None): """See call_trees.FunctionNamer.compiled_function_name.""" - if live_object is not None and live_object in self.renamed_calls: - return self.renamed_calls[live_object] if not self.recursive: - new_name = original_name - elif owner_type is None or owner_type in self.partial_types: - # Top level functions: rename - new_name_root = 'tf__%s' % original_name - new_name = new_name_root - n = 0 - while new_name in self.global_namespace: - n += 1 - new_name = '%s_%d' % (new_name_root, n) - else: - if tf_inspect.isclass(owner_type): - # Class members: do not rename (the entire class will be renamed) - new_name = original_name - else: - raise NotImplementedError('Member function "%s" of non-class type: %s' % - (original_name, owner_type)) + return None, False - if live_object is not None: - self.renamed_calls[live_object] = new_name + if owner_type is not None and owner_type not in self.partial_types: + # Members are not renamed when part of an entire converted class. 
+ return None, False + + if isinstance(original_fqn, tuple): + original_name = '__'.join(original_fqn) + else: + original_name = original_fqn + + if live_entity is not None and live_entity in self.renamed_calls: + return self.renamed_calls[live_entity], True + + new_name_root = 'tf__%s' % original_name + new_name = new_name_root + n = 0 + while new_name in self.global_namespace: + n += 1 + new_name = '%s_%d' % (new_name_root, n) + + if live_entity is not None: + self.renamed_calls[live_entity] = new_name self.generated_names.add(new_name) - return new_name + + return new_name, True def new_symbol(self, name_root, reserved_locals): """See control_flow.SymbolNamer.new_symbol.""" diff --git a/tensorflow/contrib/py2tf/naming_test.py b/tensorflow/contrib/py2tf/impl/naming_test.py similarity index 82% rename from tensorflow/contrib/py2tf/naming_test.py rename to tensorflow/contrib/py2tf/impl/naming_test.py index 7bfc9b8733b..beb4e54937b 100644 --- a/tensorflow/contrib/py2tf/naming_test.py +++ b/tensorflow/contrib/py2tf/impl/naming_test.py @@ -18,7 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.py2tf import naming +from tensorflow.contrib.py2tf.impl import naming from tensorflow.python.platform import test @@ -29,8 +29,9 @@ class NamerTest(test.TestCase): pass namer = naming.Namer({}, True, None, ()) - self.assertEqual('tf__foo', namer.compiled_function_name('foo')) - self.assertEqual('tf__bar', namer.compiled_function_name('bar', bar)) + self.assertEqual(('tf__foo', True), namer.compiled_function_name('foo')) + self.assertEqual(('tf__bar', True), namer.compiled_function_name( + 'bar', bar)) self.assertEqual({bar: 'tf__bar'}, namer.renamed_calls) self.assertItemsEqual(('tf__bar', 'tf__foo'), namer.generated_names) @@ -39,15 +40,18 @@ class NamerTest(test.TestCase): pass namer = naming.Namer({}, True, None, ()) - self.assertEqual('tf__foo', namer.compiled_function_name('foo', foo)) - self.assertEqual('tf__foo', namer.compiled_function_name('foo', foo)) + self.assertEqual(('tf__foo', True), namer.compiled_function_name( + 'foo', foo)) + self.assertEqual(('tf__foo', True), namer.compiled_function_name( + 'foo', foo)) def test_compiled_function_name_avoids_global_conflicts(self): def foo(): pass namer = naming.Namer({'tf__foo': 1}, True, None, ()) - self.assertEqual('tf__foo_1', namer.compiled_function_name('foo', foo)) + self.assertEqual(('tf__foo_1', True), + namer.compiled_function_name('foo', foo)) def test_new_symbol_tracks_names(self): namer = naming.Namer({}, True, None, ()) diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD index e0331dbc97c..1b2408ba0ea 100644 --- a/tensorflow/contrib/py2tf/pyct/BUILD +++ b/tensorflow/contrib/py2tf/pyct/BUILD @@ -31,6 +31,7 @@ py_library( deps = [ "@astor_archive//:astor", "@gast_archive//:gast", + "@six_archive//:six", "@termcolor_archive//:termcolor", ], ) @@ -38,6 +39,7 @@ py_library( py_test( name = "anno_test", srcs = ["anno_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -47,6 +49,7 @@ py_test( py_test( name = "compiler_test", srcs = ["compiler_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -57,6 +60,7 @@ py_test( py_test( name = "parser_test", srcs = ["parser_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -66,6 +70,7 @@ py_test( py_test( name = "pretty_printer_test", srcs 
= ["pretty_printer_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", @@ -75,6 +80,7 @@ py_test( py_test( name = "templates_test", srcs = ["templates_test.py"], + srcs_version = "PY2AND3", deps = [ ":pyct", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/py2tf/pyct/compiler.py b/tensorflow/contrib/py2tf/pyct/compiler.py index b09353cc72b..fc71469d1ea 100644 --- a/tensorflow/contrib/py2tf/pyct/compiler.py +++ b/tensorflow/contrib/py2tf/pyct/compiler.py @@ -41,7 +41,7 @@ def ast_to_source(node, indentation): return astor.source_repr.pretty_source(generator.result).lstrip() -def ast_to_object(node, indentation=' '): +def ast_to_object(node, indentation=' ', source_prefix=None): """Return the Python objects represented by given AST. Compiling the AST code this way ensures that the source code is readable by @@ -50,6 +50,7 @@ def ast_to_object(node, indentation=' '): Args: node: The code to compile, as an AST object. indentation: The string to use for indentation. + source_prefix: Optional string to print as-is into the source file. Returns: A module object containing the compiled source code. @@ -58,5 +59,8 @@ def ast_to_object(node, indentation=' '): with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: module_name = os.path.basename(f.name[:-3]) + if source_prefix: + f.write(source_prefix) + f.write('\n') f.write(source) return imp.load_source(module_name, f.name) diff --git a/tensorflow/contrib/py2tf/pyct/context.py b/tensorflow/contrib/py2tf/pyct/context.py index 73f3613d09d..fef74ebefa2 100644 --- a/tensorflow/contrib/py2tf/pyct/context.py +++ b/tensorflow/contrib/py2tf/pyct/context.py @@ -33,10 +33,11 @@ class EntityContext(object): """ def __init__(self, namer, source_code, source_file, namespace, arg_values, - arg_types): + arg_types, recursive): self.namer = namer self.source_code = source_code self.source_file = source_file self.namespace = namespace self.arg_values = {} if arg_values is None else arg_values self.arg_types = {} if arg_types is None else arg_types + self.recursive = recursive diff --git a/tensorflow/contrib/py2tf/pyct/parser.py b/tensorflow/contrib/py2tf/pyct/parser.py index 3daa69b9cef..dc7df883b34 100644 --- a/tensorflow/contrib/py2tf/pyct/parser.py +++ b/tensorflow/contrib/py2tf/pyct/parser.py @@ -28,11 +28,13 @@ import gast from tensorflow.python.util import tf_inspect -def parse_object(obj): - """Return the AST of given object.""" - return parse_str(tf_inspect.getsource(obj)) +def parse_entity(entity): + """Return the AST of given entity.""" + source = tf_inspect.getsource(entity) + source = textwrap.dedent(source) + return parse_str(source), source def parse_str(src): """Return the AST of given piece of code.""" - return gast.parse(textwrap.dedent(src)) + return gast.parse(src) diff --git a/tensorflow/contrib/py2tf/pyct/parser_test.py b/tensorflow/contrib/py2tf/pyct/parser_test.py index 46f9aa82071..f35dfa04c70 100644 --- a/tensorflow/contrib/py2tf/pyct/parser_test.py +++ b/tensorflow/contrib/py2tf/pyct/parser_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import textwrap + from tensorflow.contrib.py2tf.pyct import parser from tensorflow.python.platform import test @@ -28,15 +30,16 @@ def f(x): class ParserTest(test.TestCase): - def test_parse_object(self): - mod = parser.parse_object(f) + def test_parse_entity(self): + mod, _ = parser.parse_entity(f) self.assertEqual('f', 
mod.body[0].name) def test_parse_str(self): - mod = parser.parse_str(""" + mod = parser.parse_str( + textwrap.dedent(""" def f(x): return x + 1 - """) + """)) self.assertEqual('f', mod.body[0].name) diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD index abaf9536781..32e2954fffc 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD @@ -32,6 +32,7 @@ py_library( py_test( name = "access_test", srcs = ["access_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", @@ -43,6 +44,7 @@ py_test( py_test( name = "live_values_test", srcs = ["live_values_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", @@ -53,6 +55,7 @@ py_test( py_test( name = "type_info_test", srcs = ["type_info_test.py"], + srcs_version = "PY2AND3", deps = [ ":static_analysis", "//tensorflow/contrib/py2tf/pyct", diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access.py index 8f3ac48b68c..33629f87d1d 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/access.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/access.py @@ -23,6 +23,7 @@ import copy import gast from tensorflow.contrib.py2tf.pyct import anno +from tensorflow.contrib.py2tf.pyct import transformer # TODO(mdan): Add support for PY3 (e.g. Param vs arg). @@ -53,6 +54,8 @@ class Scope(object): self.modified = set() self.created = set() self.used = set() + self.params = set() + self.returned = set() # TODO(mdan): Rename to `locals` @property @@ -69,42 +72,73 @@ class Scope(object): self.modified = copy.copy(other.modified) self.created = copy.copy(other.created) self.used = copy.copy(other.used) + self.params = copy.copy(other.params) + self.returned = copy.copy(other.returned) def merge_from(self, other): self.modified |= other.modified self.created |= other.created self.used |= other.used + self.params |= other.params + self.returned |= other.returned def has(self, name): - if name in self.modified: + if name in self.modified or name in self.params: return True elif self.parent is not None: return self.parent.has(name) return False + def is_modified_since_entry(self, name): + if name in self.modified: + return True + elif self.parent is not None and not self.isolated: + return self.parent.is_modified_since_entry(name) + return False + + def is_param(self, name): + if name in self.params: + return True + elif self.parent is not None and not self.isolated: + return self.parent.is_param(name) + return False + def mark_read(self, name): self.used.add(name) if self.parent is not None and name not in self.created: self.parent.mark_read(name) + def mark_param(self, name): + self.params.add(name) + + def mark_creation(self, name): + self.created.add(name) + def mark_write(self, name): self.modified.add(name) if self.isolated: - self.created.add(name) + self.mark_creation(name) else: if self.parent is None: - self.created.add(name) + self.mark_creation(name) else: if not self.parent.has(name): - self.created.add(name) + self.mark_creation(name) self.parent.mark_write(name) + def mark_returned(self, name): + self.returned.add(name) + if not self.isolated and self.parent is not None: + self.parent.mark_returned(name) -class AccessResolver(gast.NodeTransformer): + +class AccessResolver(transformer.Base): """Annotates nodes with local scope information. 
See Scope.""" - def __init__(self): + def __init__(self, context): + super(AccessResolver, self).__init__(context) self.scope = Scope(None) + self._in_return_statement = False def visit_Name(self, node): # TODO(mdan): This is insufficient for object fields, e.g. hp.learning_rate. @@ -120,10 +154,17 @@ class AccessResolver(gast.NodeTransformer): # TODO(mdan): This bay be incorrect with nested functions. # For nested functions, we'll have to add the notion of hiding args from # the parent scope, not writing to them. - self.scope.mark_write(node.id) + self.scope.mark_creation(node.id) + self.scope.mark_param(node.id) else: raise ValueError('Unknown context %s for node %s.' % (type(node.ctx), node.id)) + anno.setanno(node, 'is_modified_since_entry', + self.scope.is_modified_since_entry(node.id)) + anno.setanno(node, 'is_param', self.scope.is_param(node.id)) + + if self._in_return_statement: + self.scope.mark_returned(node.id) return node def visit_Print(self, node): @@ -138,7 +179,7 @@ class AccessResolver(gast.NodeTransformer): def visit_Call(self, node): current_scope = self.scope - args_scope = Scope(current_scope) + args_scope = Scope(current_scope, isolated=False) self.scope = args_scope for n in node.args: self.visit(n) @@ -200,6 +241,12 @@ class AccessResolver(gast.NodeTransformer): node, ((node.body, 'body'), (node.orelse, 'orelse'))) return node + def visit_Return(self, node): + self._in_return_statement = True + node = self.generic_visit(node) + self._in_return_statement = False + return node -def resolve(node): - return AccessResolver().visit(node) + +def resolve(node, context): + return AccessResolver(context).visit(node) diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py index 0912ebb4c35..df0283b54d5 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/access_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import gast from tensorflow.contrib.py2tf.pyct import anno +from tensorflow.contrib.py2tf.pyct import context from tensorflow.contrib.py2tf.pyct import parser from tensorflow.contrib.py2tf.pyct.static_analysis import access from tensorflow.python.platform import test @@ -95,6 +96,19 @@ class ScopeTest(test.TestCase): class AccessResolverTest(test.TestCase): + def _parse_and_analyze(self, test_fn): + node, source = parser.parse_entity(test_fn) + ctx = context.EntityContext( + namer=None, + source_code=source, + source_file=None, + namespace={}, + arg_values=None, + arg_types=None, + recursive=True) + node = access.resolve(node, ctx) + return node + def test_local_markers(self): def test_fn(a): # pylint:disable=unused-argument @@ -103,9 +117,7 @@ class AccessResolverTest(test.TestCase): b -= 1 return b - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) self.assertFalse(anno.getanno(node.body[0].body[0].value, 'is_local')) # c in b = c self.assertTrue(anno.getanno(node.body[0].body[1].test.left, @@ -126,9 +138,7 @@ class AccessResolverTest(test.TestCase): print(a, b) return c - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) print_node = node.body[0].body[2] if isinstance(print_node, gast.Print): # Python 2 @@ -151,9 +161,7 @@ class AccessResolverTest(test.TestCase): foo(a, b) # pylint:disable=undefined-variable return c - node = parser.parse_object(test_fn) - node = access.resolve(node) 
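# A toy version of the bookkeeping the access resolver performs: walk a function
# and record which names are parameters, which are written, which are read, and
# which appear inside return statements. The real Scope also tracks nesting and
# isolation, omitted here.
import ast

class NameUsage(ast.NodeVisitor):

    def __init__(self):
        self.params, self.reads, self.writes, self.returned = set(), set(), set(), set()
        self._in_return = False

    def visit_arg(self, node):
        self.params.add(node.arg)

    def visit_Name(self, node):
        if isinstance(node.ctx, ast.Store):
            self.writes.add(node.id)
        else:
            self.reads.add(node.id)
        if self._in_return:
            self.returned.add(node.id)

    def visit_Return(self, node):
        self._in_return = True
        self.generic_visit(node)
        self._in_return = False

usage = NameUsage()
usage.visit(ast.parse("def f(a):\n    b = a + 1\n    return b\n"))
print(usage.params, usage.writes, usage.reads, usage.returned)
# e.g. {'a'} {'b'} {'a', 'b'} {'b'}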
- + node = self._parse_and_analyze(test_fn) call_node = node.body[0].body[2].value # We basically need to detect which variables are captured by the call # arguments. @@ -169,15 +177,13 @@ class AccessResolverTest(test.TestCase): b -= 1 return b, c - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) while_node = node.body[0].body[1] self.assertScopeIs( anno.getanno(while_node, 'body_scope'), ('b',), ('b', 'c'), ('c',)) self.assertScopeIs( anno.getanno(while_node, 'body_parent_scope'), ('a', 'b', 'c'), - ('a', 'b', 'c'), ('a', 'b', 'c')) + ('b', 'c'), ('a', 'b', 'c')) def test_for(self): @@ -188,15 +194,13 @@ class AccessResolverTest(test.TestCase): b -= 1 return b, c - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) for_node = node.body[0].body[1] self.assertScopeIs( anno.getanno(for_node, 'body_scope'), ('b',), ('b', 'c'), ('c',)) self.assertScopeIs( anno.getanno(for_node, 'body_parent_scope'), ('a', 'b', 'c'), - ('a', 'b', 'c', '_'), ('a', 'b', 'c', '_')) + ('b', 'c', '_'), ('a', 'b', 'c', '_')) def test_if(self): @@ -211,9 +215,7 @@ class AccessResolverTest(test.TestCase): u = -y return z, u - node = parser.parse_object(test_fn) - node = access.resolve(node) - + node = self._parse_and_analyze(test_fn) if_node = node.body[0].body[0] self.assertScopeIs( anno.getanno(if_node, 'body_scope'), ('x', 'y'), ('x', 'y', 'z'), diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py index 242e544b528..5a2903e6b59 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py @@ -26,26 +26,19 @@ from __future__ import print_function import gast from tensorflow.contrib.py2tf.pyct import anno +from tensorflow.contrib.py2tf.pyct import transformer -class LiveValueResolver(gast.NodeTransformer): +class LiveValueResolver(transformer.Base): """Annotates nodes with live values.""" - def __init__(self, namespace, literals): - """Create a new resolver. - - Args: - namespace: A dict representing the namespace visible to the AST in the - intended execution context. - literals: A dict mapping literal lymbol names to their value. An example - literal is "None". - """ - self.namespace = namespace + def __init__(self, context, literals): + super(LiveValueResolver, self).__init__(context) self.literals = literals def visit_ClassDef(self, node): self.generic_visit(node) - anno.setanno(node, 'live_val', self.namespace[node.name]) + anno.setanno(node, 'live_val', self.context.namespace[node.name]) return node def visit_Name(self, node): @@ -53,20 +46,31 @@ class LiveValueResolver(gast.NodeTransformer): if isinstance(node.ctx, gast.Load): assert anno.hasanno(node, 'is_local'), node symbol_is_local = anno.getanno(node, 'is_local') - if not symbol_is_local: + assert anno.hasanno(node, 'is_modified_since_entry'), node + symbol_is_modified = anno.getanno(node, 'is_modified_since_entry') + assert anno.hasanno(node, 'is_param'), node + symbol_is_param = anno.getanno(node, 'is_param') + + if not symbol_is_local and not symbol_is_param: if node.id in self.literals: anno.setanno(node, 'live_val', self.literals[node.id]) # TODO(mdan): Could live values have FQNs? i.e. 
'a'.join() - elif node.id in self.namespace: - obj = self.namespace[node.id] + elif node.id in self.context.namespace: + obj = self.context.namespace[node.id] anno.setanno(node, 'live_val', obj) anno.setanno(node, 'fqn', (obj.__name__,)) else: - raise ValueError('Could not find global symbol %s.' % node.id) + raise ValueError('Could not resolve symbol "%s".' % node.id) else: pass # TODO(mdan): Attempt to trace its value through the local chain. # TODO(mdan): Use type annotations as fallback. + + if not symbol_is_modified: + if node.id in self.context.arg_values: + obj = self.context.arg_values[node.id] + anno.setanno(node, 'live_val', obj) + anno.setanno(node, 'fqn', (obj.__class__.__name__,)) return node def visit_Attribute(self, node): @@ -79,15 +83,25 @@ class LiveValueResolver(gast.NodeTransformer): node.attr)) anno.setanno(node, 'live_val', getattr(parent_object, node.attr)) anno.setanno(node, 'fqn', anno.getanno(node.value, 'fqn') + (node.attr,)) + # TODO(mdan): Investigate the role built-in annotations can play here. + elif anno.hasanno(node.value, 'type'): + parent_type = anno.getanno(node.value, 'type') + if hasattr(parent_type, node.attr): + # This should hold for static members like methods. + # This would not hold for dynamic members like function attributes. + # For the dynamic case, we simply leave the node without an annotation, + # and let downstream consumers figure out what to do. + anno.setanno(node, 'live_val', getattr(parent_type, node.attr)) + anno.setanno(node, 'fqn', + anno.getanno(node.value, 'type_fqn') + (node.attr,)) elif isinstance(node.value, gast.Name): stem_name = node.value # All nonlocal symbols should be fully resolved. assert anno.hasanno(stem_name, 'is_local'), stem_name - assert anno.getanno(stem_name, 'is_local'), stem_name # TODO(mdan): Figure out what to do when calling attribute on local object # Maybe just leave as-is? 
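# A simplified picture of live-value resolution after the changes above: a
# loaded global name is looked up in the namespace and annotated with the live
# object plus its fully qualified name, and an attribute on an annotated value
# resolves through getattr. A plain dict stands in for the anno module.
import ast
import math

def annotate(node, namespace, annotations):
    for child in ast.iter_child_nodes(node):
        annotate(child, namespace, annotations)   # children first, so parents can build on them
    if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
        if node.id in namespace:
            annotations[node] = {'live_val': namespace[node.id], 'fqn': (node.id,)}
    elif isinstance(node, ast.Attribute):
        parent = annotations.get(node.value)
        if parent is not None and hasattr(parent['live_val'], node.attr):
            annotations[node] = {'live_val': getattr(parent['live_val'], node.attr),
                                 'fqn': parent['fqn'] + (node.attr,)}

annotations = {}
annotate(ast.parse("y = math.sqrt(x)"), {'math': math}, annotations)
print([a['fqn'] for a in annotations.values()])   # [('math',), ('math', 'sqrt')]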
return node -def resolve(node, namespace, literals): - return LiveValueResolver(namespace, literals).visit(node) +def resolve(node, context, literals): + return LiveValueResolver(context, literals).visit(node) diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py index e77497654a0..f3057b34667 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py @@ -19,24 +19,45 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.py2tf.pyct import anno +from tensorflow.contrib.py2tf.pyct import context from tensorflow.contrib.py2tf.pyct import parser from tensorflow.contrib.py2tf.pyct.static_analysis import access from tensorflow.contrib.py2tf.pyct.static_analysis import live_values +from tensorflow.contrib.py2tf.pyct.static_analysis import type_info from tensorflow.python.framework import constant_op from tensorflow.python.platform import test class LiveValuesResolverTest(test.TestCase): + def _parse_and_analyze(self, + test_fn, + namespace, + literals=None, + arg_types=None): + literals = literals or {} + arg_types = arg_types or {} + node, source = parser.parse_entity(test_fn) + ctx = context.EntityContext( + namer=None, + source_code=source, + source_file=None, + namespace=namespace, + arg_values=None, + arg_types=arg_types, + recursive=True) + node = access.resolve(node, ctx) + node = live_values.resolve(node, ctx, literals) + node = type_info.resolve(node, ctx) + node = live_values.resolve(node, ctx, literals) + return node + def test_literals(self): def test_fn(): return Foo # pylint: disable=undefined-variable - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, {}, {'Foo': 'bar'}) - + node = self._parse_and_analyze(test_fn, {}, {'Foo': 'bar'}) retval_node = node.body[0].body[0].value self.assertEquals('bar', anno.getanno(retval_node, 'live_val')) @@ -48,10 +69,7 @@ class LiveValuesResolverTest(test.TestCase): def test_fn(): return foo() - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, {'foo': foo}, {}) - + node = self._parse_and_analyze(test_fn, {'foo': foo}) func_node = node.body[0].body[0].value.func self.assertEquals(foo, anno.getanno(func_node, 'live_val')) self.assertEquals(('foo',), anno.getanno(func_node, 'fqn')) @@ -61,15 +79,29 @@ class LiveValuesResolverTest(test.TestCase): def test_fn(): return constant_op.constant(0) - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, {'constant_op': constant_op}, {}) - + node = self._parse_and_analyze(test_fn, {'constant_op': constant_op}) func_node = node.body[0].body[0].value.func self.assertEquals(constant_op.constant, anno.getanno(func_node, 'live_val')) self.assertEquals((constant_op.__name__, 'constant'), anno.getanno(func_node, 'fqn')) + def test_attributes_with_type_hints(self): + + class TestClass(object): + + def member(self): + pass + + def test_fn(self): + return self.member() + + node = self._parse_and_analyze( + TestClass.test_fn, {'constant_op': constant_op}, + arg_types={'self': (TestClass.__name__, TestClass)}) + func_node = node.body[0].body[0].value.func + self.assertEquals(TestClass.member, anno.getanno(func_node, 'live_val')) + self.assertEquals(('TestClass', 'member'), anno.getanno(func_node, 'fqn')) + if __name__ == '__main__': test.main() diff 
--git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py index 0042aa90ed2..cf74142cbe9 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py @@ -36,8 +36,6 @@ class Scope(object): most recently assigned to the symbol. """ - # TODO(mdan): Should rather use a CFG here? - def __init__(self, parent): """Create a new scope. @@ -117,18 +115,32 @@ class TypeInfoResolver(transformer.Base): node.orelse = self._visit_block(node.orelse) return node + def _process_function_arg(self, arg_name): + if self.function_level == 1 and arg_name in self.context.arg_types: + # Forge a node to hold the type information, so that method calls on + # it can resolve the type. + type_holder = gast.Name(arg_name, gast.Load(), None) + type_string, type_obj = self.context.arg_types[arg_name] + anno.setanno(type_holder, 'type', type_obj) + anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.'))) + self.scope.setval(arg_name, type_holder) + + def visit_arg(self, node): + self._process_function_arg(node.arg) + return node + def visit_Name(self, node): self.generic_visit(node) if isinstance(node.ctx, gast.Param): - self.scope.setval(node.id, gast.Name(node.id, gast.Load(), None)) - if self.function_level == 1 and node.id in self.context.arg_types: - # Forge a node to hold the type information, so that method calls on - # it can resolve the type. - type_holder = gast.Name(node.id, gast.Load(), None) - type_string, type_obj = self.context.arg_types[node.id] - anno.setanno(type_holder, 'type', type_obj) - anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.'))) - self.scope.setval(node.id, type_holder) + self._process_function_arg(node.id) + elif isinstance(node.ctx, gast.Load) and self.scope.hasval(node.id): + # E.g. if we had + # a = b + # then for future references to `a` we should have traced_source = `b` + traced_source = self.scope.getval(node.id) + if anno.hasanno(traced_source, 'type'): + anno.setanno(node, 'type', anno.getanno(traced_source, 'type')) + anno.setanno(node, 'type_fqn', anno.getanno(traced_source, 'type_fqn')) return node def _process_variable_assignment(self, source, targets): @@ -172,38 +184,6 @@ class TypeInfoResolver(transformer.Base): self._process_variable_assignment(node.value, node.targets) return node - def visit_Call(self, node): - target = node.func - if not anno.hasanno(target, 'live_val'): - if not isinstance(target, gast.Attribute): - # Suspecting this pattern would reach here: - # foo = bar - # foo() - raise ValueError('Dont know how to handle dynamic functions.') - if not isinstance(target.value, gast.Name): - # Possible example of this kind: - # foo = module.Foo() - # foo.bar.baz() - # TODO(mdan): This should be doable by using the FQN. - raise ValueError('Dont know how to handle object properties yet.') - # In the example below, object_source is 'tr.train.Optimizer()': - # opt = tf.train.Optimizer() - # opt.foo() - if self.scope.hasval(target.value.id): - object_source = self.scope.getval(target.value.id) - if not anno.hasanno(object_source, 'type'): - raise ValueError('Could not determine type of "%s". Is it dynamic?' % - (target.value.id)) - anno.setanno(target, 'type', anno.getanno(object_source, 'type')) - anno.setanno(target, 'type_fqn', anno.getanno(object_source, - 'type_fqn')) - else: - # TODO(mdan): Figure out what could the user do to get past this. - raise ValueError('No info on "%s". 
Is it dynamically built?' % - (target.value.id)) - self.generic_visit(node) - return node - def resolve(node, context): return TypeInfoResolver(context).visit(node) diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py index a491f49ca3b..68fa1ee92a7 100644 --- a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py +++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py @@ -21,7 +21,6 @@ from __future__ import print_function from tensorflow.contrib.py2tf.pyct import anno from tensorflow.contrib.py2tf.pyct import context from tensorflow.contrib.py2tf.pyct import parser -from tensorflow.contrib.py2tf.pyct import transformer from tensorflow.contrib.py2tf.pyct.static_analysis import access from tensorflow.contrib.py2tf.pyct.static_analysis import live_values from tensorflow.contrib.py2tf.pyct.static_analysis import type_info @@ -57,17 +56,19 @@ class ScopeTest(test.TestCase): class TypeInfoResolverTest(test.TestCase): def _parse_and_analyze(self, test_fn, namespace, arg_types=None): + node, source = parser.parse_entity(test_fn) ctx = context.EntityContext( namer=None, - source_code=None, + source_code=source, source_file=None, namespace=namespace, arg_values=None, - arg_types=arg_types) - node = parser.parse_object(test_fn) - node = access.resolve(node) - node = live_values.resolve(node, namespace, {}) + arg_types=arg_types, + recursive=True) + node = access.resolve(node, ctx) + node = live_values.resolve(node, ctx, {}) node = type_info.resolve(node, ctx) + node = live_values.resolve(node, ctx, {}) return node def test_constructor_detection(self): @@ -83,16 +84,16 @@ class TypeInfoResolverTest(test.TestCase): self.assertEquals((training.__name__, 'GradientDescentOptimizer'), anno.getanno(call_node, 'type_fqn')) - def test_class_members(self): + def test_class_members_of_detected_constructor(self): def test_fn(): opt = training.GradientDescentOptimizer(0.1) opt.minimize(0) node = self._parse_and_analyze(test_fn, {'training': training}) - attr_call_node = node.body[0].body[1].value.func - self.assertEquals((training.__name__, 'GradientDescentOptimizer'), - anno.getanno(attr_call_node, 'type_fqn')) + method_call = node.body[0].body[1].value.func + self.assertEquals(training.GradientDescentOptimizer.minimize, + anno.getanno(method_call, 'live_val')) def test_class_members_in_with_stmt(self): @@ -106,11 +107,11 @@ class TypeInfoResolverTest(test.TestCase): self.assertEquals((session.__name__, 'Session'), anno.getanno(constructor_call, 'type_fqn')) - member_call = node.body[0].body[0].body[0].value.func - self.assertEquals((session.__name__, 'Session'), - anno.getanno(member_call, 'type_fqn')) + method_call = node.body[0].body[0].body[0].value.func + self.assertEquals(session.Session.run, anno.getanno(method_call, + 'live_val')) - def test_constructor_deta_dependent(self): + def test_constructor_data_dependent(self): def test_fn(x): if x > 0: @@ -119,16 +120,18 @@ class TypeInfoResolverTest(test.TestCase): opt = training.GradientDescentOptimizer(0.01) opt.minimize(0) - with self.assertRaises(transformer.PyFlowParseError): - self._parse_and_analyze(test_fn, {'training': training}) + node = self._parse_and_analyze(test_fn, {'training': training}) + method_call = node.body[0].body[1].value.func + self.assertFalse(anno.hasanno(method_call, 'live_val')) def test_parameter_class_members(self): def test_fn(opt): opt.minimize(0) - with self.assertRaises(transformer.PyFlowParseError): - 
self._parse_and_analyze(test_fn, {'training': training}) + node = self._parse_and_analyze(test_fn, {}) + method_call = node.body[0].body[0].value.func + self.assertFalse(anno.hasanno(method_call, 'live_val')) def test_parameter_class_members_with_value_hints(self): @@ -138,14 +141,13 @@ class TypeInfoResolverTest(test.TestCase): node = self._parse_and_analyze( test_fn, {'training': training}, arg_types={ - 'opt': (('%s.GradientDescentOptimizer' % training.__name__), - training.GradientDescentOptimizer(0.1)) + 'opt': (training.GradientDescentOptimizer.__name__, + training.GradientDescentOptimizer) }) - attr_call_node = node.body[0].body[0].value.func - self.assertEquals( - tuple(training.__name__.split('.')) + ('GradientDescentOptimizer',), - anno.getanno(attr_call_node, 'type_fqn')) + method_call = node.body[0].body[0].value.func + self.assertEquals(training.GradientDescentOptimizer.minimize, + anno.getanno(method_call, 'live_val')) def test_function_variables(self): @@ -156,8 +158,9 @@ class TypeInfoResolverTest(test.TestCase): foo = bar foo() - with self.assertRaises(transformer.PyFlowParseError): - self._parse_and_analyze(test_fn, {'bar': bar}) + node = self._parse_and_analyze(test_fn, {'bar': bar}) + method_call = node.body[0].body[1].value.func + self.assertFalse(anno.hasanno(method_call, 'live_val')) def test_nested_members(self): @@ -165,8 +168,9 @@ class TypeInfoResolverTest(test.TestCase): foo = training.GradientDescentOptimizer(0.1) foo.bar.baz() - with self.assertRaises(transformer.PyFlowParseError): - self._parse_and_analyze(test_fn, {'training': training}) + node = self._parse_and_analyze(test_fn, {'training': training}) + method_call = node.body[0].body[1].value.func + self.assertFalse(anno.hasanno(method_call, 'live_val')) if __name__ == '__main__': diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py index 77c5fbe02a1..6be526f20dd 100644 --- a/tensorflow/contrib/py2tf/pyct/templates.py +++ b/tensorflow/contrib/py2tf/pyct/templates.py @@ -23,6 +23,7 @@ from __future__ import print_function import ast import copy +import textwrap import gast @@ -119,7 +120,7 @@ def replace(template, **replacements): """ if not isinstance(template, str): raise ValueError('Expected string template, got %s' % type(template)) - tree = parser.parse_str(template) + tree = parser.parse_str(textwrap.dedent(template)) for k in replacements: replacements[k] = _strings_to_names(replacements[k]) return ReplaceTransformer(replacements).visit(tree).body diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py index d5aa23eaebb..8a836b7c1bf 100644 --- a/tensorflow/contrib/py2tf/pyct/transformer.py +++ b/tensorflow/contrib/py2tf/pyct/transformer.py @@ -18,7 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import sys + import gast +import six from tensorflow.contrib.py2tf.pyct import pretty_printer @@ -48,11 +51,15 @@ class Base(gast.NodeTransformer): self._lineno = node.lineno self._col_offset = node.col_offset return super(Base, self).visit(node) - except ValueError as e: - msg = '%s\nOccurred at node:\n%s' % (str(e), pretty_printer.fmt(node)) + except (ValueError, AttributeError, NotImplementedError) as e: + msg = '%s: %s\nOccurred at node:\n%s' % (e.__class__.__name__, str(e), + pretty_printer.fmt(node)) if source_code: - line = self._source.splitlines()[self._lineno - 1] + line = source_code.splitlines()[self._lineno - 1] else: line = '' - 
raise PyFlowParseError( - msg, (source_file, self._lineno, self._col_offset + 1, line)) + six.reraise(PyFlowParseError, + PyFlowParseError( + msg, + (source_file, self._lineno, self._col_offset + 1, line)), + sys.exc_info()[2]) diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD new file mode 100644 index 00000000000..01804aa8834 --- /dev/null +++ b/tensorflow/contrib/py2tf/utils/BUILD @@ -0,0 +1,37 @@ +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow:tensorflow.bzl", "py_test") + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) + +py_library( + name = "utils", + srcs = [ + "__init__.py", + "context_managers.py", + ], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:__subpackages__"], + deps = [ + ], +) + +py_test( + name = "context_managers_test", + srcs = ["context_managers_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":utils", + "//tensorflow/python:client_testlib", + ], +) diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py new file mode 100644 index 00000000000..bca33e89e99 --- /dev/null +++ b/tensorflow/contrib/py2tf/utils/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility module that contains APIs usable in the generated code.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns diff --git a/tensorflow/contrib/py2tf/utils/context_managers.py b/tensorflow/contrib/py2tf/utils/context_managers.py new file mode 100644 index 00000000000..47d98399971 --- /dev/null +++ b/tensorflow/contrib/py2tf/utils/context_managers.py @@ -0,0 +1,41 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Various context managers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib + + +def control_dependency_on_returns(tf, return_value): + """Create a TF control dependency on the return values of a function. 
+ + If the function had no return value, a no-op context is returned. + + Args: + tf: The TensorFlow module. + return_value: The return value to set as control dependency. + + Returns: + A context manager. + """ + if return_value is None: + return contextlib.contextmanager(lambda: (yield))() + # TODO(mdan): Filter to tensor objects. + if not isinstance(return_value, (list, tuple)): + return_value = (return_value,) + return tf.control_dependencies(return_value) diff --git a/tensorflow/contrib/py2tf/utils/context_managers_test.py b/tensorflow/contrib/py2tf/utils/context_managers_test.py new file mode 100644 index 00000000000..c903f082528 --- /dev/null +++ b/tensorflow/contrib/py2tf/utils/context_managers_test.py @@ -0,0 +1,43 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for context_managers module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.py2tf.utils import context_managers +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops +from tensorflow.python.platform import test + + +class ContextManagersTest(test.TestCase): + + def test_control_dependency_on_returns(self): + # Just dry run them. 
+ with context_managers.control_dependency_on_returns(ops, None): + pass + with context_managers.control_dependency_on_returns( + ops, constant_op.constant(1)): + pass + with context_managers.control_dependency_on_returns( + ops, [constant_op.constant(1), + constant_op.constant(2)]): + pass + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD index 3c5b34a0a6a..b7d525a1fa2 100644 --- a/tensorflow/contrib/quantize/BUILD +++ b/tensorflow/contrib/quantize/BUILD @@ -77,9 +77,13 @@ py_library( "//tensorflow/contrib/graph_editor:graph_editor_py", "//tensorflow/python:array_ops", "//tensorflow/python:framework_ops", + "//tensorflow/python:layers", "//tensorflow/python:math_ops", "//tensorflow/python:nn", "//tensorflow/python:nn_ops", + "//tensorflow/python:ops", + "//tensorflow/python:training", + "//tensorflow/python:variables", ], ) diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py index aa605e6caad..8ec5334a393 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - import re from tensorflow.contrib import graph_editor from tensorflow.contrib.quantize.python import common @@ -26,14 +25,16 @@ from tensorflow.contrib.quantize.python import input_to_ops from tensorflow.core.framework import attr_value_pb2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.layers import utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import nn_ops +from tensorflow.python.training import training_util from tensorflow.python.util import compat -def FoldBatchNorms(graph): +def FoldBatchNorms(graph, freeze_batch_norm_delay=None, is_training=True): """Finds batch norm layers and folds them into preceding layers. Folding only affects the following layers: Conv2D, fully connected, depthwise @@ -41,15 +42,25 @@ def FoldBatchNorms(graph): Args: graph: Graph to walk and modify. + freeze_batch_norm_delay: How many steps to wait before freezing + moving mean and variance and using them for batch normalization. This value + is used only when is_training is True. + is_training: Bool, true if training Raises: ValueError: When batch norm folding fails. """ - _FoldFusedBatchNorms(graph) - _FoldUnfusedBatchNorms(graph) + _FoldFusedBatchNorms( + graph, + freeze_batch_norm_delay=freeze_batch_norm_delay, + is_training=is_training) + _FoldUnfusedBatchNorms( + graph, + freeze_batch_norm_delay=freeze_batch_norm_delay, + is_training=is_training) -def _FoldFusedBatchNorms(graph): +def _FoldFusedBatchNorms(graph, freeze_batch_norm_delay, is_training): """Finds fused batch norm layers and folds them into preceding layers. Folding only affects the following layers: Conv2D, fully connected, depthwise @@ -57,6 +68,9 @@ def _FoldFusedBatchNorms(graph): Args: graph: Graph to walk and modify. + freeze_batch_norm_delay: How many steps to wait before freezing + moving mean and variance and using them for batch normalization + is_training: Bool, true if training Raises: ValueError: When batch norm folding fails. @@ -67,8 +81,7 @@ def _FoldFusedBatchNorms(graph): # `bn_op`. The '/' (i.e. 
`sep`) ensures that we reuse the existing scope # named `scope`. Otherwise, TF creates a unique scope whose name starts with # `scope`. - with graph.as_default(), graph.name_scope(scope + sep), ops.device( - match.bn_op.device): + with graph.as_default(), graph.name_scope(scope + sep): with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep): # new weights = old weights * gamma / sqrt(variance + epsilon) # new biases = -mean * gamma / sqrt(variance + epsilon) + beta @@ -79,9 +92,18 @@ def _FoldFusedBatchNorms(graph): match.mean_tensor * multiplier_tensor, name='bias') + correction_scale, correction_recip, correction_offset = None, None, None + if is_training: + correction_scale, correction_recip, correction_offset = ( + _ComputeBatchNormCorrections( + context='', + match=match, + freeze_batch_norm_delay=freeze_batch_norm_delay, + fused_batch_norm=True)) # The shape of depthwise weights is different, so we need to reshape the # multiplier_tensor to ensure that the scaled_weight_tensor has the # expected shape. + weights = match.weight_tensor if match.layer_op.type == 'DepthwiseConv2dNative': new_shape = [ match.weight_tensor.get_shape().as_list()[2], @@ -90,15 +112,29 @@ def _FoldFusedBatchNorms(graph): multiplier_tensor = array_ops.reshape( multiplier_tensor, new_shape, name='scale_reshape') + if correction_scale is not None: + correction_scale = array_ops.reshape( + correction_scale, new_shape, name='correction_reshape') + + if correction_scale is not None: + weights = math_ops.multiply( + correction_scale, weights, name='correction_mult') + # TODO(suharshs): This naming of the following ops needs to carefully # follow the naming expected by quantize.py. Generalize the quantize code # to not require these delicate naming conventions. scaled_weight_tensor = math_ops.multiply( - match.weight_tensor, multiplier_tensor, name='mul_fold') + weights, multiplier_tensor, name='mul_fold') new_layer_tensor = _CloneWithNewOperands( match.layer_op, match.input_tensor, scaled_weight_tensor) + if correction_recip is not None: + new_layer_tensor = math_ops.multiply( + correction_recip, new_layer_tensor, name='post_conv_mul') + new_layer_tensor = math_ops.add(new_layer_tensor, (correction_offset), + 'correction_add') + bias_add_tensor = math_ops.add( new_layer_tensor, bias_tensor, name='add_fold') @@ -165,6 +201,8 @@ def _FindFusedBatchNorms(graph): mean_pattern = graph_matcher.OpTypePattern('*') variance_pattern = graph_matcher.OpTypePattern('*') + moving_average_pattern = graph_matcher.OpTypePattern('*') + bn_decay_pattern = graph_matcher.OpTypePattern('*') conv_pattern = graph_matcher.OpTypePattern( 'Conv2D|DepthwiseConv2dNative', inputs=[input_pattern, weight_pattern]) # MatMul has a Reshape between it and FusedBatchNorm. 
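The fused batch-norm folding hunk above implements the identity spelled out in its comments: the per-channel multiplier is gamma / sqrt(variance + epsilon), the folded weights are the original weights scaled by that multiplier ('mul_fold'), and the folded bias is beta - mean * multiplier ('add_fold'), with an additional correction path applied when is_training is set. Below is a minimal NumPy sketch of just the folding identity, leaving out the correction terms and the depthwise reshape handling; the function name and variables are illustrative only and are not part of the patch.

import numpy as np

def fold_conv_bn(weights, gamma, beta, mean, variance, epsilon=1e-3):
  """Folds batch-norm parameters into conv weights and a bias (sketch only).

  For Conv2D weights of shape [kh, kw, in_ch, out_ch], broadcasting over the
  last axis applies the per-output-channel scaling:
    folded_weights = weights * gamma / sqrt(variance + epsilon)      # 'mul_fold'
    folded_bias    = beta - mean * gamma / sqrt(variance + epsilon)  # bias of 'add_fold'
  so that conv(x, folded_weights) + folded_bias equals batch_norm(conv(x, weights)).
  """
  multiplier = gamma / np.sqrt(variance + epsilon)
  return weights * multiplier, beta - mean * multiplier

For DepthwiseConv2dNative the multiplier has to be reshaped first (the patch's 'scale_reshape' and 'correction_reshape' ops) because the output channels are spread across the last two weight dimensions.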
@@ -180,6 +218,11 @@ def _FindFusedBatchNorms(graph): conv_pattern, gamma_pattern, beta_pattern, mean_pattern, variance_pattern ]) + conv_moving_average_sub_pattern = graph_matcher.OpTypePattern( + 'Sub', inputs=[moving_average_pattern, conv_batch_norm_pattern]) + # TODO(suharshs): Use a OneofPattern here when available + conv_moving_average_mul_pattern = graph_matcher.OpTypePattern( + 'Mul', inputs=[conv_moving_average_sub_pattern, bn_decay_pattern]) matmul_batch_norm_pattern = graph_matcher.OpTypePattern( 'FusedBatchNorm', inputs=[ @@ -191,8 +234,34 @@ def _FindFusedBatchNorms(graph): inputs=[matmul_batch_norm_pattern, graph_matcher.OpTypePattern('*')]) + matmul_moving_average_sub_pattern = graph_matcher.OpTypePattern( + 'Sub', inputs=[moving_average_pattern, matmul_batch_norm_pattern]) + matmul_moving_average_mul_pattern = graph_matcher.OpTypePattern( + 'Mul', inputs=[matmul_moving_average_sub_pattern, bn_decay_pattern]) + conv_matcher = graph_matcher.GraphMatcher(conv_batch_norm_pattern) matmul_matcher = graph_matcher.GraphMatcher(matmul_bn_output_reshape_pattern) + conv_moving_average_mul_matcher = graph_matcher.GraphMatcher( + conv_moving_average_mul_pattern) + matmul_moving_average_mul_matcher = graph_matcher.GraphMatcher( + matmul_moving_average_mul_pattern) + + def _GetMovingAverageTensors(graph, moving_avg_mul_matcher, + moving_avg_sub_pattern, bn_op): + """Gets the moving mean and variance tensors and the batch norm momentum.""" + for mul_match_result in moving_avg_mul_matcher.match_graph(graph): + sub_op = mul_match_result.get_op(moving_avg_sub_pattern) + + if sub_op.inputs[1].name == bn_op.outputs[1].name: + # During training: Batch Mean is bn_op.outputs[1] + moving_mean_tensor = sub_op.inputs[0] + bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern) + if sub_op.inputs[1].name == bn_op.outputs[2].name: + # During training: Batch Var is bn_op.outputs[2] + moving_variance_tensor = sub_op.inputs[0] + bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern) + return (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor, + bn_decay_var_tensor) def _GetCommonTensors(match_result, bn_op, bn_input_tensor): """Gets tensors needed for FusedBatchNormMatch from match_result.""" @@ -222,10 +291,14 @@ def _FindFusedBatchNorms(graph): # calculation, the variance is corrected by the term N/N-1 (Bessel's # correction). The variance tensor read from FuseBatchNorm has bessel's # correction applied, so we undo it here. 
- n = math_ops.cast( - array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor), - dtypes.float32) - variance_tensor = bn_op.outputs[2] * (n - 1) / n + scope, sep, _ = bn_op.name.rpartition('/') + g = ops.get_default_graph() + with g.as_default(), g.name_scope(scope + sep): + n = math_ops.cast( + array_ops.size(bn_input_tensor) / array_ops.size(mean_tensor), + dtypes.float32) + variance_tensor = math_ops.multiply( + bn_op.outputs[2], (n - 1) / n, name='Undo_Bessel_Correction') else: mean_tensor = match_result.get_tensor(mean_pattern) variance_tensor = match_result.get_tensor(variance_pattern) @@ -233,15 +306,30 @@ def _FindFusedBatchNorms(graph): variance_tensor) for match_result in conv_matcher.match_graph(graph): + moving_mean_tensor = None + moving_variance_tensor = None + bn_decay_mean_tensor = None + bn_decay_var_tensor = None layer_op = match_result.get_op(conv_pattern) layer_tensor = match_result.get_tensor(conv_pattern) bn_op = match_result.get_op(conv_batch_norm_pattern) - # In the case of convolution the output_tensor is the output of bn_op. - output_tensor = bn_op.outputs[0] + if bn_op.get_attr('is_training'): + (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor, + bn_decay_var_tensor) = _GetMovingAverageTensors( + graph, + moving_avg_mul_matcher=conv_moving_average_mul_matcher, + moving_avg_sub_pattern=conv_moving_average_sub_pattern, + bn_op=bn_op) + output_tensor = bn_op.outputs[0] + batch_epsilon_tensor = bn_op.get_attr('epsilon') (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor, - variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor) - yield _FusedBatchNormMatch( + variance_tensor) = _GetCommonTensors( + match_result, + bn_op, + layer_tensor, + ) + yield _BatchNormMatch( layer_op=layer_op, bn_op=bn_op, output_tensor=output_tensor, @@ -250,20 +338,38 @@ def _FindFusedBatchNorms(graph): gamma_tensor=gamma_tensor, beta_tensor=beta_tensor, mean_tensor=mean_tensor, - variance_tensor=variance_tensor) + variance_tensor=variance_tensor, + moving_mean_tensor=moving_mean_tensor, + moving_variance_tensor=moving_variance_tensor, + bn_decay_mean_tensor=bn_decay_mean_tensor, + bn_decay_var_tensor=bn_decay_var_tensor, + batch_epsilon_tensor=batch_epsilon_tensor) for match_result in matmul_matcher.match_graph(graph): + moving_mean_tensor = None + moving_variance_tensor = None + bn_decay_mean_tensor = None + bn_decay_var_tensor = None layer_op = match_result.get_op(matmul_pattern) layer_tensor = match_result.get_tensor(matmul_pattern) bn_op = match_result.get_op(matmul_batch_norm_pattern) + if bn_op.get_attr('is_training'): + (moving_mean_tensor, bn_decay_mean_tensor, moving_variance_tensor, + bn_decay_var_tensor) = _GetMovingAverageTensors( + graph, + moving_avg_mul_matcher=matmul_moving_average_mul_matcher, + moving_avg_sub_pattern=matmul_moving_average_sub_pattern, + bn_op=bn_op) + # In the MatMul case, the output of batch norm is reshaped back into a # 2D tensor, so the output_tensor is the output of the Reshape op. 
output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern) output_tensor = output_reshape_op.outputs[0] + batch_epsilon_tensor = bn_op.get_attr('epsilon') (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor, variance_tensor) = _GetCommonTensors(match_result, bn_op, layer_tensor) - yield _FusedBatchNormMatch( + yield _BatchNormMatch( layer_op=layer_op, bn_op=bn_op, output_tensor=output_tensor, @@ -272,15 +378,21 @@ def _FindFusedBatchNorms(graph): gamma_tensor=gamma_tensor, beta_tensor=beta_tensor, mean_tensor=mean_tensor, - variance_tensor=variance_tensor) + variance_tensor=variance_tensor, + moving_mean_tensor=moving_mean_tensor, + moving_variance_tensor=moving_variance_tensor, + bn_decay_mean_tensor=bn_decay_mean_tensor, + bn_decay_var_tensor=bn_decay_var_tensor, + batch_epsilon_tensor=batch_epsilon_tensor) -class _FusedBatchNormMatch(object): - """Contains all information related to a found FusedBatchNorm.""" +class _BatchNormMatch(object): + """Contains all information related to a found Fused/UnfusedBatchNorm.""" def __init__(self, layer_op, bn_op, output_tensor, input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor, - variance_tensor): + variance_tensor, moving_mean_tensor, moving_variance_tensor, + bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon_tensor): self._layer_op = layer_op self._bn_op = bn_op self._output_tensor = output_tensor @@ -290,6 +402,11 @@ class _FusedBatchNormMatch(object): self._beta_tensor = beta_tensor self._mean_tensor = mean_tensor self._variance_tensor = variance_tensor + self._moving_mean_tensor = moving_mean_tensor + self._moving_variance_tensor = moving_variance_tensor + self._bn_decay_mean_tensor = bn_decay_mean_tensor + self._bn_decay_var_tensor = bn_decay_var_tensor + self._batch_epsilon_tensor = batch_epsilon_tensor @property def layer_op(self): @@ -327,8 +444,28 @@ class _FusedBatchNormMatch(object): def variance_tensor(self): return self._variance_tensor + @property + def moving_mean_tensor(self): + return self._moving_mean_tensor -def _FoldUnfusedBatchNorms(graph): + @property + def moving_variance_tensor(self): + return self._moving_variance_tensor + + @property + def batch_epsilon_tensor(self): + return self._batch_epsilon_tensor + + @property + def bn_decay_mean_tensor(self): + return self._bn_decay_mean_tensor + + @property + def bn_decay_var_tensor(self): + return self._bn_decay_var_tensor + + +def _FoldUnfusedBatchNorms(graph, freeze_batch_norm_delay, is_training): """Finds unfused batch norm layers and folds them into preceding layers. Folding only affects the following layers: Conv2D, fully connected, depthwise @@ -336,6 +473,9 @@ def _FoldUnfusedBatchNorms(graph): Args: graph: Graph to walk and modify. + freeze_batch_norm_delay: How many steps to wait before freezing + moving mean and variance and using them for batch normalization + is_training: Bool, True if training Raises: ValueError: When batch norm folding fails. @@ -346,7 +486,12 @@ def _FoldUnfusedBatchNorms(graph): has_scaling = _HasScaling(graph, input_to_ops_map, bn) # The mangling code intimately depends on BatchNorm node's internals. 
- original_op, folded_op = _CreateFoldedOp(graph, bn, has_scaling=has_scaling) + original_op, folded_op = _CreateFoldedOp( + graph, + bn, + has_scaling=has_scaling, + freeze_batch_norm_delay=freeze_batch_norm_delay, + is_training=is_training) activation = common.GetEndpointActivationOp(graph, bn) if activation: @@ -407,7 +552,186 @@ def _HasScaling(graph, input_to_ops_map, bn): return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1 -def _CreateFoldedOp(graph, context, has_scaling): +def _GetBatchNormParams(graph, context, has_scaling): + """Extracts relevant tensors for folding batch norms. + + Args: + graph: Graph to inspect. + context: The scope under which we look for batch norm params + has_scaling: Bool that specifies if scaling is done as part of batch + norm + + Returns: + _BatchNormMatch containing all required batch norm parameters + """ + gamma_tensor = None + batch_mean_tensor = None + batch_variance_tensor = None + moving_mean_tensor = None + moving_variance_tensor = None + batch_epsilon_tensor = None + bn_decay_mean_tensor = None + bn_decay_var_tensor = None + + split_context = context.split('/') + base_context = split_context[-1] + + oplist = graph.get_operations() + op_suffix_gamma = base_context + '/BatchNorm/gamma' + op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze' + op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1' + op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read' + op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read' + op_suffix_epsilon = base_context + '/BatchNorm/batchnorm/add/y' + op_suffix_bn_decay_mean = base_context + '/BatchNorm/AssignMovingAvg/decay' + op_suffix_bn_decay_var = base_context + '/BatchNorm/AssignMovingAvg_1/decay' + + # Parse through list of ops to find relevant ops + for op in oplist: + if op.name.endswith(op_suffix_mean): + # This is an efficient way to check for two things: + # Is batch norm present and is it training mode? 
+ # Batch statistics are computed only during batch norm in training + batch_mean_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_variance): + batch_variance_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_moving_mean): + moving_mean_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_moving_variance): + moving_variance_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_epsilon): + batch_epsilon_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_bn_decay_mean): + bn_decay_mean_tensor = graph.get_tensor_by_name(op.name + ':0') + if op.name.endswith(op_suffix_bn_decay_var): + bn_decay_var_tensor = graph.get_tensor_by_name(op.name + ':0') + if has_scaling: + if op.name.endswith(op_suffix_gamma): + gamma_tensor = graph.get_tensor_by_name(op.name + ':0') + + if not has_scaling: + gamma_tensor = array_ops.ones(batch_mean_tensor.shape) + + return _BatchNormMatch( + layer_op=None, + bn_op=None, + output_tensor=None, + input_tensor=None, + weight_tensor=None, + gamma_tensor=gamma_tensor, + beta_tensor=None, + mean_tensor=batch_mean_tensor, + variance_tensor=batch_variance_tensor, + moving_mean_tensor=moving_mean_tensor, + moving_variance_tensor=moving_variance_tensor, + bn_decay_mean_tensor=bn_decay_mean_tensor, + bn_decay_var_tensor=bn_decay_var_tensor, + batch_epsilon_tensor=batch_epsilon_tensor) + + +def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay, + fused_batch_norm): + """Computes batch norm correction params. + + Before batch normalization is frozen: + We use batch statistics for batch norm. + correction_scale = sigma_b/sigma_mv + correction_recip = 1/correction_scale + correction_offset = 0 + + After batch normalization is frozen: + correction_scale = sigma_b/sigma_mv + correction_recip = 1 + correction_offset = gamma*(mu_b/sigma_b-mu_mv/sigma_mv). + + Batch norm is frozen if global_step > bn_freeze_delay. + The corrections ensure that: + a) The weights are quantized after scaling by gamma/sigma_mv. This enables + smoother training as the scaling on the weights changes slowly, rather than + jump across mini-batches + b) Changing the values of the corrections allows for one to switch between + using batch statistics to using moving mean and average, without requiring + changes to batch_norm + + + Args: + context: The scope under which we look for batch norm params + match: Object containg required batch norm tensors for correction + computation + freeze_batch_norm_delay: Delay in steps at which computation switches + from regular batch norm to frozen mean and variance. 
+ fused_batch_norm: Bool, true if fused batch norm is used + + Returns: + A tuple of correction_scale, correction_recip, correction_offset + """ + + g = ops.get_default_graph() + with g.name_scope(context + 'batch_norm_correction'): + recip_sigma_mv = math_ops.rsqrt( + match.moving_variance_tensor + match.batch_epsilon_tensor) + recip_sigma = math_ops.rsqrt( + match.variance_tensor + match.batch_epsilon_tensor) + correction_scale = math_ops.divide( + recip_sigma_mv, recip_sigma, name='scale_compute') + correction_scale = array_ops.identity( + correction_scale, name='correction_scale') + correction_recip = math_ops.reciprocal( + correction_scale, name='reciprocal_compute') + correction_offset = math_ops.multiply( + match.gamma_tensor, + match.mean_tensor * recip_sigma - + match.moving_mean_tensor * recip_sigma_mv, + name='offset_compute') + + if freeze_batch_norm_delay is not None: + use_mv_avg = math_ops.greater_equal( + training_util.get_or_create_global_step(), + freeze_batch_norm_delay, + name='use_moving_average') + else: + use_mv_avg = False + + bn_decay_zero = 0.0 + bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers()) + bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers()) + + bn_decay_mean_out = utils.smart_cond( + use_mv_avg, + lambda: bn_decay_zero, + lambda: match.bn_decay_mean_tensor, + name='freeze_moving_mean') + graph_editor.reroute_ts( + [bn_decay_mean_out], [match.bn_decay_mean_tensor], + can_modify=bn_decay_mean_consumers) + + if fused_batch_norm is False: + bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers()) + bn_decay_var_out = utils.smart_cond( + use_mv_avg, + lambda: bn_decay_zero, + lambda: match.bn_decay_var_tensor, + name='freeze_moving_var') + graph_editor.reroute_ts( + [bn_decay_var_out], [match.bn_decay_var_tensor], + can_modify=bn_decay_var_consumers) + + correction_recip = utils.smart_cond( + use_mv_avg, + lambda: array_ops.ones(correction_scale.shape), + lambda: correction_recip, + name='correction_recip') + + correction_offset = utils.smart_cond( + use_mv_avg, + lambda: correction_offset, + lambda: array_ops.zeros(correction_offset.shape), + name='correction_offset') + return correction_scale, correction_recip, correction_offset + + +def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay, + is_training): """Folds in batch norm layer into preceding convolution or FC layer. Creates 3 new nodes, connects their inputs and adds them to the graph: @@ -419,6 +743,9 @@ def _CreateFoldedOp(graph, context, has_scaling): context: String, batch norm context, i.e. node into which BatchNorm is nested. has_scaling: Whether the batch norm has scaling enabled. + freeze_batch_norm_delay: How many steps to wait before freezing + moving mean and variance and using them for batch normalization + is_training: Bool, true if training Raises: ValueError: When operation type is not supported, or input and output tensor @@ -435,19 +762,43 @@ def _CreateFoldedOp(graph, context, has_scaling): mul_scale_name) op_below = mul_scale.inputs[0].op weights = op_below.inputs[1] - + match = _GetBatchNormParams( + graph=graph, context=context, has_scaling=has_scaling) + correction_scale, correction_recip, correction_offset = None, None, None + if is_training: + correction_scale, correction_recip, correction_offset = ( + _ComputeBatchNormCorrections( + context=context, + match=match, + freeze_batch_norm_delay=freeze_batch_norm_delay, + fused_batch_norm=False)) # Special handling for weights of depthwise convolution. 
if op_below.type == 'DepthwiseConv2dNative': - new_shape = [weights.get_shape().as_list()[2], - weights.get_shape().as_list()[3]] + new_shape = [ + weights.get_shape().as_list()[2], + weights.get_shape().as_list()[3] + ] scale_name = 'mul' if has_scaling else 'Rsqrt' - scale = graph.get_operation_by_name(context + '/BatchNorm/batchnorm/' + - scale_name) + scale = graph.get_operation_by_name( + context + '/BatchNorm/batchnorm/' + scale_name) scale = array_ops.reshape(scale.outputs[0], new_shape, context + '/scale_reshape') - mul_fold = _CloneOp(mul_scale, context + '/mul_fold', - [(0, weights), (1, scale)]) + + if correction_scale is not None: + correction_scale = array_ops.reshape(correction_scale, new_shape, + context + '/correction_reshape') + with ops.device(mul_scale.device): + weights = math_ops.multiply(correction_scale, weights, + context + '/correction_mult') + + mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights), + (1, scale)]) elif op_below.type in ['Conv2D', 'MatMul']: + + if correction_scale is not None: + with ops.device(mul_scale.device): + weights = math_ops.multiply(correction_scale, weights, + context + '/correction_mult') mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights)]) else: raise ValueError('Cannot handle operation of type: %s' % op_below.op) @@ -456,10 +807,17 @@ def _CreateFoldedOp(graph, context, has_scaling): conv_or_fc_folded = _CloneOp(op_below, op_below.name + '_Fold', [(1, mul_fold.outputs[0])]) - add_shift = graph.get_operation_by_name(context + - '/BatchNorm/batchnorm/add_1') - add_fold = _CloneOp(add_shift, context + '/add_fold', - [(0, conv_or_fc_folded.outputs[0])]) + add_shift = graph.get_operation_by_name( + context + '/BatchNorm/batchnorm/add_1') + + corrected_output = conv_or_fc_folded.outputs[0] + if correction_offset is not None: + with ops.device(conv_or_fc_folded.device): + corrected_output = math_ops.multiply(correction_recip, corrected_output, + context + '/post_conv_mul') + corrected_output = math_ops.add(corrected_output, (correction_offset), + context + '/correction_add') + add_fold = _CloneOp(add_shift, context + '/add_fold', [(0, corrected_output)]) _AssertShapesMatch('add_fold', add_fold.inputs[0], add_fold.outputs[0]) return add_shift, add_fold diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py index ecf321ff573..330bd8a6474 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py @@ -46,26 +46,27 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): def _RunTestOverParameters(self, test_fn): parameters_list = [ - # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm) - (nn_ops.relu6, 'Relu6', False, False, False), - (nn_ops.relu, 'Relu', False, False, False), - (nn_ops.relu6, 'Relu6', True, False, False), - (nn_ops.relu, 'Relu', True, False, False), - (nn_ops.relu6, 'Relu6', False, True, False), - (nn_ops.relu, 'Relu', False, True, False), - (nn_ops.relu6, 'Relu6', True, True, False), - (nn_ops.relu, 'Relu', True, True, False), + # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm, + # freeze_batch_norm_delay) + (nn_ops.relu6, 'Relu6', False, False, False, 100), + (nn_ops.relu, 'Relu', False, False, False, None), + (nn_ops.relu6, 'Relu6', True, False, False, 100), + (nn_ops.relu, 'Relu', True, False, False, None), + (nn_ops.relu6, 'Relu6', False, True, False, 100), + (nn_ops.relu, 'Relu', False, True, False, 
None), + (nn_ops.relu6, 'Relu6', True, True, False, 100), + (nn_ops.relu, 'Relu', True, True, False, None), # Fused batch norm always has scaling enabled. - (nn_ops.relu6, 'Relu6', False, True, True), - (nn_ops.relu, 'Relu', False, True, True), - (nn_ops.relu6, 'Relu6', True, True, True), - (nn_ops.relu, 'Relu', True, True, True), + (nn_ops.relu6, 'Relu6', False, True, True, None), + (nn_ops.relu, 'Relu', False, True, True, 100), + (nn_ops.relu6, 'Relu6', True, True, True, None), + (nn_ops.relu, 'Relu', True, True, True, 100), ] for params in parameters_list: - test_fn(params[0], params[1], params[2], params[3], params[4]) + test_fn(params[0], params[1], params[2], params[3], params[4], params[5]) def _TestFoldConv2d(self, relu, relu_op_name, with_bypass, has_scaling, - fused_batch_norm): + fused_batch_norm, freeze_batch_norm_delay): """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*. Args: @@ -75,6 +76,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. + freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ g = ops.Graph() with g.as_default(): @@ -99,12 +102,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') self._AssertInputOpsAre(folded_mul, [ - scope + '/weights/read', + scope + '/correction_mult', self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) ]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold']) @@ -113,12 +117,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertEqual(folded_conv.type, 'Conv2D') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) - self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ - scope + '/Conv2D_Fold', + scope + '/correction_add', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] @@ -128,7 +132,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self._RunTestOverParameters(self._TestFoldConv2d) def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass, - has_scaling, fused_batch_norm): + has_scaling, fused_batch_norm, + freeze_batch_norm_delay): """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*. Tests that folding works even with an input shape where some dimensions are @@ -141,6 +146,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. 
+ freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ g = ops.Graph() with g.as_default(): @@ -164,12 +171,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') self._AssertInputOpsAre(folded_mul, [ - scope + '/weights/read', + scope + '/correction_mult', self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) ]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold']) @@ -177,12 +185,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold') self.assertEqual(folded_conv.type, 'Conv2D') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) - self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ - scope + '/Conv2D_Fold', + scope + '/correction_add', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] @@ -192,7 +200,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self._RunTestOverParameters(self._TestFoldConv2dUnknownShape) def _TestFoldFullyConnectedLayer(self, relu, relu_op_name, with_bypass, - has_scaling, fused_batch_norm): + has_scaling, fused_batch_norm, + freeze_batch_norm_delay): """Tests folding cases: inputs -> FC with batch norm -> Relu*. Args: @@ -202,6 +211,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. 
+ freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ g = ops.Graph() with g.as_default(): @@ -223,12 +234,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') self._AssertInputOpsAre(folded_mul, [ - scope + '/weights/read', + scope + '/correction_mult', self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) ]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/MatMul_Fold']) @@ -237,12 +249,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertEqual(folded_conv.type, 'MatMul') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) - self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ - scope + '/MatMul_Fold', + scope + '/correction_add', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] @@ -252,7 +264,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self._RunTestOverParameters(self._TestFoldFullyConnectedLayer) def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass, - has_scaling, fused_batch_norm): + has_scaling, fused_batch_norm, + freeze_batch_norm_delay): """Tests folding: inputs -> DepthwiseConv2d with batch norm -> Relu*. Args: @@ -262,6 +275,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. 
+ freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ g = ops.Graph() with g.as_default(): @@ -286,7 +301,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): node = math_ops.add(inputs, node, name='test/Add') relu(node, name='test/' + relu_op_name) - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) folded_mul = g.get_operation_by_name(scope + '/mul_fold') self.assertEqual(folded_mul.type, 'Mul') @@ -295,8 +311,7 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): else: scale_reshape_op_name = scope + '/scale_reshape' self._AssertInputOpsAre(folded_mul, - [scope + '/depthwise_weights/read', - scale_reshape_op_name]) + [scope + '/correction_mult', scale_reshape_op_name]) self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold']) scale_reshape = g.get_operation_by_name(scale_reshape_op_name) @@ -311,12 +326,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative') self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name]) - self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold']) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) folded_add = g.get_operation_by_name(scope + '/add_fold') self.assertEqual(folded_add.type, 'Add') self._AssertInputOpsAre(folded_add, [ - scope + '/depthwise_Fold', + scope + '/correction_add', self._BathNormBiasName(scope, fused_batch_norm) ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] @@ -326,7 +341,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self._RunTestOverParameters(self._TestFoldDepthwiseConv2d) def _TestCompareFoldAndUnfolded(self, relu, relu_op_name, with_bypass, - has_scaling, fused_batch_norm): + has_scaling, fused_batch_norm, + freeze_batch_norm_delay): """Tests that running folded and unfolded BN returns the same results. Args: @@ -336,6 +352,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): inputs to just before Relu*. has_scaling: Bool, when true the batch norm has scaling. fused_batch_norm: Bool, when true the batch norm is fused. + freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance """ random_seed.set_random_seed(1234) unfolded_g = ops.Graph() @@ -361,11 +379,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): if with_bypass: node = math_ops.add(inputs, node, name='test/Add') relu_node = relu(node, name='test/' + relu_op_name) - folded_g = copy_graph.CopyGraph(unfolded_g) with folded_g.as_default(): - fold_batch_norms.FoldBatchNorms(folded_g) - + fold_batch_norms.FoldBatchNorms( + folded_g, + is_training=True, + freeze_batch_norm_delay=freeze_batch_norm_delay) with session.Session(graph=unfolded_g) as sess: sess.run(variables.global_variables_initializer()) grad_node = gradients.gradients(relu_node, inputs) diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py index e3581cc5590..b458f039df0 100644 --- a/tensorflow/contrib/quantize/python/graph_matcher.py +++ b/tensorflow/contrib/quantize/python/graph_matcher.py @@ -18,8 +18,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import abc -class OpTypePattern(object): + +class Pattern(object): + """The parent class of all patterns (e.g. 
OpTypePattern and OneofPattern).""" + + @abc.abstractmethod + def match(self, op, tensor): + """Returns the result of matching op/tensor against this pattern.""" + raise NotImplementedError('Method "match" not implemented.') + + +class OpTypePattern(Pattern): """A tree pattern that matches TF expressions with certain op types.""" def __init__(self, op_type, name=None, inputs=None): @@ -34,7 +45,7 @@ class OpTypePattern(object): similar TF op types. name: Optional string. The name of the pattern that can be looked up in MatchResult. - inputs: Optional list of `OpTypePattern`s or strings that specify the + inputs: Optional list of `Pattern`s or strings that specify the patterns for the inputs of a matching op. If None, this pattern accepts any inputs of a matching op. """ @@ -43,22 +54,51 @@ class OpTypePattern(object): if inputs is None: inputs = [] self._inputs = [ - input_pattern if isinstance(input_pattern, OpTypePattern) else - OpTypePattern(input_pattern) for input_pattern in inputs + input_pattern + if isinstance(input_pattern, Pattern) else OpTypePattern(input_pattern) + for input_pattern in inputs ] - @property - def op_type(self): - return self._op_type - - @property - def inputs(self): - return self._inputs - @property def name(self): return self._name + def match(self, op, tensor): + if self._op_type != '*': + if op.type not in self._op_type.split('|'): + return None + + match_result = MatchResult() + match_result.add(self, op, tensor) + + if not self._inputs: + # If pattern.inputs is empty, skips the rest and accepts all the inputs. + return match_result + + if len(op.inputs) != len(self._inputs): + return None + + for input_tensor, input_pattern in zip(op.inputs, self._inputs): + input_match_result = input_pattern.match(input_tensor.op, input_tensor) + if input_match_result is None: + return None + match_result.merge_from(input_match_result) + return match_result + + +class OneofPattern(Pattern): + """Matches one of the given sub-patterns.""" + + def __init__(self, sub_patterns): + self._sub_patterns = sub_patterns + + def match(self, op, tensor): + for sub_pattern in self._sub_patterns: + match_result = sub_pattern.match(op, tensor) + if match_result is not None: + return match_result + return None + class MatchResult(object): r"""Encapsulates the result of a match done by GraphMatcher. @@ -102,16 +142,36 @@ class MatchResult(object): return pattern_or_name if isinstance(pattern_or_name, str): + if pattern_or_name not in self._name_to_pattern: + return None return self._name_to_pattern[pattern_or_name] raise ValueError('pattern_or_name has type %s. Expect OpTypePattern or str.' 
% type(pattern_or_name)) + def _get_op_tensor(self, pattern_or_name): + pattern = self._to_pattern(pattern_or_name) + if pattern is None: + return None + + if pattern not in self._pattern_to_op_tensor: + return None + + return self._pattern_to_op_tensor[pattern] + def get_op(self, pattern_or_name): - return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][0] + op_tensor = self._get_op_tensor(pattern_or_name) + return op_tensor[0] if op_tensor else None def get_tensor(self, pattern_or_name): - return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][1] + op_tensor = self._get_op_tensor(pattern_or_name) + return op_tensor[1] if op_tensor else None + + def merge_from(self, other_match_result): + # pylint: disable=protected-access + self._pattern_to_op_tensor.update(other_match_result._pattern_to_op_tensor) + self._name_to_pattern.update(other_match_result._name_to_pattern) + # pylint: enable=protected-access class GraphMatcher(object): @@ -121,7 +181,7 @@ class GraphMatcher(object): """Initializes a GraphMatcher. Args: - pattern: The `OpTypePattern` against which `GraphMatcher` matches + pattern: The `Pattern` against which `GraphMatcher` matches subgraphs. """ self._pattern = pattern @@ -133,7 +193,7 @@ class GraphMatcher(object): with key `pattern`. Args: - pattern: An `OpTypePattern`. + pattern: An `Pattern`. op: A `tf.Operation` to match against the pattern. tensor: the output `tf.Tensor` of `op` that is used by the matching op of `pattern`'s parent. Can be None if `pattern` is already the root of the @@ -142,20 +202,11 @@ class GraphMatcher(object): Returns: True if an TF expression rooted at `op` matches `pattern`. """ - if pattern.op_type != '*': - if op.type not in pattern.op_type.split('|'): - return False - - self._match_result.add(pattern, op, tensor) - - if not pattern.inputs: - # If pattern.inputs is empty, skips the rest and accepts all the inputs. - return True - - return len(op.inputs) == len(pattern.inputs) and all([ - self._match_pattern(input_pattern, input_tensor.op, input_tensor) - for input_tensor, input_pattern in zip(op.inputs, pattern.inputs) - ]) + match_result = pattern.match(op, tensor) + if match_result is None: + return False + self._match_result.merge_from(match_result) + return True def match_op(self, op): """Matches `op` against `self._pattern`. 
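The graph_matcher refactor above moves matching into a polymorphic Pattern.match method, adds OneofPattern, and makes MatchResult.get_op/get_tensor return None for patterns or names that did not participate in a match. A short, hedged usage sketch written against the interface introduced in this diff follows; the pattern shapes mirror _FindFusedBatchNorms, and `graph` stands in for whatever tf.Graph you want to scan.

from tensorflow.contrib.quantize.python import graph_matcher

conv_pattern = graph_matcher.OpTypePattern(
    'Conv2D|DepthwiseConv2dNative', inputs=['*', '*'])
bn_pattern = graph_matcher.OpTypePattern(
    'FusedBatchNorm', inputs=[conv_pattern, '*', '*', '*', '*'])
# OneofPattern returns the first alternative that matches, so optional
# structure (e.g. a trailing Reshape after a MatMul batch norm) can be
# expressed as a list of alternatives instead of a second matcher.
root_pattern = graph_matcher.OneofPattern([
    graph_matcher.OpTypePattern(
        'Reshape', name='output_reshape', inputs=[bn_pattern, '*']),
    bn_pattern,
])

matcher = graph_matcher.GraphMatcher(root_pattern)
for match_result in matcher.match_graph(graph):
  bn_op = match_result.get_op(bn_pattern)
  # With the new MatchResult behavior this is None, rather than an error,
  # when the bare batch-norm alternative matched and no Reshape was present.
  reshape_op = match_result.get_op('output_reshape')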
diff --git a/tensorflow/contrib/quantize/python/graph_matcher_test.py b/tensorflow/contrib/quantize/python/graph_matcher_test.py index e1572865e42..6d587572181 100644 --- a/tensorflow/contrib/quantize/python/graph_matcher_test.py +++ b/tensorflow/contrib/quantize/python/graph_matcher_test.py @@ -105,7 +105,7 @@ class GraphMatcherTest(test_util.TensorFlowTestCase): self.assertEqual(match_result.get_op(y1_pattern), y1.op) self.assertEqual(match_result.get_tensor(y1_pattern), y1) - def test_oneof_pattern(self): + def test_oneof_type_pattern(self): # - + # / \ / \ # x y z @@ -125,6 +125,44 @@ class GraphMatcherTest(test_util.TensorFlowTestCase): for match_result in matcher.match_graph(g) ], [plus.op, minus.op]) + def test_oneof_pattern(self): + reshape_pattern = graph_matcher.OpTypePattern('Reshape') + transpose_pattern = graph_matcher.OneofPattern([ + graph_matcher.OpTypePattern( + 'Transpose', + name='transpose', + inputs=[ + graph_matcher.OpTypePattern( + 'Slice', name='slice', inputs=[reshape_pattern, '*', '*']), + '*' + ]), + graph_matcher.OpTypePattern( + 'Transpose', name='transpose', inputs=[reshape_pattern, '*']) + ]) + + matcher = graph_matcher.GraphMatcher(transpose_pattern) + + g = ops.Graph() + with g.as_default(): + inputs = array_ops.placeholder(dtypes.float32, shape=[6]) + reshape = array_ops.reshape(inputs, [2, 3]) + transpose = array_ops.transpose(reshape) + [match_result] = list(matcher.match_graph(g)) + self.assertEqual(match_result.get_tensor(reshape_pattern), reshape) + self.assertEqual(match_result.get_tensor('slice'), None) + self.assertEqual(match_result.get_op('transpose'), transpose.op) + + g = ops.Graph() + with g.as_default(): + inputs = array_ops.placeholder(dtypes.float32, shape=[6]) + reshape = array_ops.reshape(inputs, [2, 3]) + slicing = array_ops.slice(reshape, [0, 0], [-1, -1]) + transpose = array_ops.transpose(slicing) + [match_result] = list(matcher.match_graph(g)) + self.assertEqual(match_result.get_tensor(reshape_pattern), reshape) + self.assertEqual(match_result.get_tensor('slice'), slicing) + self.assertEqual(match_result.get_op('transpose'), transpose.op) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py index bbd9743d801..89b744c5591 100644 --- a/tensorflow/contrib/quantize/python/quantize_graph.py +++ b/tensorflow/contrib/quantize/python/quantize_graph.py @@ -52,9 +52,19 @@ def _create_graph(input_graph, """ # TODO(suharshs): Describe the process in more detail in the doc string. g = copy_graph.CopyGraph(input_graph) + if is_training: + # TODO(raghuramank): Need to make freeze_batch_norm_delay + # a function of the batch size. For now setting this to 250 epochs + # This corresponds to 5 million steps at a batch size of 64. 
+ freeze_batch_norm_delay = 5000000 + else: + freeze_batch_norm_delay = None with g.as_default(): with ops.device(device_name_or_function): - fold_batch_norms.FoldBatchNorms(g) + fold_batch_norms.FoldBatchNorms( + g, + freeze_batch_norm_delay=freeze_batch_norm_delay, + is_training=is_training) quantize.Quantize(g, is_training=is_training) if elements is None: return g diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py index 44998b3b659..bc383a80349 100644 --- a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py +++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py @@ -35,20 +35,34 @@ _VALID_PADDING = ["VALID", b"VALID"] _SAME_PADDING = ["SAME", b"SAME"] -def _stride_size(node): +def _stride_size(node, name_to_node): """Computes stride size given a TF node. Args: node: Tensorflow node (NodeDef proto). + name_to_node: For MaxPoolV2, mapping from variable name Tensorflow node. Returns: stride_x: Stride size for horizontal direction (integer). stride_y: Stride size for vertical direction (integer). + + Raises: + ValueError: If stride input cannot be found in `name_to_node`. """ - strides_attr = node.attr["strides"] - logging.vlog(4, "strides_attr = %s", strides_attr) - stride_y = strides_attr.list.i[1] - stride_x = strides_attr.list.i[2] + if node.op == "MaxPoolV2": + strides_input_name = node.input[2] + if not strides_input_name.endswith("/strides"): + raise ValueError("Strides name does not end with '/strides'") + strides_node = name_to_node[strides_input_name] + value = strides_node.attr["value"] + t = make_ndarray(value.tensor) + stride_y = t[1] + stride_x = t[2] + else: + strides_attr = node.attr["strides"] + logging.vlog(4, "strides_attr = %s", strides_attr) + stride_y = strides_attr.list.i[1] + stride_x = strides_attr.list.i[2] return stride_x, stride_y @@ -144,11 +158,12 @@ def _padding_size_conv_pool(node, kernel_size, stride, input_resolution=None): return total_padding, padding -def _pool_kernel_size(node): +def _pool_kernel_size(node, name_to_node): """Computes kernel size given a TF pooling node. Args: node: Tensorflow node (NodeDef proto). + name_to_node: For MaxPoolV2, mapping from node name to NodeDef. Returns: kernel_size_x: Kernel size for horizontal direction (integer). @@ -157,13 +172,27 @@ def _pool_kernel_size(node): Raises: ValueError: If pooling is invalid. 
""" - ksize = node.attr["ksize"] - kernel_size_y = ksize.list.i[1] - kernel_size_x = ksize.list.i[2] - if ksize.list.i[0] != 1: - raise ValueError("pool ksize for first dim is not 1") - if ksize.list.i[3] != 1: - raise ValueError("pool ksize for last dim is not 1") + if node.op == "MaxPoolV2": + ksize_input_name = node.input[1] + if not ksize_input_name.endswith("/ksize"): + raise ValueError("Kernel size name does not end with '/ksize'") + ksize_node = name_to_node[ksize_input_name] + value = ksize_node.attr["value"] + t = make_ndarray(value.tensor) + kernel_size_y = t[1] + kernel_size_x = t[2] + if t[0] != 1: + raise ValueError("pool ksize for first dim is not 1") + if t[3] != 1: + raise ValueError("pool ksize for last dim is not 1") + else: + ksize = node.attr["ksize"] + kernel_size_y = ksize.list.i[1] + kernel_size_x = ksize.list.i[2] + if ksize.list.i[0] != 1: + raise ValueError("pool ksize for first dim is not 1") + if ksize.list.i[3] != 1: + raise ValueError("pool ksize for last dim is not 1") return kernel_size_x, kernel_size_y @@ -243,7 +272,7 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False): logging.vlog(3, "node.op = %s", node.op) logging.vlog(4, "node = %s", node) if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative": - stride_x, stride_y = _stride_size(node) + stride_x, stride_y = _stride_size(node, name_to_node) kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node) # Compute the padding for this node separately for each direction. total_padding_x, padding_x = _padding_size_conv_pool( @@ -260,9 +289,9 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False): stride_y = 1 total_padding_x, padding_x, total_padding_y, padding_y = ( _padding_size_pad_layer(node, name_to_node)) - elif node.op == "MaxPool" or node.op == "AvgPool": - stride_x, stride_y = _stride_size(node) - kernel_size_x, kernel_size_y = _pool_kernel_size(node) + elif node.op == "MaxPool" or node.op == "MaxPoolV2" or node.op == "AvgPool": + stride_x, stride_y = _stride_size(node, name_to_node) + kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node) # Compute the padding for this node separately for each direction. total_padding_x, padding_x = _padding_size_conv_pool( node, kernel_size_x, stride_x, input_resolution[1] diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc index c33804906fc..2def4f3f176 100644 --- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc +++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc @@ -15,8 +15,8 @@ limitations under the License. #define EIGEN_USE_THREADS -#include #include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h" +#include #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h index 9bb1724a2c0..d8c0a0631d3 100644 --- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h +++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h @@ -16,10 +16,10 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_ #define TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_ +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_types.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #define Sum(a, b) ((a) + (b)) #define Prod(a, b) ((a) * (b)) @@ -58,11 +58,11 @@ inline T negative_infinity() { } // namespace reduce_functions -#define CALL_ALL_REDUCEOPS(func, ...) \ - func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__) \ - func(Prod, functor::reduce_functions::one, ##__VA_ARGS__) \ - func(Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__) \ - func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__) +#define CALL_ALL_REDUCEOPS(func, ...) \ + func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__) \ + func(Prod, functor::reduce_functions::one, ##__VA_ARGS__) func( \ + Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__) \ + func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__) #define ReduceSliceFunctorReduceop(reduceop, dummy) \ template \ diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc index 501cddb8c8f..9f2be03d718 100644 --- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc +++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc @@ -17,10 +17,10 @@ limitations under the License. #define EIGEN_USE_GPU +#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h" #include "tensorflow/core/util/cuda_kernel_helper.h" namespace tensorflow { diff --git a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc index b8b56c0e229..92879ab5356 100644 --- a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc +++ b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc @@ -87,9 +87,9 @@ and 'indices' is [[0,1] [1,1] [0,2]], -the the output will be [[ 1, 2, 3] - [ 0, 0, 0] - [41,52,63]]. +the output will be [[ 1, 2, 3] + [ 0, 0, 0] + [41,52,63]]. ``` The data must be at least rank 1. The indices must be of shape (?,2) where the @@ -132,9 +132,9 @@ and 'indices' is [[0,1] [1,1] [0,2]], -the the output will be [[ 1, 2, 3] - [ 1, 1, 1] - [40,100,180]]. +the output will be [[ 1, 2, 3] + [ 1, 1, 1] + [40,100,180]]. ``` The data must be at least rank 1. The indices can be of shape (?,2) where the @@ -189,9 +189,9 @@ and 'indices' is [[0,1] [1,1] [0,2]], -the the output will be [[ 1, 20, 3] - [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE] - [ 400, 20, 60]]. +the output will be [[ 1, 20, 3] + [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE] + [ 400, 20, 60]]. ``` The data must be at least rank 1. The indices can be of shape (?,2) where the @@ -246,9 +246,9 @@ and 'indices' is [[0,1] [1,1] [0,2]], -the the output will be [[ 1, 20, 3] - [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE] - [ 1, 5, 3]]. +the output will be [[ 1, 20, 3] + [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE] + [ 1, 5, 3]]. ``` The data must be at least rank 1. 
The indices can be of shape (?,2) where the diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc index e02c1b6a2bd..63c72836d79 100644 --- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc +++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc @@ -36,17 +36,12 @@ using GPUDevice = Eigen::GpuDevice; namespace functor { template -struct Resampler2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const CPUDevice& d, - const T* __restrict__ data, - const T* __restrict__ warp, - T* __restrict__ output, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points){ +struct Resampler2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d, + const T* __restrict__ data, const T* __restrict__ warp, + T* __restrict__ output, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points) { const int warp_batch_stride = num_sampling_points * 2; const int data_batch_stride = data_height * data_width * data_channels; const int output_batch_stride = num_sampling_points * data_channels; @@ -59,24 +54,19 @@ struct Resampler2DFunctor{ // The functions take care of performing the relevant pointer // arithmetics abstracting away the low level details in the // main loop over samples. Note that data is stored in NHWC format. - auto set_output = [&](const int sample_id, - const int channel, + auto set_output = [&](const int sample_id, const int channel, const T value) { - output[batch_id * output_batch_stride + - sample_id * data_channels + + output[batch_id * output_batch_stride + sample_id * data_channels + channel] = value; }; - auto get_data_point = [&](const int x, - const int y, - const int chan) { + auto get_data_point = [&](const int x, const int y, const int chan) { const bool point_is_in_range = (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1); return point_is_in_range - ? data[batch_id * data_batch_stride + - data_channels * (y * data_width + x) + - chan] - : zero; + ? data[batch_id * data_batch_stride + + data_channels * (y * data_width + x) + chan] + : zero; }; for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) { @@ -89,8 +79,7 @@ struct Resampler2DFunctor{ // The effect is that the sampled signal smoothly goes to 0 outside // the original input domain, rather than presenting a jump // discontinuity at the image boundaries. - if (x > static_cast(-1.0) && - y > static_cast(-1.0) && + if (x > static_cast(-1.0) && y > static_cast(-1.0) && x < static_cast(data_width) && y < static_cast(data_height)) { // Precompute floor (f) and ceil (c) values for x and y. 
@@ -103,12 +92,10 @@ struct Resampler2DFunctor{ for (int chan = 0; chan < data_channels; ++chan) { const T img_fxfy = dx * dy * get_data_point(fx, fy, chan); - const T img_cxcy = (one - dx) * (one - dy) * - get_data_point(cx, cy, chan); - const T img_fxcy = dx * (one - dy) * - get_data_point(fx, cy, chan); - const T img_cxfy = (one - dx) * dy * - get_data_point(cx, fy, chan); + const T img_cxcy = + (one - dx) * (one - dy) * get_data_point(cx, cy, chan); + const T img_fxcy = dx * (one - dy) * get_data_point(fx, cy, chan); + const T img_cxfy = (one - dx) * dy * get_data_point(cx, fy, chan); set_output(sample_id, chan, img_fxfy + img_cxcy + img_fxcy + img_cxfy); } @@ -125,8 +112,8 @@ struct Resampler2DFunctor{ // estimate of the cost of each work unit is needed to correctly shard the // workload. Shard assumes each cost unit is 1ns, minimum cost per shard // being 10us. - const int64 cost = static_cast(num_sampling_points) * - data_channels * 1000; + const int64 cost = + static_cast(num_sampling_points) * data_channels * 1000; auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, batch_size, cost, resample_batches); @@ -138,8 +125,8 @@ struct Resampler2DFunctor{ template class ResamplerOp : public ::tensorflow::OpKernel { public: - explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) : - ::tensorflow::OpKernel(context) {} + explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) + : ::tensorflow::OpKernel(context) {} void Compute(::tensorflow::OpKernelContext* ctx) override { const ::tensorflow::Tensor& data = ctx->input(0); @@ -158,16 +145,17 @@ class ResamplerOp : public ::tensorflow::OpKernel { ::tensorflow::errors::InvalidArgument( "warp should be at least a matrix, got shape ", warp_shape.DebugString())); - OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2, + OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2, ::tensorflow::errors::Unimplemented( "Only bilinear interpolation is supported, warping " "coordinates must be 2D; warp shape last entry should be " - "2, but shape vector is: ", warp_shape.DebugString())); + "2, but shape vector is: ", + warp_shape.DebugString())); OP_REQUIRES(ctx, data_shape.dim_size(0) == warp_shape.dim_size(0), ::tensorflow::errors::InvalidArgument( "Batch size of data and warp tensor must be the same, but " - "input shapes are: ", data_shape.DebugString(), ", ", - warp_shape.DebugString())); + "input shapes are: ", + data_shape.DebugString(), ", ", warp_shape.DebugString())); const int batch_size = data_shape.dim_size(0); const int data_height = data_shape.dim_size(1); const int data_width = data_shape.dim_size(2); @@ -180,16 +168,10 @@ class ResamplerOp : public ::tensorflow::OpKernel { // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU. 
if (num_sampling_points > 0) { - functor::Resampler2DFunctor()(ctx, - ctx->eigen_device(), - data.flat().data(), - warp.flat().data(), - output->flat().data(), - batch_size, - data_height, - data_width, - data_channels, - num_sampling_points); + functor::Resampler2DFunctor()( + ctx, ctx->eigen_device(), data.flat().data(), + warp.flat().data(), output->flat().data(), batch_size, + data_height, data_width, data_channels, num_sampling_points); } } @@ -197,12 +179,9 @@ class ResamplerOp : public ::tensorflow::OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ResamplerOp); }; - -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Resampler") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Resampler").Device(DEVICE_CPU).TypeConstraint("T"), \ ResamplerOp); TF_CALL_half(REGISTER); @@ -211,40 +190,32 @@ TF_CALL_double(REGISTER); #undef REGISTER #if GOOGLE_CUDA -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER(Name("Resampler") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - ResamplerOp) +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Resampler").Device(DEVICE_GPU).TypeConstraint("T"), \ + ResamplerOp) TF_CALL_float(REGISTER); TF_CALL_double(REGISTER); #undef REGISTER #endif // GOOGLE_CUDA - namespace functor { template -struct ResamplerGrad2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const CPUDevice& d, - const T* __restrict__ data, - const T* __restrict__ warp, - const T* __restrict__ grad_output, - T* __restrict__ grad_data, - T* __restrict__ grad_warp, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points){ +struct ResamplerGrad2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d, + const T* __restrict__ data, const T* __restrict__ warp, + const T* __restrict__ grad_output, T* __restrict__ grad_data, + T* __restrict__ grad_warp, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points) { // Set gradients to 0, because the kernel incrementally updates the // tensor entries by adding partial contributions. - const int resampler_output_size = batch_size * num_sampling_points * - data_channels; + const int resampler_output_size = + batch_size * num_sampling_points * data_channels; const int grad_warp_size = resampler_output_size / data_channels * 2; - const int grad_data_size = data_height * data_width * data_channels * - batch_size; + const int grad_data_size = + data_height * data_width * data_channels * batch_size; memset(grad_data, 0, sizeof(T) * grad_data_size); memset(grad_warp, 0, sizeof(T) * grad_warp_size); @@ -260,35 +231,29 @@ struct ResamplerGrad2DFunctor{ // The functions take care of performing the relevant pointer // arithmetics abstracting away the low level details in the // main loop over samples. Note that data is stored in NHWC format. - auto get_data_point = [&](const int x, - const int y, - const int chan) { + auto get_data_point = [&](const int x, const int y, const int chan) { const bool point_is_in_range = - (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1); + (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1); return point_is_in_range - ? data[batch_id * data_batch_stride + - data_channels * (y * data_width + x) + - chan] - : zero; + ? 
data[batch_id * data_batch_stride + + data_channels * (y * data_width + x) + chan] + : zero; }; auto update_grad_data = [&](const int x, const int y, const int chan, const T value) { const bool point_is_in_range = (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1); - if (point_is_in_range){ + if (point_is_in_range) { grad_data[batch_id * data_batch_stride + - data_channels * (y * data_width + x) + - chan] += value; + data_channels * (y * data_width + x) + chan] += value; } }; - auto update_grad_warp = [&](const int sample_id, - const int channel, + auto update_grad_warp = [&](const int sample_id, const int channel, const T value) { - grad_warp[batch_id * warp_batch_stride + - sample_id * 2 + - channel] += value; + grad_warp[batch_id * warp_batch_stride + sample_id * 2 + channel] += + value; }; for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) { @@ -301,8 +266,7 @@ struct ResamplerGrad2DFunctor{ // The effect is that the sampled signal smoothly goes to 0 outside // the original input domain, rather than presenting a jump // discontinuity at the image boundaries. - if (x > static_cast(-1.0) && - y > static_cast(-1.0) && + if (x > static_cast(-1.0) && y > static_cast(-1.0) && x < static_cast(data_width) && y < static_cast(data_height)) { // Precompute floor (f) and ceil (c) values for x and y. @@ -316,27 +280,25 @@ struct ResamplerGrad2DFunctor{ for (int chan = 0; chan < data_channels; ++chan) { const T grad_output_value = grad_output[batch_id * output_batch_stride + - sample_id * data_channels + - chan]; + sample_id * data_channels + chan]; const T img_fxfy = get_data_point(fx, fy, chan); const T img_cxcy = get_data_point(cx, cy, chan); const T img_fxcy = get_data_point(fx, cy, chan); const T img_cxfy = get_data_point(cx, fy, chan); // Update partial gradients wrt relevant warp field entries - update_grad_warp(sample_id, 0, - grad_output_value * - ((one - dy) * (img_cxcy - img_fxcy) + - dy * (img_cxfy - img_fxfy))); + update_grad_warp( + sample_id, 0, + grad_output_value * ((one - dy) * (img_cxcy - img_fxcy) + + dy * (img_cxfy - img_fxfy))); - update_grad_warp(sample_id, 1, - grad_output_value * - ((one - dx) * (img_cxcy - img_cxfy) + - dx * (img_fxcy - img_fxfy))); + update_grad_warp( + sample_id, 1, + grad_output_value * ((one - dx) * (img_cxcy - img_cxfy) + + dx * (img_fxcy - img_fxfy))); // Update partial gradients wrt sampled data - update_grad_data(fx, fy, chan, - grad_output_value * dx * dy); + update_grad_data(fx, fy, chan, grad_output_value * dx * dy); update_grad_data(cx, cy, chan, grad_output_value * (one - dx) * (one - dy)); update_grad_data(fx, cy, chan, @@ -355,8 +317,8 @@ struct ResamplerGrad2DFunctor{ // being 10us. // TODO(fviola): Check out if there is a better way of doing this. 
auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); - const int64 cost = static_cast(num_sampling_points) * - data_channels * 1000; + const int64 cost = + static_cast(num_sampling_points) * data_channels * 1000; ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers, batch_size, cost, update_grads_for_batches); } @@ -364,12 +326,11 @@ struct ResamplerGrad2DFunctor{ } // namespace functor - template class ResamplerGradOp : public ::tensorflow::OpKernel { public: - explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) : - ::tensorflow::OpKernel(context) {} + explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) + : ::tensorflow::OpKernel(context) {} void Compute(::tensorflow::OpKernelContext* ctx) override { const ::tensorflow::Tensor& data = ctx->input(0); @@ -383,7 +344,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel { "tensor must be a batch of 2d data; data shape should have " "4 entries corresponding to [batch_size, data_height, " "data_width, data_channels], but is: ", - data_shape.DebugString())); + data_shape.DebugString())); const int batch_size = data_shape.dim_size(0); const int data_height = data_shape.dim_size(1); const int data_width = data_shape.dim_size(2); @@ -394,7 +355,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel { ::tensorflow::errors::InvalidArgument( "warp should be at least a matrix, got shape ", warp_shape.DebugString())); - OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2, + OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2, ::tensorflow::errors::Unimplemented( "Only bilinear interpolation is supported, warping " "coordinates must be 2D; warp shape last entry should be " @@ -417,18 +378,11 @@ class ResamplerGradOp : public ::tensorflow::OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(1, warp.shape(), &grad_warp)); // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU. 
if (num_sampling_points > 0) { - functor::ResamplerGrad2DFunctor()(ctx, - ctx->eigen_device(), - data.flat().data(), - warp.flat().data(), - grad_output.flat().data(), - grad_data->flat().data(), - grad_warp->flat().data(), - batch_size, - data_height, - data_width, - data_channels, - num_sampling_points); + functor::ResamplerGrad2DFunctor()( + ctx, ctx->eigen_device(), data.flat().data(), + warp.flat().data(), grad_output.flat().data(), + grad_data->flat().data(), grad_warp->flat().data(), batch_size, + data_height, data_width, data_channels, num_sampling_points); } } @@ -436,11 +390,9 @@ class ResamplerGradOp : public ::tensorflow::OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ResamplerGradOp); }; -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("ResamplerGrad") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("ResamplerGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ResamplerGradOp); TF_CALL_half(REGISTER); @@ -449,11 +401,10 @@ TF_CALL_double(REGISTER); #undef REGISTER #if GOOGLE_CUDA -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER(Name("ResamplerGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - ResamplerGradOp) +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("ResamplerGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + ResamplerGradOp) // Disable half and double precision since atomicAdds are not supported // TF_CALL_half(REGISTER); // TF_CALL_double(REGISTER); diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h index 85d3676efac..7fe3b9c0df7 100644 --- a/tensorflow/contrib/resampler/kernels/resampler_ops.h +++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h @@ -29,38 +29,25 @@ namespace functor { // Helper functor for the Resampler Op in 2D template -struct Resampler2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const Device& d, - const T* __restrict__ data, - const T* __restrict__ warp, - T* __restrict__ output, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points); +struct Resampler2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const Device& d, + const T* __restrict__ data, const T* __restrict__ warp, + T* __restrict__ output, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points); }; - // Helper functor for the Resampler Gradient Op in 2D template -struct ResamplerGrad2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const Device& d, - const T* __restrict__ data, - const T* __restrict__ warp, - const T* __restrict__ grad_output, - T* __restrict__ grad_data, - T* __restrict__ grad_warp, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points); +struct ResamplerGrad2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const Device& d, + const T* __restrict__ data, const T* __restrict__ warp, + const T* __restrict__ grad_output, T* __restrict__ grad_data, + T* __restrict__ grad_warp, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points); }; - } // namespace functor } // namespace tensorflow diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc index 
636847a212f..3c07051f685 100644 --- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc +++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc @@ -31,18 +31,15 @@ using GPUDevice = Eigen::GpuDevice; namespace { -#define GET_DATA_POINT(x, y) \ - data[batch_id * data_batch_stride + \ - data_channels * (y * data_width + x) + \ +#define GET_DATA_POINT(x, y) \ + data[batch_id * data_batch_stride + data_channels * (y * data_width + x) + \ chan] template __global__ void Resampler2DKernel(const T* __restrict__ data, const T* __restrict__ warp, - T* __restrict__ output, - const int batch_size, - const int data_height, - const int data_width, + T* __restrict__ output, const int batch_size, + const int data_height, const int data_width, const int data_channels, const int num_sampling_points) { const int output_data_size = batch_size * num_sampling_points * data_channels; @@ -75,10 +72,8 @@ __global__ void Resampler2DKernel(const T* __restrict__ data, // The effect is that the sampled signal smoothly goes to 0 outside // the original input domain, rather than presenting a jump // discontinuity at the image boundaries. - if (x > static_cast(-1.0) && - y > static_cast(-1.0) && - x < static_cast(data_width) && - y < static_cast(data_height)) { + if (x > static_cast(-1.0) && y > static_cast(-1.0) && + x < static_cast(data_width) && y < static_cast(data_height)) { // Precompute floor (f) and ceil (c) values for x and y. const int fx = std::floor(static_cast(x)); const int fy = std::floor(static_cast(y)); @@ -87,21 +82,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data, const T dx = static_cast(cx) - x; const T dy = static_cast(cy) - y; - const T img_fxfy = (fx >= 0 && fy >= 0) - ? dx * dy * GET_DATA_POINT(fx, fy) - : zero; + const T img_fxfy = + (fx >= 0 && fy >= 0) ? dx * dy * GET_DATA_POINT(fx, fy) : zero; const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1) - ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy) - : zero; + ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy) + : zero; const T img_fxcy = (fx >= 0 && cy <= data_height - 1) - ? dx * (one - dy) * GET_DATA_POINT(fx, cy) - : zero; + ? dx * (one - dy) * GET_DATA_POINT(fx, cy) + : zero; const T img_cxfy = (cx <= data_width - 1 && fy >= 0) - ? (one - dx) * dy * GET_DATA_POINT(cx, fy) - : zero; + ? 
(one - dx) * dy * GET_DATA_POINT(cx, fy) + : zero; output[out_index] = img_fxfy + img_cxcy + img_fxcy + img_cxfy; } else { @@ -115,24 +109,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data, namespace functor { template -struct Resampler2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const GPUDevice& d, - const T* __restrict__ data, - const T* __restrict__ warp, - T* __restrict__ output, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points) { - const int output_data_size = batch_size * num_sampling_points * data_channels; - ::tensorflow::CudaLaunchConfig config = - ::tensorflow::GetCudaLaunchConfig(output_data_size, d); - Resampler2DKernel - <<>>( - data, warp, output, batch_size, data_height, data_width, - data_channels, num_sampling_points); +struct Resampler2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d, + const T* __restrict__ data, const T* __restrict__ warp, + T* __restrict__ output, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points) { + const int output_data_size = + batch_size * num_sampling_points * data_channels; + ::tensorflow::CudaLaunchConfig config = + ::tensorflow::GetCudaLaunchConfig(output_data_size, d); + Resampler2DKernel + <<>>( + data, warp, output, batch_size, data_height, data_width, + data_channels, num_sampling_points); } }; @@ -145,26 +135,20 @@ template struct Resampler2DFunctor; namespace { -#define UPDATE_GRAD_DATA_POINT(x, y, v) \ - atomicAdd(grad_data + (batch_id * data_batch_stride + \ - data_channels * (y * data_width + x) + \ - chan), \ +#define UPDATE_GRAD_DATA_POINT(x, y, v) \ + atomicAdd(grad_data + (batch_id * data_batch_stride + \ + data_channels * (y * data_width + x) + chan), \ v) - template -__global__ void ResamplerGrad2DKernel(const T* __restrict__ data, - const T* __restrict__ warp, - const T* __restrict__ grad_output, - T* __restrict__ grad_data, - T* __restrict__ grad_warp, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points) { - const int resampler_output_size = batch_size * num_sampling_points * - data_channels; +__global__ void ResamplerGrad2DKernel( + const T* __restrict__ data, const T* __restrict__ warp, + const T* __restrict__ grad_output, T* __restrict__ grad_data, + T* __restrict__ grad_warp, const int batch_size, const int data_height, + const int data_width, const int data_channels, + const int num_sampling_points) { + const int resampler_output_size = + batch_size * num_sampling_points * data_channels; CUDA_1D_KERNEL_LOOP(index, resampler_output_size) { const int out_index = index; @@ -199,10 +183,8 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data, // The effect is that the sampled signal smoothly goes to 0 outside // the original input domain, rather than presenting a jump // discontinuity at the image boundaries. - if (x > static_cast(-1.0) && - y > static_cast(-1.0) && - x < static_cast(data_width) && - y < static_cast(data_height)) { + if (x > static_cast(-1.0) && y > static_cast(-1.0) && + x < static_cast(data_width) && y < static_cast(data_height)) { // Precompute floor (f) and ceil (c) values for x and y. 
const int fx = std::floor(static_cast(x)); const int fy = std::floor(static_cast(y)); @@ -211,21 +193,17 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data, const T dx = static_cast(cx) - x; const T dy = static_cast(cy) - y; - const T img_fxfy = (fx >= 0 && fy >= 0) - ? GET_DATA_POINT(fx, fy) - : zero; + const T img_fxfy = (fx >= 0 && fy >= 0) ? GET_DATA_POINT(fx, fy) : zero; const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1) - ? GET_DATA_POINT(cx, cy) - : zero; + ? GET_DATA_POINT(cx, cy) + : zero; - const T img_fxcy = (fx >= 0 && cy <= data_height - 1) - ? GET_DATA_POINT(fx, cy) - : zero; + const T img_fxcy = + (fx >= 0 && cy <= data_height - 1) ? GET_DATA_POINT(fx, cy) : zero; - const T img_cxfy = (cx <= data_width - 1 && fy >= 0) - ? GET_DATA_POINT(cx, fy) - : zero; + const T img_cxfy = + (cx <= data_width - 1 && fy >= 0) ? GET_DATA_POINT(cx, fy) : zero; // Update partial gradients wrt relevant warp field entries atomicAdd(grad_warp + warp_id_x, @@ -241,7 +219,7 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data, } if (cx <= data_width - 1 && cy <= data_height - 1) { UPDATE_GRAD_DATA_POINT(cx, cy, - grad_output_value * (one - dx) * (one - dy)); + grad_output_value * (one - dx) * (one - dy)); } if (fx >= 0 && cy <= data_height - 1) { UPDATE_GRAD_DATA_POINT(fx, cy, grad_output_value * dx * (one - dy)); @@ -261,43 +239,37 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data, namespace functor { template -struct ResamplerGrad2DFunctor{ - void operator ()(::tensorflow::OpKernelContext* ctx, - const GPUDevice& d, - const T* __restrict__ data, - const T* __restrict__ warp, - const T* __restrict__ grad_output, - T* __restrict__ grad_data, - T* __restrict__ grad_warp, - const int batch_size, - const int data_height, - const int data_width, - const int data_channels, - const int num_sampling_points) { - // Set gradients to 0, because the kernel incrementally updates the - // tensor entries by adding partial contributions. - const int grad_warp_size = batch_size * num_sampling_points * 2; - const int grad_data_size = batch_size * data_height * data_width * - data_channels; +struct ResamplerGrad2DFunctor { + void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d, + const T* __restrict__ data, const T* __restrict__ warp, + const T* __restrict__ grad_output, T* __restrict__ grad_data, + T* __restrict__ grad_warp, const int batch_size, + const int data_height, const int data_width, + const int data_channels, const int num_sampling_points) { + // Set gradients to 0, because the kernel incrementally updates the + // tensor entries by adding partial contributions. 
+ const int grad_warp_size = batch_size * num_sampling_points * 2; + const int grad_data_size = + batch_size * data_height * data_width * data_channels; - ::tensorflow::CudaLaunchConfig config = - ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d); - ::tensorflow::SetZero - <<>>( - grad_warp_size, grad_warp); + ::tensorflow::CudaLaunchConfig config = + ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d); + ::tensorflow:: + SetZero<<>>( + grad_warp_size, grad_warp); - config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d); - ::tensorflow::SetZero - <<>>( - grad_data_size, grad_data); + config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d); + ::tensorflow:: + SetZero<<>>( + grad_data_size, grad_data); - const int resampler_output_size = batch_size * num_sampling_points * - data_channels; - config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d); - ResamplerGrad2DKernel - <<>>( - data, warp, grad_output, grad_data, grad_warp, batch_size, - data_height, data_width, data_channels, num_sampling_points); + const int resampler_output_size = + batch_size * num_sampling_points * data_channels; + config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d); + ResamplerGrad2DKernel + <<>>( + data, warp, grad_output, grad_data, grad_warp, batch_size, + data_height, data_width, data_channels, num_sampling_points); } }; diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc index e62501e9b10..03006dab323 100644 --- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc +++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc @@ -36,11 +36,10 @@ perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { namespace functor { template -void TensorCuBlasGemm::operator()(OpKernelContext* ctx, - bool transa, bool transb, uint64 m, - uint64 n, uint64 k, T alpha, const T* a, - int lda, const T* b, int ldb, T beta, T* c, - int ldc) { +void TensorCuBlasGemm::operator()(OpKernelContext* ctx, bool transa, + bool transb, uint64 m, uint64 n, uint64 k, + T alpha, const T* a, int lda, const T* b, + int ldb, T beta, T* c, int ldc) { #if GOOGLE_CUDA perftools::gputools::blas::Transpose trans[] = { perftools::gputools::blas::Transpose::kNoTranspose, diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.cc b/tensorflow/contrib/rnn/kernels/gru_ops.cc index 0796f82b214..bd3d898fb09 100644 --- a/tensorflow/contrib/rnn/kernels/gru_ops.cc +++ b/tensorflow/contrib/rnn/kernels/gru_ops.cc @@ -15,8 +15,8 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/contrib/rnn/kernels/gru_ops.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -61,9 +61,9 @@ class GRUCellBlockOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size] OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size, @@ -82,10 +82,10 @@ class GRUCellBlockOp : public OpKernel { "w_c.dim_size(0) != input_size + cell_size: ", w_c_tensor->dim_size(0), " vs. 
", input_size + cell_size)); - OP_REQUIRES( - ctx, w_c_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("w_c.dim_size(1) != cell_size: ", - w_c_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1), + " vs. ", cell_size)); // Shape of 'b_ru' must be [2*cell_size] OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2, @@ -97,10 +97,10 @@ class GRUCellBlockOp : public OpKernel { errors::InvalidArgument("Rank of b_ru must be 1", b_ru_tensor->dims(), " vs. 1", 1)); // Shape of 'b_c' must be [cell_size] - OP_REQUIRES( - ctx, b_c_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("b_c.dim_size(0) != cell_size: ", - b_c_tensor->dim_size(0), " vs. ", cell_size)); + OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0), + " vs. ", cell_size)); OP_REQUIRES(ctx, b_c_tensor->dims() == 1, errors::InvalidArgument("Rank of b_c must be 1", b_c_tensor->dims(), " vs. 1")); @@ -216,9 +216,9 @@ class GRUBlockCellGradOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size] OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size, @@ -237,10 +237,10 @@ class GRUBlockCellGradOp : public OpKernel { "w_c.dim_size(0) != input_size + cell_size: ", w_c_tensor->dim_size(0), " vs. ", input_size + cell_size)); - OP_REQUIRES( - ctx, w_c_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("w_c.dim_size(1) != cell_size: ", - w_c_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1), + " vs. ", cell_size)); // Shape of 'b_ru' must be [2*cell_size] OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2, @@ -253,54 +253,54 @@ class GRUBlockCellGradOp : public OpKernel { b_ru_tensor->dims(), " vs. 1")); // Shape of 'b_c' must be [cell_size] - OP_REQUIRES( - ctx, b_c_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("b_c.dim_size(0) != cell_size: ", - b_c_tensor->dim_size(0), " vs. ", cell_size)); + OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0), + " vs. ", cell_size)); OP_REQUIRES(ctx, b_c_tensor->dims() == 1, errors::InvalidArgument("Rank of b_c must be 1 ", b_c_tensor->dims(), " vs. 1")); // Shape of 'r' must be [batch_size, cell_size] - OP_REQUIRES( - ctx, r_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("r.dims(0) != batch_size: ", - r_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, r_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("r.dims(1) != cell_size: ", - r_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, r_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "r.dims(0) != batch_size: ", r_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, r_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "r.dims(1) != cell_size: ", r_tensor->dim_size(1), " vs. 
", + cell_size)); // Shape of 'u' must be [batch_size, cell_size] - OP_REQUIRES( - ctx, u_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("u.dims(0) != batch_size: ", - u_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, u_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("u.dims(1) != cell_size: ", - u_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, u_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "u.dims(0) != batch_size: ", u_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, u_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "u.dims(1) != cell_size: ", u_tensor->dim_size(1), " vs. ", + cell_size)); // Shape of 'c' must be [batch_size, cell_size] - OP_REQUIRES( - ctx, c_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("c.dims(0) != batch_size: ", - c_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, c_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("c.dims(1) != cell_size: ", - c_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, c_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "c.dims(0) != batch_size: ", c_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, c_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "c.dims(1) != cell_size: ", c_tensor->dim_size(1), " vs. ", + cell_size)); // Shape of 'd_h' must be [batch_size, cell_size] - OP_REQUIRES( - ctx, d_h_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("d_h.dims(0) != batch_size: ", - d_h_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, d_h_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("d_h.dims(1) != cell_size: ", - d_h_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, d_h_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "d_h.dims(0) != batch_size: ", d_h_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, d_h_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "d_h.dims(1) != cell_size: ", d_h_tensor->dim_size(1), + " vs. ", cell_size)); // Create output tensors. Tensor* d_x_tensor = nullptr; diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc index 941a457fd3a..5e7cf0ce84d 100644 --- a/tensorflow/contrib/rnn/kernels/lstm_ops.cc +++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc @@ -281,23 +281,23 @@ class LSTMBlockCellOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); - OP_REQUIRES( - ctx, w_tensor->dim_size(1) == cell_size * 4, - errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ", - w_tensor->dim_size(1), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, + errors::InvalidArgument( + "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), + " vs. ", cell_size * 4)); - OP_REQUIRES( - ctx, b_tensor->dim_size(0) == cell_size * 4, - errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ", - b_tensor->dim_size(0), " vs. 
", cell_size * 4)); + OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, + errors::InvalidArgument( + "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), + " vs. ", cell_size * 4)); // Allocate our output tensors. Tensor* i_tensor = nullptr; @@ -484,77 +484,77 @@ class LSTMBlockCellGradOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); - OP_REQUIRES( - ctx, w_tensor->dim_size(1) == cell_size * 4, - errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ", - w_tensor->dim_size(1), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, + errors::InvalidArgument( + "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), + " vs. ", cell_size * 4)); - OP_REQUIRES( - ctx, b_tensor->dim_size(0) == cell_size * 4, - errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ", - b_tensor->dim_size(0), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, + errors::InvalidArgument( + "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), + " vs. ", cell_size * 4)); - OP_REQUIRES( - ctx, i_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("i.dim_size(0) != batch_size: ", - i_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, i_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("i.dim_size(1) != cell_size: ", - i_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, i_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "i.dim_size(0) != batch_size: ", i_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, i_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "i.dim_size(1) != cell_size: ", i_tensor->dim_size(1), + " vs. ", cell_size)); - OP_REQUIRES( - ctx, cs_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("cs.dim_size(0) != batch_size: ", - cs_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, cs_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("cs.dim_size(1) != cell_size: ", - cs_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, cs_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "cs.dim_size(0) != batch_size: ", cs_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, cs_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "cs.dim_size(1) != cell_size: ", cs_tensor->dim_size(1), + " vs. ", cell_size)); - OP_REQUIRES( - ctx, f_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("f.dim_size(0) != batch_size: ", - f_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, f_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("i.dim_size(1) != cell_size: ", - f_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, f_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "f.dim_size(0) != batch_size: ", f_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, f_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "i.dim_size(1) != cell_size: ", f_tensor->dim_size(1), + " vs. 
", cell_size)); - OP_REQUIRES( - ctx, o_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("o.dim_size(0) != batch_size: ", - o_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, o_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("o.dim_size(1) != cell_size: ", - o_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, o_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "o.dim_size(0) != batch_size: ", o_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, o_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "o.dim_size(1) != cell_size: ", o_tensor->dim_size(1), + " vs. ", cell_size)); - OP_REQUIRES( - ctx, ci_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("ci.dim_size(0) != batch_size: ", - ci_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, ci_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("ci.dim_size(1) != cell_size: ", - ci_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, ci_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "ci.dim_size(0) != batch_size: ", ci_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, ci_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "ci.dim_size(1) != cell_size: ", ci_tensor->dim_size(1), + " vs. ", cell_size)); - OP_REQUIRES( - ctx, co_tensor->dim_size(0) == batch_size, - errors::InvalidArgument("co.dim_size(0) != batch_size: ", - co_tensor->dim_size(0), " vs. ", batch_size)); - OP_REQUIRES( - ctx, co_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("co.dim_size(1) != cell_size: ", - co_tensor->dim_size(1), " vs. ", cell_size)); + OP_REQUIRES(ctx, co_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "co.dim_size(0) != batch_size: ", co_tensor->dim_size(0), + " vs. ", batch_size)); + OP_REQUIRES(ctx, co_tensor->dim_size(1) == cell_size, + errors::InvalidArgument( + "co.dim_size(1) != cell_size: ", co_tensor->dim_size(1), + " vs. ", cell_size)); OP_REQUIRES(ctx, cs_grad_tensor->dim_size(0) == batch_size, errors::InvalidArgument( @@ -860,9 +860,9 @@ class BlockLSTMOp : public OpKernel { h_prev_tensor->dim_size(0), " vs. ", batch_size)); OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, - errors::InvalidArgument("h_prev.dims(1) != cell_size: ", - h_prev_tensor->dim_size(1), " vs. ", - cell_size)); + errors::InvalidArgument( + "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1), + " vs. ", cell_size)); const Tensor* w_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); @@ -872,46 +872,46 @@ class BlockLSTMOp : public OpKernel { errors::InvalidArgument( "w.dim_size(0) != input_size + cell_size: ", w_tensor->dim_size(0), " vs. ", input_size + cell_size)); - OP_REQUIRES( - ctx, w_tensor->dim_size(1) == cell_size * 4, - errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ", - w_tensor->dim_size(1), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4, + errors::InvalidArgument( + "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1), + " vs. ", cell_size * 4)); const Tensor* wci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); OP_REQUIRES(ctx, wci_tensor->dims() == 1, errors::InvalidArgument("wci must be 1D")); - OP_REQUIRES( - ctx, wci_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("wci.dim_size(0) != cell_size: ", - wci_tensor->dim_size(0), " vs. 
", cell_size)); + OP_REQUIRES(ctx, wci_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "wci.dim_size(0) != cell_size: ", wci_tensor->dim_size(0), + " vs. ", cell_size)); const Tensor* wcf_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); OP_REQUIRES(ctx, wcf_tensor->dims() == 1, errors::InvalidArgument("wcf must be 1D")); - OP_REQUIRES( - ctx, wcf_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("wcf.dim_size(0) != cell_size: ", - wcf_tensor->dim_size(0), " vs. ", cell_size)); + OP_REQUIRES(ctx, wcf_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "wcf.dim_size(0) != cell_size: ", wcf_tensor->dim_size(0), + " vs. ", cell_size)); const Tensor* wco_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); OP_REQUIRES(ctx, wco_tensor->dims() == 1, errors::InvalidArgument("wco must be 1D")); - OP_REQUIRES( - ctx, wco_tensor->dim_size(0) == cell_size, - errors::InvalidArgument("wco.dim_size(0) != cell_size: ", - wco_tensor->dim_size(0), " vs. ", cell_size)); + OP_REQUIRES(ctx, wco_tensor->dim_size(0) == cell_size, + errors::InvalidArgument( + "wco.dim_size(0) != cell_size: ", wco_tensor->dim_size(0), + " vs. ", cell_size)); const Tensor* b_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); OP_REQUIRES(ctx, b_tensor->dims() == 1, errors::InvalidArgument("b must be 1D")); - OP_REQUIRES( - ctx, b_tensor->dim_size(0) == cell_size * 4, - errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ", - b_tensor->dim_size(0), " vs. ", cell_size * 4)); + OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4, + errors::InvalidArgument( + "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0), + " vs. ", cell_size * 4)); TensorShape batch_cell_shape({timelen, batch_size, cell_size}); Tensor* i_out; @@ -1065,9 +1065,9 @@ class BlockLSTMGradOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); const int64 cell_size = w_tensor->dim_size(1) / 4; OP_REQUIRES(ctx, input_size + cell_size == w_tensor->dim_size(0), - errors::InvalidArgument("w matrix rows don't match: ", - input_size + cell_size, " vs. ", - w_tensor->dim_size(0))); + errors::InvalidArgument( + "w matrix rows don't match: ", input_size + cell_size, + " vs. ", w_tensor->dim_size(0))); const Tensor* wci_tensor = nullptr; OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); @@ -1193,7 +1193,6 @@ class BlockLSTMGradOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), batch_cell_shape, &h_grad_tensor)); - const Device& device = ctx->eigen_device(); functor::TensorZero()(device, cs_grad_tensor.flat()); diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h index bc6b85f3f1a..d23cedc234b 100644 --- a/tensorflow/contrib/rnn/kernels/lstm_ops.h +++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h @@ -92,7 +92,6 @@ struct TensorZeroPadding { } }; - struct LSTMBlockCell { LSTMBlockCell(const int batch_size, const int input_size, const int cell_size) : batch_size_(batch_size), diff --git a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc index 544cd163c50..68184b643e5 100644 --- a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc +++ b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc @@ -149,8 +149,9 @@ TEST_F(LSTMOpsTest, BlockLSTMGrad_ShapeFn) { INFER_ERROR("must be rank 1", op, "?;?;?;?;?;?;?;?;[1,?]" + suffix); // Output with all input knowns makes known rank outputs. 
- INFER_OK(op, JoinedCopies("?", 18), "[?,?,?];" + JoinedCopies("[?,?]", 3) + - ";" + JoinedCopies("[?]", 4)); + INFER_OK( + op, JoinedCopies("?", 18), + "[?,?,?];" + JoinedCopies("[?,?]", 3) + ";" + JoinedCopies("[?]", 4)); // Output with copies input shapes to output. string input = strings::StrCat("?;[?,?,?];", JoinedCopies("[?,?]", 3), ";", diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py index cafeb56ad88..9b84635e85e 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py @@ -42,7 +42,6 @@ from tensorflow.python.platform import test from tensorflow.python.framework import test_util from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell - # pylint: enable=protected-access Linear = core_rnn_cell._Linear # pylint: disable=invalid-name @@ -84,19 +83,22 @@ class RNNCellTest(test.TestCase): ], [v.name for v in cell.trainable_variables]) self.assertFalse(cell.non_trainable_variables) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) self.assertEqual(res[0].shape, (1, 2)) def testBasicRNNCellNotTrainable(self): with self.test_session() as sess: + def not_trainable_getter(getter, *args, **kwargs): kwargs["trainable"] = False return getter(*args, **kwargs) with variable_scope.variable_scope( - "root", initializer=init_ops.constant_initializer(0.5), + "root", + initializer=init_ops.constant_initializer(0.5), custom_getter=not_trainable_getter): x = array_ops.zeros([1, 2]) m = array_ops.zeros([1, 2]) @@ -108,9 +110,10 @@ class RNNCellTest(test.TestCase): "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME ], [v.name for v in cell.non_trainable_variables]) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) self.assertEqual(res[0].shape, (1, 2)) def testGRUCell(self): @@ -121,9 +124,10 @@ class RNNCellTest(test.TestCase): m = array_ops.zeros([1, 2]) g, _ = rnn_cell_impl.GRUCell(2)(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) # Smoke test self.assertAllClose(res[0], [[0.175991, 0.175991]]) with variable_scope.variable_scope( @@ -133,10 +137,10 @@ class RNNCellTest(test.TestCase): m = array_ops.zeros([1, 2]) g, _ = rnn_cell_impl.GRUCell(2)(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], - {x.name: np.array([[1., 1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) # Smoke test self.assertAllClose(res[0], [[0.156736, 0.156736]]) @@ -148,11 +152,26 @@ class RNNCellTest(test.TestCase): m = array_ops.zeros([1, 2]) g, _ = contrib_rnn_cell.SRUCell(2)(x, m) sess.run([variables_lib.global_variables_initializer()]) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) + # Smoke test + self.assertAllClose(res[0], [[0.509682, 0.509682]]) + + def 
testSRUCellWithDiffSize(self): + with self.test_session() as sess: + with variable_scope.variable_scope( + "root", initializer=init_ops.constant_initializer(0.5)): + x = array_ops.zeros([1, 3]) + m = array_ops.zeros([1, 2]) + g, _ = contrib_rnn_cell.SRUCell(2)(x, m) + sess.run([variables_lib.global_variables_initializer()]) res = sess.run( - [g], {x.name: np.array([[1., 1.]]), + [g], {x.name: np.array([[1., 1., 1.]]), m.name: np.array([[0.1, 0.1]])}) # Smoke test - self.assertAllClose(res[0], [[0.509682, 0.509682]]) + self.assertAllClose(res[0], [[0.55255556, 0.55255556]]) def testBasicLSTMCell(self): for dtype in [dtypes.float16, dtypes.float32]: @@ -164,8 +183,7 @@ class RNNCellTest(test.TestCase): m = array_ops.zeros([1, 8], dtype=dtype) cell = rnn_cell_impl.MultiRNNCell( [ - rnn_cell_impl.BasicLSTMCell( - 2, state_is_tuple=False) + rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False) for _ in range(2) ], state_is_tuple=False) @@ -183,22 +201,21 @@ class RNNCellTest(test.TestCase): "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME ] - self.assertEqual( - expected_variable_names, - [v.name for v in cell.trainable_variables]) + self.assertEqual(expected_variable_names, + [v.name for v in cell.trainable_variables]) self.assertFalse(cell.non_trainable_variables) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g, out_m], - {x.name: np.array([[1., 1.]]), - m.name: 0.1 * np.ones([1, 8])}) + res = sess.run([g, out_m], { + x.name: np.array([[1., 1.]]), + m.name: 0.1 * np.ones([1, 8]) + }) self.assertEqual(len(res), 2) variables = variables_lib.global_variables() self.assertEqual(expected_variable_names, [v.name for v in variables]) # The numbers in results were not calculated, this is just a # smoke test. - self.assertAllClose( - res[0], np.array([[0.240, 0.240]], dtype=np_dtype), 1e-2) + self.assertAllClose(res[0], np.array( + [[0.240, 0.240]], dtype=np_dtype), 1e-2) expected_mem = np.array( [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]], dtype=np_dtype) @@ -208,13 +225,13 @@ class RNNCellTest(test.TestCase): # Test BasicLSTMCell with input_size != num_units. 
x = array_ops.zeros([1, 3], dtype=dtype) m = array_ops.zeros([1, 4], dtype=dtype) - g, out_m = rnn_cell_impl.BasicLSTMCell( - 2, state_is_tuple=False)(x, m) + g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run( - [g, out_m], - {x.name: np.array([[1., 1., 1.]], dtype=np_dtype), - m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)}) + [g, out_m], { + x.name: np.array([[1., 1., 1.]], dtype=np_dtype), + m.name: 0.1 * np.ones([1, 4], dtype=np_dtype) + }) self.assertEqual(len(res), 2) def testBasicLSTMCellDimension0Error(self): @@ -232,9 +249,11 @@ class RNNCellTest(test.TestCase): g, out_m = rnn_cell_impl.BasicLSTMCell( num_units, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) - sess.run([g, out_m], - {x.name: 1 * np.ones([batch_size, input_size]), - m.name: 0.1 * np.ones([batch_size - 1, state_size])}) + sess.run( + [g, out_m], { + x.name: 1 * np.ones([batch_size, input_size]), + m.name: 0.1 * np.ones([batch_size - 1, state_size]) + }) def testBasicLSTMCellStateSizeError(self): """Tests that state_size must be num_units * 2.""" @@ -251,9 +270,11 @@ class RNNCellTest(test.TestCase): g, out_m = rnn_cell_impl.BasicLSTMCell( num_units, state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) - sess.run([g, out_m], - {x.name: 1 * np.ones([batch_size, input_size]), - m.name: 0.1 * np.ones([batch_size, state_size])}) + sess.run( + [g, out_m], { + x.name: 1 * np.ones([batch_size, input_size]), + m.name: 0.1 * np.ones([batch_size, state_size]) + }) def testBasicLSTMCellStateTupleType(self): with self.test_session(): @@ -301,11 +322,12 @@ class RNNCellTest(test.TestCase): state_is_tuple=True) g, (out_m0, out_m1) = cell(x, (m0, m1)) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run([g, out_m0, out_m1], { - x.name: np.array([[1., 1.]]), - m0.name: 0.1 * np.ones([1, 4]), - m1.name: 0.1 * np.ones([1, 4]) - }) + res = sess.run( + [g, out_m0, out_m1], { + x.name: np.array([[1., 1.]]), + m0.name: 0.1 * np.ones([1, 4]), + m1.name: 0.1 * np.ones([1, 4]) + }) self.assertEqual(len(res), 3) # The numbers in results were not calculated, this is just a smoke test. # Note, however, these values should match the original @@ -336,10 +358,11 @@ class RNNCellTest(test.TestCase): state_is_tuple=False) output, state = cell(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run([output, state], { - x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]), - m.name: 0.1 * np.ones((batch_size, state_size)) - }) + res = sess.run( + [output, state], { + x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]), + m.name: 0.1 * np.ones((batch_size, state_size)) + }) self.assertEqual(len(res), 2) # The numbers in results were not calculated, this is mostly just a # smoke test. @@ -442,10 +465,10 @@ class RNNCellTest(test.TestCase): rnn_cell_impl.GRUCell(3), num_proj=3) g, new_m = cell(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g, new_m], - {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1, 0.1]])}) + res = sess.run([g, new_m], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1, 0.1]]) + }) self.assertEqual(res[1].shape, (1, 3)) # The numbers in results were not calculated, this is just a smoke test. 
self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]]) @@ -479,9 +502,11 @@ class RNNCellTest(test.TestCase): base_cell = rnn_cell_impl.GRUCell(3) g, m_new = base_cell(x, m) variable_scope.get_variable_scope().reuse_variables() + def residual_with_slice_fn(inp, out): inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3]) return inp_sliced + out + g_res, m_new_res = rnn_cell_impl.ResidualWrapper( base_cell, residual_with_slice_fn)(x, m) sess.run([variables_lib.global_variables_initializer()]) @@ -551,10 +576,10 @@ class RNNCellTest(test.TestCase): self.assertEqual(embedding_cell.output_size, 2) g, new_m = embedding_cell(x, m) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g, new_m], - {x.name: np.array([[1]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g, new_m], { + x.name: np.array([[1]]), + m.name: np.array([[0.1, 0.1]]) + }) self.assertEqual(res[1].shape, (1, 2)) # The numbers in results were not calculated, this is just a smoke test. self.assertAllClose(res[0], [[0.17139, 0.17139]]) @@ -584,8 +609,8 @@ class RNNCellTest(test.TestCase): x = array_ops.zeros([1, 2]) m = array_ops.zeros([1, 4]) _, ml = rnn_cell_impl.MultiRNNCell( - [rnn_cell_impl.GRUCell(2) - for _ in range(2)], state_is_tuple=False)(x, m) + [rnn_cell_impl.GRUCell(2) for _ in range(2)], + state_is_tuple=False)(x, m) sess.run([variables_lib.global_variables_initializer()]) res = sess.run(ml, { x.name: np.array([[1., 1.]]), @@ -605,19 +630,20 @@ class RNNCellTest(test.TestCase): # Test incorrectness of state with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"): rnn_cell_impl.MultiRNNCell( - [rnn_cell_impl.GRUCell(2) - for _ in range(2)], state_is_tuple=True)(x, m_bad) + [rnn_cell_impl.GRUCell(2) for _ in range(2)], + state_is_tuple=True)(x, m_bad) _, ml = rnn_cell_impl.MultiRNNCell( - [rnn_cell_impl.GRUCell(2) - for _ in range(2)], state_is_tuple=True)(x, m_good) + [rnn_cell_impl.GRUCell(2) for _ in range(2)], + state_is_tuple=True)(x, m_good) sess.run([variables_lib.global_variables_initializer()]) - res = sess.run(ml, { - x.name: np.array([[1., 1.]]), - m_good[0].name: np.array([[0.1, 0.1]]), - m_good[1].name: np.array([[0.1, 0.1]]) - }) + res = sess.run( + ml, { + x.name: np.array([[1., 1.]]), + m_good[0].name: np.array([[0.1, 0.1]]), + m_good[1].name: np.array([[0.1, 0.1]]) + }) # The numbers in results were not calculated, this is just a # smoke test. However, these numbers should match those of @@ -628,8 +654,11 @@ class RNNCellTest(test.TestCase): class DropoutWrapperTest(test.TestCase): - def _testDropoutWrapper(self, batch_size=None, time_steps=None, - parallel_iterations=None, **kwargs): + def _testDropoutWrapper(self, + batch_size=None, + time_steps=None, + parallel_iterations=None, + **kwargs): with self.test_session() as sess: with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): @@ -640,14 +669,14 @@ class DropoutWrapperTest(test.TestCase): x = constant_op.constant( [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32) m = rnn_cell_impl.LSTMStateTuple( - *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32) - ] * 2) + *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32 + )] * 2) else: x = constant_op.constant( np.random.randn(time_steps, batch_size, 3).astype(np.float32)) m = rnn_cell_impl.LSTMStateTuple(*[ - constant_op.constant( - [[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32) + constant_op. 
+ constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32) ] * 2) outputs, final_state = rnn.dynamic_rnn( cell=rnn_cell_impl.DropoutWrapper( @@ -674,8 +703,8 @@ class DropoutWrapperTest(test.TestCase): res = self._testDropoutWrapper( input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) true_full_final_c = np.array( [[1.949385, 1.949385, 1.949385]], dtype=np.float32) self.assertAllClose(true_full_output, res[0]) @@ -687,8 +716,8 @@ class DropoutWrapperTest(test.TestCase): res = self._testDropoutWrapper( input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) true_full_final_c = np.array( [[1.949385, 1.949385, 1.949385]], dtype=np.float32) self.assertAllClose(true_full_output, res[0]) @@ -703,16 +732,20 @@ class DropoutWrapperTest(test.TestCase): ## consistent across both calls. Otherwise the seed may not end ## up being munged consistently across both graphs. res_standard_1 = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, seed=10, + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + seed=10, parallel_iterations=1) # Clear away the graph and the test session (which keeps variables around) ops.reset_default_graph() self._ClearCachedSession() random_seed.set_random_seed(2) res_standard_2 = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, seed=10, + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + seed=10, parallel_iterations=1) self.assertAllClose(res_standard_1[0], res_standard_2[0]) self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c) @@ -722,11 +755,12 @@ class DropoutWrapperTest(test.TestCase): keep_all = variable_scope.get_variable("all", initializer=1.0) keep_none = variable_scope.get_variable("none", initializer=1e-10) res = self._testDropoutWrapper( - input_keep_prob=keep_all, output_keep_prob=keep_none, + input_keep_prob=keep_all, + output_keep_prob=keep_none, state_keep_prob=keep_all) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) true_full_final_c = np.array( [[1.949385, 1.949385, 1.949385]], dtype=np.float32) self.assertAllClose(np.zeros(res[0].shape), res[0]) @@ -739,13 +773,13 @@ class DropoutWrapperTest(test.TestCase): # Even though we dropout state, by default DropoutWrapper never # drops out the memory ("c") term of an LSTMStateTuple. 
res = self._testDropoutWrapper( - input_keep_prob=keep_all, output_keep_prob=keep_all, + input_keep_prob=keep_all, + output_keep_prob=keep_all, state_keep_prob=keep_none) - true_c_state = np.array( - [[1.713925, 1.713925, 1.713925]], dtype=np.float32) + true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) self.assertAllClose(true_full_output[0], res[0][0]) # Second output is modified by zero input state self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4) @@ -758,13 +792,14 @@ class DropoutWrapperTest(test.TestCase): keep_all = variable_scope.get_variable("all", initializer=1.0) keep_none = variable_scope.get_variable("none", initializer=1e-10) true_full_output = np.array( - [[[0.751109, 0.751109, 0.751109]], - [[0.895509, 0.895509, 0.895509]]], dtype=np.float32) + [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]], + dtype=np.float32) true_full_final_c = np.array( [[1.949385, 1.949385, 1.949385]], dtype=np.float32) # All outputs are different because inputs are zeroed out res = self._testDropoutWrapper( - input_keep_prob=keep_none, output_keep_prob=keep_all, + input_keep_prob=keep_none, + output_keep_prob=keep_all, state_keep_prob=keep_all) self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4) self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4) @@ -774,9 +809,13 @@ class DropoutWrapperTest(test.TestCase): keep_some = 0.8 keep_all = variable_scope.get_variable("all", initializer=1.0) res = self._testDropoutWrapper( - input_keep_prob=keep_all, output_keep_prob=keep_some, - state_keep_prob=keep_all, variational_recurrent=True, - input_size=3, batch_size=5, time_steps=7) + input_keep_prob=keep_all, + output_keep_prob=keep_some, + state_keep_prob=keep_all, + variational_recurrent=True, + input_size=3, + batch_size=5, + time_steps=7) # Ensure the same dropout pattern for all time steps output_mask = np.abs(res[0]) > 1e-6 for m in output_mask[1:]: @@ -785,9 +824,13 @@ class DropoutWrapperTest(test.TestCase): def testDropoutWrapperRecurrentStateInputAndOutput(self): keep_some = 0.9 res = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, variational_recurrent=True, - input_size=3, batch_size=5, time_steps=7) + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + variational_recurrent=True, + input_size=3, + batch_size=5, + time_steps=7) # Smoke test for the state/input masks. 
output_mask = np.abs(res[0]) > 1e-6 @@ -811,17 +854,27 @@ class DropoutWrapperTest(test.TestCase): random_seed.set_random_seed(2347) np.random.seed(23487) res0 = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, variational_recurrent=True, - input_size=3, batch_size=5, time_steps=7, seed=-234987) + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + variational_recurrent=True, + input_size=3, + batch_size=5, + time_steps=7, + seed=-234987) ops.reset_default_graph() self._ClearCachedSession() random_seed.set_random_seed(2347) np.random.seed(23487) res1 = self._testDropoutWrapper( - input_keep_prob=keep_some, output_keep_prob=keep_some, - state_keep_prob=keep_some, variational_recurrent=True, - input_size=3, batch_size=5, time_steps=7, seed=-234987) + input_keep_prob=keep_some, + output_keep_prob=keep_some, + state_keep_prob=keep_some, + variational_recurrent=True, + input_size=3, + batch_size=5, + time_steps=7, + seed=-234987) output_mask = np.abs(res0[0]) > 1e-6 for time_step in output_mask: @@ -858,9 +911,10 @@ class SlimRNNCellTest(test.TestCase): g, _ = rnn_cell_impl._SlimRNNCell(my_cell)(x, m) # pylint: enable=protected-access sess.run([variables_lib.global_variables_initializer()]) - res = sess.run( - [g], {x.name: np.array([[1., 1.]]), - m.name: np.array([[0.1, 0.1]])}) + res = sess.run([g], { + x.name: np.array([[1., 1.]]), + m.name: np.array([[0.1, 0.1]]) + }) self.assertEqual(res[0].shape, (1, 2)) def testBasicRNNCellMatch(self): diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py index 0258d7202df..57521c6a9ba 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py @@ -45,6 +45,7 @@ from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging from tensorflow.python.util import nest + class Plus1RNNCell(rnn_lib.RNNCell): """RNN Cell generating (output, new_state) = (input + 1, state + 1).""" @@ -160,8 +161,7 @@ class RNNTest(test.TestCase): input_size = 5 max_length = 8 # unrolled up to this length inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32) self.assertEqual(len(outputs), len(inputs)) @@ -178,10 +178,9 @@ class RNNTest(test.TestCase): self.assertAllClose(v, input_value + 1.0) # Final state - self.assertAllClose( - values[-1], - max_length * np.ones( - (batch_size, input_size), dtype=np.float32)) + self.assertAllClose(values[-1], + max_length * np.ones( + (batch_size, input_size), dtype=np.float32)) def testDropout(self): cell = Plus1RNNCell() @@ -191,8 +190,7 @@ class RNNTest(test.TestCase): input_size = 5 max_length = 8 inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] with variable_scope.variable_scope("share_scope"): outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32) @@ -207,8 +205,10 @@ class RNNTest(test.TestCase): with self.test_session(use_gpu=True) as sess: input_value = np.random.randn(batch_size, input_size) values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value}) - full_dropout_values = sess.run(dropped_outputs, - 
feed_dict={inputs[0]: input_value}) + full_dropout_values = sess.run( + dropped_outputs, feed_dict={ + inputs[0]: input_value + }) for v in values[:-1]: self.assertAllClose(v, input_value + 1.0) @@ -222,8 +222,7 @@ class RNNTest(test.TestCase): input_size = 5 max_length = 8 inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] with variable_scope.variable_scope("drop_scope"): dynamic_outputs, dynamic_state = rnn.static_rnn( @@ -234,12 +233,16 @@ class RNNTest(test.TestCase): input_value = np.random.randn(batch_size, input_size) dynamic_values = sess.run( dynamic_outputs, - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) dynamic_state_value = sess.run( [dynamic_state], - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) # outputs are fully calculated for t = 0, 1 for v in dynamic_values[:2]: @@ -289,8 +292,7 @@ class RNNTest(test.TestCase): input_size = 5 max_length = 8 # unrolled up to this length inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] return rnn.static_rnn(cell, inputs, dtype=dtypes.float32, scope=scope) @@ -316,8 +318,7 @@ class LSTMTest(test.TestCase): cell = rnn_cell.LSTMCell( num_units, initializer=initializer, state_is_tuple=False) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32) self.assertEqual(len(outputs), len(inputs)) @@ -343,8 +344,7 @@ class LSTMTest(test.TestCase): initializer=initializer, state_is_tuple=False) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32) self.assertEqual(len(outputs), len(inputs)) @@ -374,8 +374,7 @@ class LSTMTest(test.TestCase): initializer=initializer, state_is_tuple=False) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] with variable_scope.variable_scope("share_scope"): outputs, state = rnn.static_state_saving_rnn( @@ -388,7 +387,9 @@ class LSTMTest(test.TestCase): input_value = np.random.randn(batch_size, input_size) (last_state_value, saved_state_value) = sess.run( [state, state_saver.saved_state["save_lstm"]], - feed_dict={inputs[0]: input_value}) + feed_dict={ + inputs[0]: input_value + }) self.assertAllEqual(last_state_value, saved_state_value) def testNoProjNoShardingTupleStateSaver(self): @@ -406,8 +407,7 @@ class LSTMTest(test.TestCase): initializer=initializer, state_is_tuple=True) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] with variable_scope.variable_scope("share_scope"): outputs, state = rnn.static_state_saving_rnn( @@ -420,7 +420,9 @@ class LSTMTest(test.TestCase): input_value = np.random.randn(batch_size, input_size) last_and_saved_states = sess.run( state + (state_saver.saved_state["c"], 
state_saver.saved_state["m"]), - feed_dict={inputs[0]: input_value}) + feed_dict={ + inputs[0]: input_value + }) self.assertEqual(4, len(last_and_saved_states)) self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:]) @@ -432,16 +434,17 @@ class LSTMTest(test.TestCase): with self.test_session(graph=ops_lib.Graph()) as sess: initializer = init_ops.random_uniform_initializer( -0.01, 0.01, seed=self._seed) - state_saver = TestStateSaver(batch_size, { - "c0": num_units, - "m0": num_units, - "c1": num_units + 1, - "m1": num_units + 1, - "c2": num_units + 2, - "m2": num_units + 2, - "c3": num_units + 3, - "m3": num_units + 3 - }) + state_saver = TestStateSaver( + batch_size, { + "c0": num_units, + "m0": num_units, + "c1": num_units + 1, + "m1": num_units + 1, + "c2": num_units + 2, + "m2": num_units + 2, + "c3": num_units + 3, + "m3": num_units + 3 + }) def _cell(i): return rnn_cell.LSTMCell( @@ -459,8 +462,7 @@ class LSTMTest(test.TestCase): self.assertEqual(len(cell.state_size[i]), 2) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] state_names = (("c0", "m0"), ("c1", "m1"), ("c2", "m2"), ("c3", "m3")) @@ -475,10 +477,15 @@ class LSTMTest(test.TestCase): variables_lib.global_variables_initializer().run() input_value = np.random.randn(batch_size, input_size) - last_states = sess.run(list(nest.flatten(state)), - feed_dict={inputs[0]: input_value}) - saved_states = sess.run(list(state_saver.saved_state.values()), - feed_dict={inputs[0]: input_value}) + last_states = sess.run( + list(nest.flatten(state)), feed_dict={ + inputs[0]: input_value + }) + saved_states = sess.run( + list(state_saver.saved_state.values()), + feed_dict={ + inputs[0]: input_value + }) self.assertEqual(8, len(last_states)) self.assertEqual(8, len(saved_states)) flat_state_names = nest.flatten(state_names) @@ -499,8 +506,7 @@ class LSTMTest(test.TestCase): initializer = init_ops.random_uniform_initializer( -0.01, 0.01, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( num_units, @@ -526,8 +532,7 @@ class LSTMTest(test.TestCase): initializer = init_ops.random_uniform_initializer( -0.01, 0.01, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell_notuple = rnn_cell.LSTMCell( num_units, @@ -569,14 +574,20 @@ class LSTMTest(test.TestCase): variables_lib.global_variables_initializer().run() input_value = np.random.randn(batch_size, input_size) - outputs_notuple_v = sess.run(outputs_notuple, - feed_dict={inputs[0]: input_value}) - outputs_tuple_v = sess.run(outputs_tuple, - feed_dict={inputs[0]: input_value}) + outputs_notuple_v = sess.run( + outputs_notuple, feed_dict={ + inputs[0]: input_value + }) + outputs_tuple_v = sess.run( + outputs_tuple, feed_dict={ + inputs[0]: input_value + }) self.assertAllEqual(outputs_notuple_v, outputs_tuple_v) - (state_notuple_v,) = sess.run((state_notuple,), - feed_dict={inputs[0]: input_value}) + (state_notuple_v,) = sess.run( + (state_notuple,), feed_dict={ + inputs[0]: input_value + }) state_tuple_v = sess.run(state_tuple, feed_dict={inputs[0]: input_value}) self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v)) @@ -593,8 +604,7 @@ class LSTMTest(test.TestCase): -0.01, 
0.01, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( @@ -625,8 +635,7 @@ class LSTMTest(test.TestCase): with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess: initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float64, shape=(None, input_size)) + array_ops.placeholder(dtypes.float64, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( @@ -661,8 +670,7 @@ class LSTMTest(test.TestCase): max_length = 8 with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess: inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] initializer = init_ops.constant_initializer(0.001) @@ -721,8 +729,7 @@ class LSTMTest(test.TestCase): initializer = init_ops.random_uniform_initializer( -0.01, 0.01, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float64, shape=(None, input_size)) + array_ops.placeholder(dtypes.float64, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( @@ -743,16 +750,21 @@ class LSTMTest(test.TestCase): self.assertEqual(len(outputs), len(inputs)) - variables_lib.global_variables_initializer().run( - feed_dict={sequence_length: [2, 3]}) + variables_lib.global_variables_initializer().run(feed_dict={ + sequence_length: [2, 3] + }) input_value = np.asarray( np.random.randn(batch_size, input_size), dtype=np.float64) values = sess.run( - outputs, feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + outputs, feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) state_value = sess.run( - [state], feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + [state], feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) self.assertEqual(values[0].dtype, input_value.dtype) self.assertEqual(state_value[0].dtype, input_value.dtype) @@ -767,8 +779,7 @@ class LSTMTest(test.TestCase): initializer_d = init_ops.random_uniform_initializer( -1, 1, seed=self._seed + 1) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( num_units, @@ -792,8 +803,10 @@ class LSTMTest(test.TestCase): variables_lib.global_variables_initializer().run() input_value = np.random.randn(batch_size, input_size) - output_values = sess.run(outputs0 + outputs1 + outputs2, - feed_dict={inputs[0]: input_value}) + output_values = sess.run( + outputs0 + outputs1 + outputs2, feed_dict={ + inputs[0]: input_value + }) outputs0_values = output_values[:max_length] outputs1_values = output_values[max_length:2 * max_length] outputs2_values = output_values[2 * max_length:] @@ -814,8 +827,7 @@ class LSTMTest(test.TestCase): with self.test_session(graph=ops_lib.Graph()) as sess: initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] cell = rnn_cell.LSTMCell( num_units, @@ -833,8 +845,10 @@ class LSTMTest(test.TestCase): variables_lib.global_variables_initializer().run() input_value = np.random.randn(batch_size, input_size) - output_values = sess.run(outputs0 + outputs1, - feed_dict={inputs[0]: input_value}) + 
output_values = sess.run( + outputs0 + outputs1, feed_dict={ + inputs[0]: input_value + }) outputs0_values = output_values[:max_length] outputs1_values = output_values[max_length:] self.assertEqual(len(outputs0_values), len(outputs1_values)) @@ -861,8 +875,7 @@ class LSTMTest(test.TestCase): -0.01, 0.01, seed=self._seed) if in_graph_mode: inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] else: inputs = max_length * [ @@ -939,8 +952,7 @@ class LSTMTest(test.TestCase): -0.01, 0.01, seed=self._seed) if in_graph_mode: inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None, input_size)) + array_ops.placeholder(dtypes.float32, shape=(None, input_size)) ] else: inputs = max_length * [ @@ -1100,8 +1112,8 @@ class LSTMTest(test.TestCase): # Test gradients to inputs and variables w.r.t. outputs & final state static_grad_values = sess.run(static_gradients, feed_dict=feeds) - static_individual_grad_values = sess.run(static_individual_gradients, - feed_dict=feeds) + static_individual_grad_values = sess.run( + static_individual_gradients, feed_dict=feeds) static_individual_var_grad_values = sess.run( static_individual_variable_gradients, feed_dict=feeds) @@ -1148,8 +1160,10 @@ class LSTMTest(test.TestCase): # Generate gradients of several individual outputs w.r.t. inputs dynamic_individual_gradients = nest.flatten([ gradients_impl.gradients(y, [concat_inputs]) - for y in - [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic] + for y in [ + split_outputs_dynamic[0], split_outputs_dynamic[-1], + state_dynamic + ] ]) # Generate gradients of individual variables w.r.t. inputs @@ -1159,8 +1173,10 @@ class LSTMTest(test.TestCase): "Count of trainable variables: %d" % len(trainable_variables)) dynamic_individual_variable_gradients = nest.flatten([ gradients_impl.gradients(y, trainable_variables) - for y in - [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic] + for y in [ + split_outputs_dynamic[0], split_outputs_dynamic[-1], + state_dynamic + ] ]) # Test forward pass @@ -1170,8 +1186,8 @@ class LSTMTest(test.TestCase): # Test gradients to inputs and variables w.r.t. 
outputs & final state dynamic_grad_values = sess.run(dynamic_gradients, feed_dict=feeds) - dynamic_individual_grad_values = sess.run(dynamic_individual_gradients, - feed_dict=feeds) + dynamic_individual_grad_values = sess.run( + dynamic_individual_gradients, feed_dict=feeds) dynamic_individual_var_grad_values = sess.run( dynamic_individual_variable_gradients, feed_dict=feeds) @@ -1207,8 +1223,8 @@ class LSTMTest(test.TestCase): for i, (a, b) in enumerate( zip(static_individual_var_grad_values, dynamic_individual_var_grad_values)): - tf_logging.info("Comparing individual variable gradients iteration %d" % - i) + tf_logging.info( + "Comparing individual variable gradients iteration %d" % i) self.assertAllEqual(a, b) @test_util.run_in_graph_and_eager_modes() @@ -1223,10 +1239,7 @@ class BidirectionalRNNTest(test.TestCase): self._seed = 23489 np.random.seed(self._seed) - def _createBidirectionalRNN(self, - use_shape, - use_sequence_length, - scope=None): + def _createBidirectionalRNN(self, use_shape, use_sequence_length, scope=None): num_units = 3 input_size = 5 batch_size = 2 @@ -1270,8 +1283,10 @@ class BidirectionalRNNTest(test.TestCase): # Run with pre-specified sequence length of 2, 3 out, s_fw, s_bw = sess.run( [outputs, state_fw, state_bw], - feed_dict={inputs[0]: input_value, - sequence_length: [2, 3]}) + feed_dict={ + inputs[0]: input_value, + sequence_length: [2, 3] + }) # Since the forward and backward LSTM cells were initialized with the # same parameters, the forward and backward output has to be the same, @@ -1312,8 +1327,10 @@ class BidirectionalRNNTest(test.TestCase): input_value, inputs, outputs, state_fw, state_bw, _ = ( self._createBidirectionalRNN(use_shape, False)) variables_lib.global_variables_initializer().run() - out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw], - feed_dict={inputs[0]: input_value}) + out, s_fw, s_bw = sess.run( + [outputs, state_fw, state_bw], feed_dict={ + inputs[0]: input_value + }) # Since the forward and backward LSTM cells were initialized with the # same parameters, the forward and backward output has to be the same, @@ -1396,13 +1413,11 @@ class BidirectionalRNNTest(test.TestCase): use_time_major, use_sequence_length): with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess: input_value, inputs, outputs, state_fw, state_bw, sequence_length = ( - self._createBidirectionalDynamicRNN(use_shape, - use_state_tuple, use_time_major, - use_sequence_length)) + self._createBidirectionalDynamicRNN( + use_shape, use_state_tuple, use_time_major, use_sequence_length)) variables_lib.global_variables_initializer().run() # Run with pre-specified sequence length of 2, 3 - feed_dict = ( - {sequence_length: [2, 3]} if use_sequence_length else {}) + feed_dict = ({sequence_length: [2, 3]} if use_sequence_length else {}) feed_dict.update({inputs[0]: input_value}) if use_state_tuple: out, c_fw, m_fw, c_bw, m_bw = sess.run( @@ -1538,8 +1553,7 @@ class MultiDimensionalLSTMTest(test.TestCase): sequence_length = [4, 6] with self.test_session(graph=ops_lib.Graph()) as sess: inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(None,) + input_size) + array_ops.placeholder(dtypes.float32, shape=(None,) + input_size) ] inputs_using_dim = max_length * [ array_ops.placeholder( @@ -1585,14 +1599,22 @@ class MultiDimensionalLSTMTest(test.TestCase): input_total_size = (batch_size,) + input_size input_value = np.random.randn(*input_total_size) - outputs_static_v = sess.run(outputs_static, - feed_dict={inputs[0]: input_value}) - 
outputs_dynamic_v = sess.run(outputs_dynamic, - feed_dict={inputs[0]: input_value}) - outputs_bid_v = sess.run(outputs_bid, - feed_dict={inputs_using_dim[0]: input_value}) - outputs_sav_v = sess.run(outputs_sav, - feed_dict={inputs_using_dim[0]: input_value}) + outputs_static_v = sess.run( + outputs_static, feed_dict={ + inputs[0]: input_value + }) + outputs_dynamic_v = sess.run( + outputs_dynamic, feed_dict={ + inputs[0]: input_value + }) + outputs_bid_v = sess.run( + outputs_bid, feed_dict={ + inputs_using_dim[0]: input_value + }) + outputs_sav_v = sess.run( + outputs_sav, feed_dict={ + inputs_using_dim[0]: input_value + }) self.assertAllEqual(outputs_static_v, outputs_dynamic_v) self.assertAllEqual(outputs_static_v, outputs_sav_v) @@ -1602,16 +1624,26 @@ class MultiDimensionalLSTMTest(test.TestCase): outputs_bid_array = np.array(outputs_bid_v) self.assertAllEqual(outputs_static_array_double, outputs_bid_array) - state_static_v = sess.run(state_static, - feed_dict={inputs[0]: input_value}) - state_dynamic_v = sess.run(state_dynamic, - feed_dict={inputs[0]: input_value}) - state_bid_fw_v = sess.run(state_fw, - feed_dict={inputs_using_dim[0]: input_value}) - state_bid_bw_v = sess.run(state_bw, - feed_dict={inputs_using_dim[0]: input_value}) - state_sav_v = sess.run(state_sav, - feed_dict={inputs_using_dim[0]: input_value}) + state_static_v = sess.run( + state_static, feed_dict={ + inputs[0]: input_value + }) + state_dynamic_v = sess.run( + state_dynamic, feed_dict={ + inputs[0]: input_value + }) + state_bid_fw_v = sess.run( + state_fw, feed_dict={ + inputs_using_dim[0]: input_value + }) + state_bid_bw_v = sess.run( + state_bw, feed_dict={ + inputs_using_dim[0]: input_value + }) + state_sav_v = sess.run( + state_sav, feed_dict={ + inputs_using_dim[0]: input_value + }) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v)) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v)) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v)) @@ -1633,16 +1665,17 @@ class NestedLSTMTest(test.TestCase): with self.test_session(graph=ops_lib.Graph()) as sess: state_saver = TestStateSaver(batch_size, state_size) single_input = (array_ops.placeholder( - dtypes.float32, shape=(None, input_size)), array_ops.placeholder( - dtypes.float32, shape=(None, input_size))) + dtypes.float32, shape=(None, input_size)), + array_ops.placeholder( + dtypes.float32, shape=(None, input_size))) inputs = max_length * [single_input] inputs_c = (array_ops.stack([input_[0] for input_ in inputs]), array_ops.stack([input_[1] for input_ in inputs])) - single_input_using_dim = ( - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)), - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size))) + single_input_using_dim = (array_ops.placeholder( + dtypes.float32, shape=(batch_size, input_size)), + array_ops.placeholder( + dtypes.float32, + shape=(batch_size, input_size))) inputs_using_dim = max_length * [single_input_using_dim] # Create a cell for the whole test. 
This is fine because the cell has no @@ -1688,14 +1721,22 @@ class NestedLSTMTest(test.TestCase): input_total_size = (batch_size, input_size) input_value = (np.random.randn(*input_total_size), np.random.randn(*input_total_size)) - outputs_dynamic_v = sess.run(outputs_dynamic, - feed_dict={single_input: input_value}) - outputs_static_v = sess.run(outputs_static, - feed_dict={single_input: input_value}) - outputs_sav_v = sess.run(outputs_sav, - feed_dict={single_input_using_dim: input_value}) - outputs_bid_v = sess.run(outputs_bid, - feed_dict={single_input_using_dim: input_value}) + outputs_dynamic_v = sess.run( + outputs_dynamic, feed_dict={ + single_input: input_value + }) + outputs_static_v = sess.run( + outputs_static, feed_dict={ + single_input: input_value + }) + outputs_sav_v = sess.run( + outputs_sav, feed_dict={ + single_input_using_dim: input_value + }) + outputs_bid_v = sess.run( + outputs_bid, feed_dict={ + single_input_using_dim: input_value + }) self.assertAllEqual(outputs_static_v, np.transpose(outputs_dynamic_v, (1, 0, 2, 3))) @@ -1706,16 +1747,26 @@ class NestedLSTMTest(test.TestCase): outputs_bid_array = np.array(outputs_bid_v) self.assertAllEqual(outputs_static_array_double, outputs_bid_array) - state_dynamic_v = sess.run(state_dynamic, - feed_dict={single_input: input_value}) - state_static_v = sess.run(state_static, - feed_dict={single_input: input_value}) - state_bid_fw_v = sess.run(state_fw, - feed_dict={single_input_using_dim: input_value}) - state_bid_bw_v = sess.run(state_bw, - feed_dict={single_input_using_dim: input_value}) - state_sav_v = sess.run(state_sav, - feed_dict={single_input_using_dim: input_value}) + state_dynamic_v = sess.run( + state_dynamic, feed_dict={ + single_input: input_value + }) + state_static_v = sess.run( + state_static, feed_dict={ + single_input: input_value + }) + state_bid_fw_v = sess.run( + state_fw, feed_dict={ + single_input_using_dim: input_value + }) + state_bid_bw_v = sess.run( + state_bw, feed_dict={ + single_input_using_dim: input_value + }) + state_sav_v = sess.run( + state_sav, feed_dict={ + single_input_using_dim: input_value + }) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v)) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v)) self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v)) @@ -1764,8 +1815,7 @@ class StateSaverRNNTest(test.TestCase): initializer=initializer, state_is_tuple=False) inputs = max_length * [ - array_ops.placeholder( - dtypes.float32, shape=(batch_size, input_size)) + array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size)) ] return rnn.static_state_saving_rnn( cell, @@ -1931,8 +1981,10 @@ class RawRNNTest(test.TestCase): (outputs_val, outputs_dynamic_rnn_val, final_state_val, final_state_dynamic_rnn_val) = sess.run( [outputs, outputs_dynamic_rnn, final_state, final_state_dynamic_rnn], - feed_dict={inputs: rand_input, - sequence_length: rand_seq_len}) + feed_dict={ + inputs: rand_input, + sequence_length: rand_seq_len + }) self.assertAllClose(outputs_dynamic_rnn_val, outputs_val) self.assertAllClose(final_state_dynamic_rnn_val, final_state_val) @@ -1945,12 +1997,16 @@ class RawRNNTest(test.TestCase): self.assertEqual(len(gradients), len(gradients_dynamic_rnn)) gradients_val = sess.run( gradients, - feed_dict={inputs: rand_input, - sequence_length: rand_seq_len}) + feed_dict={ + inputs: rand_input, + sequence_length: rand_seq_len + }) gradients_dynamic_rnn_val = sess.run( gradients_dynamic_rnn, - feed_dict={inputs: rand_input, - 
sequence_length: rand_seq_len}) + feed_dict={ + inputs: rand_input, + sequence_length: rand_seq_len + }) self.assertEqual(len(gradients_val), len(gradients_dynamic_rnn_val)) input_gradients_val = gradients_val[0] input_gradients_dynamic_rnn_val = gradients_dynamic_rnn_val[0] @@ -2067,14 +2123,13 @@ class RawRNNTest(test.TestCase): def loop_fn(time_, cell_output, cell_state, _): if cell_output is None: - emit_output = (array_ops.zeros( - [2, 3], dtype=dtypes.int32), array_ops.zeros( - [unknown_dim], dtype=dtypes.int64)) + emit_output = (array_ops.zeros([2, 3], dtype=dtypes.int32), + array_ops.zeros([unknown_dim], dtype=dtypes.int64)) next_state = cell.zero_state(batch_size, dtypes.float32) else: - emit_output = (array_ops.ones( - [batch_size, 2, 3], dtype=dtypes.int32), array_ops.ones( - [batch_size, unknown_dim], dtype=dtypes.int64)) + emit_output = (array_ops.ones([batch_size, 2, 3], dtype=dtypes.int32), + array_ops.ones( + [batch_size, unknown_dim], dtype=dtypes.int64)) next_state = cell_state elements_finished = array_ops.tile([time_ >= max_time], [batch_size]) finished = math_ops.reduce_all(elements_finished) @@ -2193,8 +2248,8 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): cell = rnn_cell.LSTMCell(num_units, use_peepholes=True) gpu_cell = DeviceWrapperCell(cell, cell_device) - inputs = np.random.randn(batch_size, time_steps, - input_size).astype(np.float32) + inputs = np.random.randn(batch_size, time_steps, input_size).astype( + np.float32) sequence_length = np.random.randint(0, time_steps, size=batch_size) if input_device is not None: @@ -2262,8 +2317,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): gpu_dev = test.gpu_device_name() run_metadata = self._execute_rnn_on( - rnn_device="/cpu:0", cell_device="/cpu:0", - input_device=gpu_dev) + rnn_device="/cpu:0", cell_device="/cpu:0", input_device=gpu_dev) cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata) def _assert_in(op_str, in_stats, out_stats): @@ -2278,8 +2332,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase): return # Test requires access to a GPU gpu_dev = test.gpu_device_name() - run_metadata = self._execute_rnn_on( - input_device=gpu_dev) + run_metadata = self._execute_rnn_on(input_device=gpu_dev) cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata) def _assert_in(op_str, in_stats, out_stats): diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py index 8a3894ef9d7..7b883ebc5d7 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py @@ -1545,97 +1545,6 @@ class BenchmarkLSTMCellXLA(test.Benchmark): ])) -class WeightNormLSTMCellTest(test.TestCase): - """Compared cell output with pre-calculated values.""" - - def _cell_output(self, cell): - """Calculate cell output""" - - with self.test_session() as sess: - init = init_ops.constant_initializer(0.5) - with variable_scope.variable_scope("root", initializer=init): - x = array_ops.zeros([1, 2]) - c0 = array_ops.zeros([1, 2]) - h0 = array_ops.zeros([1, 2]) - - state0 = rnn_cell.LSTMStateTuple(c0, h0) - - xout, sout = cell()(x, state0) - - sess.run([variables.global_variables_initializer()]) - res = sess.run( - [xout, sout], { - x.name: np.array([[1., 1.]]), - c0.name: 0.1 * np.asarray([[0, 1]]), - h0.name: 0.1 * np.asarray([[2, 3]]), - }) - - actual_state_c = res[1].c - actual_state_h = res[1].h - - return actual_state_c, actual_state_h - - def testBasicCell(self): - 
"""Tests cell w/o peepholes and w/o normalisation""" - - def cell(): - return contrib_rnn_cell.WeightNormLSTMCell( - 2, norm=False, use_peepholes=False) - - actual_c, actual_h = self._cell_output(cell) - - expected_c = np.array([[0.65937078, 0.74983585]]) - expected_h = np.array([[0.44923624, 0.49362513]]) - - self.assertAllClose(expected_c, actual_c, 1e-5) - self.assertAllClose(expected_h, actual_h, 1e-5) - - def testNonbasicCell(self): - """Tests cell with peepholes and w/o normalisation""" - - def cell(): - return contrib_rnn_cell.WeightNormLSTMCell( - 2, norm=False, use_peepholes=True) - - actual_c, actual_h = self._cell_output(cell) - - expected_c = np.array([[0.65937084, 0.7574988]]) - expected_h = np.array([[0.4792085, 0.53470564]]) - - self.assertAllClose(expected_c, actual_c, 1e-5) - self.assertAllClose(expected_h, actual_h, 1e-5) - - def testBasicCellWithNorm(self): - """Tests cell w/o peepholes and with normalisation""" - - def cell(): - return contrib_rnn_cell.WeightNormLSTMCell( - 2, norm=True, use_peepholes=False) - - actual_c, actual_h = self._cell_output(cell) - - expected_c = np.array([[0.50125383, 0.58805949]]) - expected_h = np.array([[0.32770363, 0.37397948]]) - - self.assertAllClose(expected_c, actual_c, 1e-5) - self.assertAllClose(expected_h, actual_h, 1e-5) - - def testNonBasicCellWithNorm(self): - """Tests cell with peepholes and with normalisation""" - - def cell(): - return contrib_rnn_cell.WeightNormLSTMCell( - 2, norm=True, use_peepholes=True) - - actual_c, actual_h = self._cell_output(cell) - - expected_c = np.array([[0.50125383, 0.59587258]]) - expected_h = np.array([[0.35041603, 0.40873795]]) - - self.assertAllClose(expected_c, actual_c, 1e-5) - self.assertAllClose(expected_h, actual_h, 1e-5) - - class WeightNormLSTMCellTest(test.TestCase): """Compared cell output with pre-calculated values.""" diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index 8adf5dce6ec..5fee2e93e4e 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -2729,25 +2729,9 @@ class SRUCell(rnn_cell_impl._LayerRNNCell): input_depth = inputs_shape[1].value - # Here the contributor believes that the following constraints - # are implied. The reasoning is explained here with reference to - # the paper https://arxiv.org/pdf/1709.02755.pdf upon which this - # implementation is based. - # In section 2.1 Equation 5, specifically: - # h_t = r_t \odot g(c_t) + (1 - r_t) \odot x_t - # the pointwise operation between r_t and x_t means they have - # the same shape (since we are implementing an RNN cell, braodcasting - # does not happen to input of a single timestep); by the same - # reasons, x_t has the same shape as h_t, essentially mandating that - # input_depth = unit_num. 
- if input_depth != self._num_units: - raise ValueError("SRU requires input_depth == num_units, got " - "input_depth = %s, num_units = %s" % (input_depth, - self._num_units)) - self._kernel = self.add_variable( rnn_cell_impl._WEIGHTS_VARIABLE_NAME, - shape=[input_depth, 3 * self._num_units]) + shape=[input_depth, 4 * self._num_units]) self._bias = self.add_variable( rnn_cell_impl._BIAS_VARIABLE_NAME, @@ -2760,8 +2744,8 @@ class SRUCell(rnn_cell_impl._LayerRNNCell): """Simple recurrent unit (SRU) with num_units cells.""" U = math_ops.matmul(inputs, self._kernel) - x_bar, f_intermediate, r_intermediate = array_ops.split( - value=U, num_or_size_splits=3, axis=1) + x_bar, f_intermediate, r_intermediate, x_tx = array_ops.split( + value=U, num_or_size_splits=4, axis=1) f_r = math_ops.sigmoid( nn_ops.bias_add( @@ -2769,7 +2753,7 @@ class SRUCell(rnn_cell_impl._LayerRNNCell): f, r = array_ops.split(value=f_r, num_or_size_splits=2, axis=1) c = f * state + (1.0 - f) * x_bar - h = r * self._activation(c) + (1.0 - r) * inputs + h = r * self._activation(c) + (1.0 - r) * x_tx return h, c diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py index 95dea312f3a..0a53fd66dbe 100644 --- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py +++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py @@ -331,7 +331,7 @@ def _luong_score(query, keys, scale): # batched matmul on: # [batch_size, 1, depth] . [batch_size, depth, max_time] # resulting in an output shape of: - # [batch_time, 1, max_time]. + # [batch_size, 1, max_time]. # we then squeeze out the center singleton dimension. score = math_ops.matmul(query, keys, transpose_b=True) score = array_ops.squeeze(score, [1]) @@ -924,8 +924,7 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism): _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode, seed=sigmoid_noise_seed) super(LuongMonotonicAttention, self).__init__( - query_layer=layers_core.Dense( - num_units, name="query_layer", use_bias=False, dtype=dtype), + query_layer=None, memory_layer=layers_core.Dense( num_units, name="memory_layer", use_bias=False, dtype=dtype), memory=memory, diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py index 6d8f786223f..3245cc5e721 100644 --- a/tensorflow/contrib/seq2seq/python/ops/helper.py +++ b/tensorflow/contrib/seq2seq/python/ops/helper.py @@ -72,14 +72,6 @@ class Helper(object): """ raise NotImplementedError("batch_size has not been implemented") - @abc.abstractproperty - def input_shape(self): - """Shape of each input element in batch. - - Returns a `TensorShape`. - """ - raise NotImplementedError("input_shape has not been implemented") - @abc.abstractproperty def sample_ids_shape(self): """Shape of tensor returned by `sample`, excluding the batch dimension. 
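The helper.py hunk above drops the abstract `input_shape` property from `Helper`, so the per-step input shape is no longer exposed through the interface. A minimal sketch of one way a caller could still recover it, assuming a batch-major helper instance named `helper` (hypothetical name, not part of this diff) and mirroring the `next_inputs.shape[1:]` logic that the removed code used:

    # Hypothetical illustration only: derive the per-step input shape from the
    # helper's first inputs instead of the removed Helper.input_shape property.
    first_finished, first_inputs = helper.initialize()
    per_step_input_shape = first_inputs.shape[1:]  # static TensorShape, batch dim dropped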
@@ -135,7 +127,6 @@ class CustomHelper(Helper): self._sample_fn = sample_fn self._next_inputs_fn = next_inputs_fn self._batch_size = None - self._input_shape = None self._sample_ids_shape = tensor_shape.TensorShape(sample_ids_shape or []) self._sample_ids_dtype = sample_ids_dtype or dtypes.int32 @@ -158,8 +149,6 @@ class CustomHelper(Helper): (finished, next_inputs) = self._initialize_fn() if self._batch_size is None: self._batch_size = array_ops.size(finished) - if self._input_shape is None: - self._input_shape = next_inputs.shape[1:] return (finished, next_inputs) def sample(self, time, outputs, state, name=None): @@ -211,23 +200,6 @@ class TrainingHelper(Helper): lambda inp: array_ops.zeros_like(inp[0, :]), inputs) self._batch_size = array_ops.size(sequence_length) - self._input_shape = inputs.shape[2:] - - @property - def batch_size(self): - return self._batch_size - - @property - def input_shape(self): - return self._input_shape - - @property - def sample_ids_shape(self): - return tensor_shape.TensorShape([]) - - @property - def sample_ids_dtype(self): - return dtypes.int32 @property def inputs(self): @@ -237,6 +209,18 @@ class TrainingHelper(Helper): def sequence_length(self): return self._sequence_length + @property + def batch_size(self): + return self._batch_size + + @property + def sample_ids_shape(self): + return tensor_shape.TensorShape([]) + + @property + def sample_ids_dtype(self): + return dtypes.int32 + def initialize(self, name=None): with ops.name_scope(name, "TrainingHelperInitialize"): finished = math_ops.equal(0, self._sequence_length) @@ -541,16 +525,11 @@ class GreedyEmbeddingHelper(Helper): if self._end_token.get_shape().ndims != 0: raise ValueError("end_token must be a scalar") self._start_inputs = self._embedding_fn(self._start_tokens) - self._input_shape = self._start_inputs.shape[1:] @property def batch_size(self): return self._batch_size - @property - def input_shape(self): - return self._input_shape - @property def sample_ids_shape(self): return tensor_shape.TensorShape([]) @@ -662,8 +641,6 @@ class InferenceHelper(Helper): self._sample_dtype = sample_dtype self._next_inputs_fn = next_inputs_fn self._batch_size = array_ops.shape(start_inputs)[0] - self._input_shape = start_inputs.shape[1:] - self._start_inputs = ops.convert_to_tensor( start_inputs, name="start_inputs") @@ -671,10 +648,6 @@ class InferenceHelper(Helper): def batch_size(self): return self._batch_size - @property - def input_shape(self): - return self._input_shape - @property def sample_ids_shape(self): return self._sample_shape diff --git a/tensorflow/contrib/session_bundle/bundle_shim.py b/tensorflow/contrib/session_bundle/bundle_shim.py index 3149875e41f..69db594f8ae 100644 --- a/tensorflow/contrib/session_bundle/bundle_shim.py +++ b/tensorflow/contrib/session_bundle/bundle_shim.py @@ -82,7 +82,8 @@ def _convert_default_signature_to_signature_def(signatures): """ default_signature = signatures.default_signature signature_def = meta_graph_pb2.SignatureDef() - if default_signature.WhichOneof("type") == legacy_constants.REGRESSION_SIGNATURE: + if (default_signature.WhichOneof("type") == + legacy_constants.REGRESSION_SIGNATURE): regression_signature = default_signature.regression_signature signature_def.method_name = signature_constants.REGRESS_METHOD_NAME _add_input_to_signature_def(regression_signature.input.tensor_name, @@ -91,7 +92,8 @@ def _convert_default_signature_to_signature_def(signatures): _add_output_to_signature_def(regression_signature.output.tensor_name, 
signature_constants.REGRESS_OUTPUTS, signature_def) - elif default_signature.WhichOneof("type") == legacy_constants.CLASSIFICATION_SIGNATURE: + elif (default_signature.WhichOneof("type") == + legacy_constants.CLASSIFICATION_SIGNATURE): classification_signature = default_signature.classification_signature signature_def.method_name = signature_constants.CLASSIFY_METHOD_NAME _add_input_to_signature_def(classification_signature.input.tensor_name, @@ -132,8 +134,10 @@ def _convert_named_signatures_to_signature_def(signatures): signature_constants.PREDICT_OUTPUTS] # TODO(pdudnik): what if there are other signatures? Mimic cr/140900781 once # it is submitted. - if (input_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE or - output_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE): + if (input_signature.WhichOneof("type") != + legacy_constants.GENERIC_SIGNATURE or + output_signature.WhichOneof("type") != + legacy_constants.GENERIC_SIGNATURE): raise RuntimeError("Named input and output signatures can only be " "up-converted if they are generic signature. " "Input signature type is %s, output signature type is " diff --git a/tensorflow/contrib/session_bundle/bundle_shim_test.cc b/tensorflow/contrib/session_bundle/bundle_shim_test.cc index 72f32a0f555..9a1dd9303f4 100644 --- a/tensorflow/contrib/session_bundle/bundle_shim_test.cc +++ b/tensorflow/contrib/session_bundle/bundle_shim_test.cc @@ -493,17 +493,15 @@ TEST(BundleShimTest, DefaultAndNamedSignatureWithPredict) { ASSERT_FALSE( actual_signature_def_predict->second.inputs().find("foo-input") == actual_signature_def_predict->second.inputs().end()); - EXPECT_EQ("foo-input", - actual_signature_def_predict->second.inputs() - .find("foo-input") - ->second.name()); + EXPECT_EQ("foo-input", actual_signature_def_predict->second.inputs() + .find("foo-input") + ->second.name()); ASSERT_FALSE( actual_signature_def_predict->second.outputs().find("foo-output") == actual_signature_def_predict->second.outputs().end()); - EXPECT_EQ("foo-output", - actual_signature_def_predict->second.outputs() - .find("foo-output") - ->second.name()); + EXPECT_EQ("foo-output", actual_signature_def_predict->second.outputs() + .find("foo-output") + ->second.name()); EXPECT_EQ(kPredictMethodName, actual_signature_def_predict->second.method_name()); } diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py index f6f663aae76..08983337fcc 100644 --- a/tensorflow/contrib/session_bundle/exporter.py +++ b/tensorflow/contrib/session_bundle/exporter.py @@ -281,11 +281,12 @@ class Exporter(object): tmp_export_dir = compat.as_text(export_dir) + "-tmp" gfile.MakeDirs(tmp_export_dir) - self._saver.save(sess, - os.path.join( - compat.as_text(tmp_export_dir), - compat.as_text(constants.EXPORT_BASE_NAME)), - meta_graph_suffix=constants.EXPORT_SUFFIX_NAME) + self._saver.save( + sess, + os.path.join( + compat.as_text(tmp_export_dir), + compat.as_text(constants.EXPORT_BASE_NAME)), + meta_graph_suffix=constants.EXPORT_SUFFIX_NAME) # Run the asset callback. if self._assets_callback and self._assets_to_copy: @@ -301,12 +302,12 @@ class Exporter(object): if exports_to_keep: # create a simple parser that pulls the export_version from the directory. 
def parser(path): - if os.name == 'nt': - match = re.match("^" + export_dir_base.replace('\\','/') + "/(\\d{8})$", - path.path.replace('\\','/')) + if os.name == "nt": + match = re.match( + "^" + export_dir_base.replace("\\", "/") + "/(\\d{8})$", + path.path.replace("\\", "/")) else: - match = re.match("^" + export_dir_base + "/(\\d{8})$", - path.path) + match = re.match("^" + export_dir_base + "/(\\d{8})$", path.path) if not match: return None return path._replace(export_version=int(match.group(1))) diff --git a/tensorflow/contrib/session_bundle/signature.cc b/tensorflow/contrib/session_bundle/signature.cc index 7133875ad53..ed70a5b91b2 100644 --- a/tensorflow/contrib/session_bundle/signature.cc +++ b/tensorflow/contrib/session_bundle/signature.cc @@ -38,9 +38,9 @@ namespace { Status BatchSizesMatch(const Tensor& input, const Tensor& output) { // Ensure the number of outputs match the number of inputs. if (input.dim_size(0) != output.dim_size(0)) { - return errors::Internal( - strings::StrCat("Input batch size did not match output batch size: ", - input.dim_size(0), " vs. ", output.dim_size(0))); + return errors::Internal(strings::StrCat( + "Input batch size did not match output batch size: ", input.dim_size(0), + " vs. ", output.dim_size(0))); } return Status::OK(); } @@ -100,8 +100,8 @@ Status GetNamedClassificationSignature( const auto& it = signatures.named_signatures().find(name); if (it == signatures.named_signatures().end()) { return errors::NotFound( - strings::StrCat("Missing signature named \"", name, "\" in: ", - DebugStringIfAvailable(signatures))); + strings::StrCat("Missing signature named \"", name, + "\" in: ", DebugStringIfAvailable(signatures))); } if (!it->second.has_classification_signature()) { return errors::FailedPrecondition( @@ -232,8 +232,8 @@ Status GetNamedSignature(const string& name, const auto& it = signatures.named_signatures().find(name); if (it == signatures.named_signatures().end()) { return errors::NotFound( - strings::StrCat("Missing signature named \"", name, "\" in: ", - DebugStringIfAvailable(signatures))); + strings::StrCat("Missing signature named \"", name, + "\" in: ", DebugStringIfAvailable(signatures))); } *signature = it->second; return Status::OK(); diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py index 4e816f9b11b..831c6e427ae 100644 --- a/tensorflow/contrib/slim/python/slim/learning_test.py +++ b/tensorflow/contrib/slim/python/slim/learning_test.py @@ -197,9 +197,7 @@ class MultiplyGradientsTest(test.TestCase): gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32) variable = variables_lib.Variable(array_ops.zeros_like(gradient)) multiplier_flag = variables_lib.Variable(True) - tensor_multiplier = array_ops.where(multiplier_flag, - self._multiplier, - 1.0) + tensor_multiplier = array_ops.where(multiplier_flag, self._multiplier, 1.0) grad_to_var = (gradient, variable) gradient_multipliers = {variable: tensor_multiplier} @@ -212,11 +210,8 @@ class MultiplyGradientsTest(test.TestCase): sess.run(multiplier_flag.assign(False)) gradient_false_flag = sess.run(grad_to_var[0]) np_testing.assert_almost_equal(gradient_true_flag, - self._multiplied_grad_vec, - 5) - np_testing.assert_almost_equal(gradient_false_flag, - self._grad_vec, - 5) + self._multiplied_grad_vec, 5) + np_testing.assert_almost_equal(gradient_false_flag, self._grad_vec, 5) def LogisticClassifier(inputs): @@ -502,6 +497,7 @@ class TrainTest(test.TestCase): purpose. 
""" dump_root = tempfile.mkdtemp() + def dumping_wrapper(sess): # pylint: disable=invalid-name return dumping_wrapper_lib.DumpingDebugWrapperSession(sess, dump_root) @@ -519,16 +515,13 @@ class TrainTest(test.TestCase): train_op = learning.create_train_op(total_loss, optimizer) loss = learning.train( - train_op, - None, - number_of_steps=1, - session_wrapper=dumping_wrapper) + train_op, None, number_of_steps=1, session_wrapper=dumping_wrapper) self.assertIsNotNone(loss) run_root = glob.glob(os.path.join(dump_root, 'run_*'))[-1] dump = debug_data.DebugDumpDir(run_root) - self.assertAllEqual( - 0, dump.get_tensors('global_step', 0, 'DebugIdentity')[0]) + self.assertAllEqual(0, + dump.get_tensors('global_step', 0, 'DebugIdentity')[0]) def testTrainWithTrace(self): logdir = os.path.join( @@ -961,8 +954,8 @@ class TrainTest(test.TestCase): self.assertGreater(losses[0], losses[1]) def testTrainWithEpochLimit(self): - logdir = os.path.join(tempfile.mkdtemp(prefix=self.get_temp_dir()), - 'tmp_logs') + logdir = os.path.join( + tempfile.mkdtemp(prefix=self.get_temp_dir()), 'tmp_logs') with ops.Graph().as_default(): random_seed.set_random_seed(0) tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32) @@ -982,7 +975,8 @@ class TrainTest(test.TestCase): self.assertIsNotNone(loss) self.assertLess(loss, .015) self.assertTrue(os.path.isfile('{}/model.ckpt-300.index'.format(logdir))) - self.assertTrue(os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir))) + self.assertTrue( + os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir))) if __name__ == '__main__': diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py index ee661dfdc11..a6968d8b2a6 100644 --- a/tensorflow/contrib/summary/summary_ops.py +++ b/tensorflow/contrib/summary/summary_ops.py @@ -202,7 +202,7 @@ def create_file_writer(logdir, if flush_millis is None: flush_millis = constant_op.constant(2 * 60 * 1000) if filename_suffix is None: - filename_suffix = constant_op.constant("") + filename_suffix = constant_op.constant(".v2") return _make_summary_writer( name, gen_summary_ops.create_summary_file_writer, diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py index a998ac1e111..4abcc20ed33 100644 --- a/tensorflow/contrib/tensor_forest/client/random_forest.py +++ b/tensorflow/contrib/tensor_forest/client/random_forest.py @@ -18,7 +18,7 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib import layers - +from tensorflow.contrib.learn.python.learn.estimators import constants from tensorflow.contrib.learn.python.learn.estimators import estimator from tensorflow.contrib.learn.python.learn.estimators import head as head_lib from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib @@ -43,8 +43,8 @@ from tensorflow.python.training import training_util KEYS_NAME = 'keys' LOSS_NAME = 'rf_training_loss' TREE_PATHS_PREDICTION_KEY = 'tree_paths' -VARIANCE_PREDICTION_KEY = 'regression_variance' - +VARIANCE_PREDICTION_KEY = 'prediction_variance' +ALL_SERVING_KEY = 'tensorforest_all' EPSILON = 0.000001 @@ -134,7 +134,8 @@ def get_model_fn(params, trainer_id=0, report_feature_importances=False, local_eval=False, - head_scope=None): + head_scope=None, + include_all_in_serving=False): """Return a model function given a way to construct a graph builder.""" if model_head is None: model_head = get_default_head(params, weights_name) @@ 
-238,7 +239,13 @@ def get_model_fn(params, model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance - + if include_all_in_serving: + # In order to serve the variance we need to add the prediction dict + # to output_alternatives dict. + if not model_ops.output_alternatives: + model_ops.output_alternatives = {} + model_ops.output_alternatives[ALL_SERVING_KEY] = ( + constants.ProblemType.UNSPECIFIED, model_ops.predictions) return model_ops return _model_fn @@ -293,7 +300,8 @@ class TensorForestEstimator(estimator.Estimator): report_feature_importances=False, local_eval=False, version=None, - head=None): + head=None, + include_all_in_serving=False): """Initializes a TensorForestEstimator instance. Args: @@ -339,6 +347,23 @@ class TensorForestEstimator(estimator.Estimator): version: Unused. head: A heads_lib.Head object that calculates losses and such. If None, one will be automatically created based on params. + include_all_in_serving: if True, allow preparation of the complete + prediction dict including the variance to be exported for serving with + the Servo lib; and it also requires calling export_savedmodel with + default_output_alternative_key=ALL_SERVING_KEY, i.e. + estimator.export_savedmodel(export_dir_base=your_export_dir, + serving_input_fn=your_export_input_fn, + default_output_alternative_key=ALL_SERVING_KEY) + if False, resort to default behavior, i.e. export scores and + probabilities but no variances. In this case + default_output_alternative_key should be None while calling + export_savedmodel(). + Note, that due to backward compatibility we cannot always set + include_all_in_serving to True because in this case calling + export_saved_model() without + default_output_alternative_key=ALL_SERVING_KEY (legacy behavior) the + saved_model_export_utils.get_output_alternatives() would raise + ValueError. Returns: A `TensorForestEstimator` instance. @@ -357,7 +382,9 @@ class TensorForestEstimator(estimator.Estimator): num_trainers=num_trainers, trainer_id=trainer_id, report_feature_importances=report_feature_importances, - local_eval=local_eval), + local_eval=local_eval, + include_all_in_serving=include_all_in_serving, + ), model_dir=model_dir, config=config, feature_engineering_fn=feature_engineering_fn) diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc index 76cfb4c9ca0..cf0db788a41 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc @@ -99,18 +99,17 @@ class HardRoutingFunction : public OpKernel { const Tensor& tree_biases_tensor = context->input(2); if (input_data.shape().dim_size(0) > 0) { - OP_REQUIRES(context, input_data.shape().dims() == 2, - errors::InvalidArgument( - "input_data should be two-dimensional")); + OP_REQUIRES( + context, input_data.shape().dims() == 2, + errors::InvalidArgument("input_data should be two-dimensional")); } // Check tensor bounds. 
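A minimal usage sketch of the include_all_in_serving flag added above, following the call pattern the new docstring spells out; params, your_export_dir and your_export_input_fn are placeholders rather than names introduced by this change.

    from tensorflow.contrib.tensor_forest.client import random_forest

    # With include_all_in_serving=True the complete prediction dict, including
    # the variance, is exposed under ALL_SERVING_KEY in output_alternatives.
    estimator = random_forest.TensorForestEstimator(
        params, include_all_in_serving=True)

    # The export must then select that alternative explicitly.
    estimator.export_savedmodel(
        export_dir_base=your_export_dir,
        serving_input_fn=your_export_input_fn,
        default_output_alternative_key=random_forest.ALL_SERVING_KEY)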
if (!CheckTensorBounds(context, input_data)) return; - const int32 num_data = static_cast( - input_data.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data.shape().dim_size(1)); + const int32 num_data = static_cast(input_data.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data.shape().dim_size(1)); Tensor* output_probability = nullptr; TensorShape output_probability_shape; @@ -125,9 +124,8 @@ class HardRoutingFunction : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, output_probability_shape, &output_probability)); - OP_REQUIRES_OK(context, - context->allocate_output(1, output_path_shape, - &output_path)); + OP_REQUIRES_OK( + context, context->allocate_output(1, output_path_shape, &output_path)); auto out_probability = output_probability->tensor(); auto out_path = output_path->tensor(); @@ -144,12 +142,11 @@ class HardRoutingFunction : public OpKernel { out_probability(i, 0) = 1.0; out_path(i, 0) = 0; for (int j = 0; j < tree_depth_ - 1; j++) { - float left_prob = LeftProbability(point, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features); + float left_prob = + LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features); - int32 left_child = 2*node + 1; + int32 left_child = 2 * node + 1; int32 right_child = left_child + 1; float dot_product = 0.0; diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc index 28f50f1a32e..f64155fa55a 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc @@ -85,12 +85,9 @@ REGISTER_OP("KFeatureGradient") class KFeatureGradient : public OpKernel { public: - explicit KFeatureGradient(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("layer_num", - &layer_num_)); - OP_REQUIRES_OK(context, context->GetAttr("random_seed", - &random_seed_)); + explicit KFeatureGradient(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_)); + OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_)); } void Compute(OpKernelContext* context) override { @@ -101,14 +98,14 @@ class KFeatureGradient : public OpKernel { const Tensor& routing_tensor = context->input(3); // Extract dimensions from input tensors. - const int32 num_data = static_cast( - input_data_tensor.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data_tensor.shape().dim_size(1)); - const int32 num_nodes = static_cast( - tree_parameters_tensor.shape().dim_size(0)); - const int32 num_features_per_node = static_cast( - tree_parameters_tensor.shape().dim_size(1)); + const int32 num_data = + static_cast(input_data_tensor.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data_tensor.shape().dim_size(1)); + const int32 num_nodes = + static_cast(tree_parameters_tensor.shape().dim_size(0)); + const int32 num_features_per_node = + static_cast(tree_parameters_tensor.shape().dim_size(1)); // Construct output tensors. 
Tensor* out_routes = nullptr; @@ -127,12 +124,12 @@ class KFeatureGradient : public OpKernel { out_weights_shape.AddDim(num_nodes); out_weights_shape.AddDim(num_features_per_node); - OP_REQUIRES_OK(context, context->allocate_output( - 0, out_routes_shape, &out_routes)); - OP_REQUIRES_OK(context, context->allocate_output( - 1, out_data_shape, &out_data)); - OP_REQUIRES_OK(context, context->allocate_output( - 2, out_weights_shape, &out_weights)); + OP_REQUIRES_OK(context, + context->allocate_output(0, out_routes_shape, &out_routes)); + OP_REQUIRES_OK(context, + context->allocate_output(1, out_data_shape, &out_data)); + OP_REQUIRES_OK( + context, context->allocate_output(2, out_weights_shape, &out_weights)); tensorforest::Initialize(*out_data, 0.0f); @@ -148,18 +145,13 @@ class KFeatureGradient : public OpKernel { std::vector feature_set; for (int i = 0; i < num_data; i++) { - const Tensor point = input_data_tensor.Slice(i, i+1); + const Tensor point = input_data_tensor.Slice(i, i + 1); feature_set.clear(); // Traverse the tree from the bottom up. for (int j = num_nodes - 1; j >= 0; j--) { - tensorforest::GetFeatureSet( - layer_num_, - j, - random_seed_, - num_features, - num_features_per_node, - &feature_set); + tensorforest::GetFeatureSet(layer_num_, j, random_seed_, num_features, + num_features_per_node, &feature_set); // Compute routing gradient. // j is a leaf node. @@ -170,12 +162,8 @@ class KFeatureGradient : public OpKernel { int32 right_child = left_child + 1; float left_prob = LeftProbabilityK( - point, - feature_set, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features, - num_features_per_node); + point, feature_set, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features, num_features_per_node); float right_prob = 1.0f - left_prob; diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc index 9bc42eb61fa..e7cafb144da 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc @@ -43,7 +43,6 @@ using shape_inference::ShapeHandle; using tensorforest::CheckTensorBounds; using tensorforest::LeftProbabilityK; - // The term 'routing function' is synonymous with 'the probability // that an instance is routed to each leaf node.' It is defined in // 'Deep Neural Decision Forests' by Kontschieder et al. @@ -96,10 +95,8 @@ class KFeatureRoutingFunction : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("max_nodes", &max_nodes_)); OP_REQUIRES_OK(context, context->GetAttr("num_features_per_node", &num_features_per_node_)); - OP_REQUIRES_OK(context, context->GetAttr("layer_num", - &layer_num_)); - OP_REQUIRES_OK(context, context->GetAttr("random_seed", - &random_seed_)); + OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_)); + OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_)); } void Compute(OpKernelContext* context) override { @@ -108,27 +105,25 @@ class KFeatureRoutingFunction : public OpKernel { const Tensor& tree_biases_tensor = context->input(2); if (input_data.shape().dim_size(0) > 0) { - OP_REQUIRES(context, input_data.shape().dims() == 2, - errors::InvalidArgument( - "input_data should be two-dimensional")); + OP_REQUIRES( + context, input_data.shape().dims() == 2, + errors::InvalidArgument("input_data should be two-dimensional")); } // Check tensor bounds. 
if (!CheckTensorBounds(context, input_data)) return; - const int32 num_data = static_cast( - input_data.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data.shape().dim_size(1)); + const int32 num_data = static_cast(input_data.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data.shape().dim_size(1)); Tensor* output_probabilities = nullptr; TensorShape output_shape; output_shape.AddDim(num_data); output_shape.AddDim(max_nodes_); - OP_REQUIRES_OK(context, - context->allocate_output(0, output_shape, - &output_probabilities)); + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, + &output_probabilities)); auto out_probs = output_probabilities->tensor(); const auto tree_biases = tree_biases_tensor.tensor(); @@ -136,30 +131,22 @@ class KFeatureRoutingFunction : public OpKernel { // Iteratively compute the probability of reaching each leaf. std::vector feature_set; for (int i = 0; i < num_data; i++) { - const Tensor point = input_data.Slice(i, i+1); + const Tensor point = input_data.Slice(i, i + 1); out_probs(i, 0) = 1.0f; for (int j = 0; j < max_nodes_ / 2; j++) { feature_set.clear(); - tensorforest::GetFeatureSet( - layer_num_, - i, - random_seed_, - num_features, - num_features_per_node_, - &feature_set); + tensorforest::GetFeatureSet(layer_num_, i, random_seed_, num_features, + num_features_per_node_, &feature_set); - int32 left_child = 2*j + 1; + int32 left_child = 2 * j + 1; int32 right_child = left_child + 1; float prob = out_probs(i, j); - float left_prob = LeftProbabilityK(point, - feature_set, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features, - num_features_per_node_); + float left_prob = LeftProbabilityK( + point, feature_set, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features, num_features_per_node_); out_probs(i, left_child) = prob * left_prob; out_probs(i, right_child) = prob * (1.0f - left_prob); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc index 4027e732b3f..0c2eaabe8f3 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc @@ -90,46 +90,43 @@ class RoutingFunction : public OpKernel { const Tensor& tree_biases_tensor = context->input(2); if (input_data.shape().dim_size(0) > 0) { - OP_REQUIRES(context, input_data.shape().dims() == 2, - errors::InvalidArgument( - "input_data should be two-dimensional")); + OP_REQUIRES( + context, input_data.shape().dims() == 2, + errors::InvalidArgument("input_data should be two-dimensional")); } // Check tensor bounds. 
if (!CheckTensorBounds(context, input_data)) return; - const int32 num_data = static_cast( - input_data.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data.shape().dim_size(1)); + const int32 num_data = static_cast(input_data.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data.shape().dim_size(1)); Tensor* output_probabilities = nullptr; TensorShape output_shape; output_shape.AddDim(num_data); output_shape.AddDim(max_nodes_); - OP_REQUIRES_OK(context, - context->allocate_output(0, output_shape, - &output_probabilities)); + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, + &output_probabilities)); auto out_probs = output_probabilities->tensor(); const auto tree_biases = tree_biases_tensor.tensor(); // Iteratively compute the probability of reaching each leaf. for (int i = 0; i < num_data; i++) { - const Tensor point = input_data.Slice(i, i+1); + const Tensor point = input_data.Slice(i, i + 1); out_probs(i, 0) = 1.0; for (int j = 0; j < max_nodes_ / 2; j++) { - int32 left_child = 2*j + 1; + int32 left_child = 2 * j + 1; int32 right_child = left_child + 1; float prob = out_probs(i, j); - float left_prob = LeftProbability(point, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features); + float left_prob = + LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features); out_probs(i, left_child) = prob * left_prob; out_probs(i, right_child) = prob * (1.0 - left_prob); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc index 66aa293dc1c..c9df09bfda4 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc @@ -96,10 +96,9 @@ class StochasticHardRoutingFunction : public OpKernel { explicit StochasticHardRoutingFunction(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("tree_depth", &tree_depth_)); - OP_REQUIRES_OK(context, context->GetAttr("random_seed", - &random_seed_)); + OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_)); single_rand_ = std::unique_ptr( - new random::PhiloxRandom(random_seed_)); + new random::PhiloxRandom(random_seed_)); rng_ = std::unique_ptr( new random::SimplePhilox(single_rand_.get())); } @@ -111,20 +110,19 @@ class StochasticHardRoutingFunction : public OpKernel { const Tensor& tree_biases_tensor = context->input(2); if (input_data.shape().dim_size(0) > 0) { - OP_REQUIRES(context, input_data.shape().dims() == 2, - errors::InvalidArgument( - "input_data should be two-dimensional")); + OP_REQUIRES( + context, input_data.shape().dims() == 2, + errors::InvalidArgument("input_data should be two-dimensional")); } // Check tensor bounds. 
if (!CheckTensorBounds(context, input_data)) return; - const int32 num_data = static_cast( - input_data.shape().dim_size(0)); - const int32 num_features = static_cast( - input_data.shape().dim_size(1)); - const int32 num_nodes = static_cast( - tree_parameters_tensor.shape().dim_size(0)); + const int32 num_data = static_cast(input_data.shape().dim_size(0)); + const int32 num_features = + static_cast(input_data.shape().dim_size(1)); + const int32 num_nodes = + static_cast(tree_parameters_tensor.shape().dim_size(0)); Tensor* output_probability = nullptr; TensorShape output_probability_shape; @@ -139,9 +137,8 @@ class StochasticHardRoutingFunction : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, output_probability_shape, &output_probability)); - OP_REQUIRES_OK(context, - context->allocate_output(1, output_path_shape, - &output_path)); + OP_REQUIRES_OK( + context, context->allocate_output(1, output_path_shape, &output_path)); auto out_probability = output_probability->tensor(); auto out_path = output_path->tensor(); @@ -150,19 +147,18 @@ class StochasticHardRoutingFunction : public OpKernel { // Stochastically traverse the tree to a leaf. for (int i = 0; i < num_data; i++) { - const Tensor point = input_data.Slice(i, i+1); + const Tensor point = input_data.Slice(i, i + 1); int32 node = 0; out_probability(i, 0) = 1.0; out_path(i, 0) = 0; for (int j = 0; j < tree_depth_ - 1; j++) { - int32 left_child = 2*node + 1; + int32 left_child = 2 * node + 1; int32 right_child = left_child + 1; - float left_prob = LeftProbability(point, - tree_parameters_tensor.Slice(j, j+1), - tree_biases(j), - num_features); + float left_prob = + LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1), + tree_biases(j), num_features); if (left_prob < rng_->RandFloat()) { CHECK_LT(i, num_data); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc index 0b5afe464f4..b0d8b832b54 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc @@ -149,14 +149,14 @@ class StochasticHardRoutingGradient : public OpKernel { TensorShape output_bias_shape; output_bias_shape.AddDim(num_data); - OP_REQUIRES_OK(context, context->allocate_output( - 0, output_routing_shape, &output_routing)); - OP_REQUIRES_OK(context, context->allocate_output( - 1, output_data_shape, &output_data)); - OP_REQUIRES_OK(context, context->allocate_output( - 2, output_parameters_shape, &output_parameters)); - OP_REQUIRES_OK(context, context->allocate_output( - 3, output_bias_shape, &output_bias)); + OP_REQUIRES_OK(context, context->allocate_output(0, output_routing_shape, + &output_routing)); + OP_REQUIRES_OK( + context, context->allocate_output(1, output_data_shape, &output_data)); + OP_REQUIRES_OK(context, context->allocate_output(2, output_parameters_shape, + &output_parameters)); + OP_REQUIRES_OK( + context, context->allocate_output(3, output_bias_shape, &output_bias)); tensorforest::Initialize(*output_routing, 0.0); tensorforest::Initialize(*output_data, 0.0); @@ -178,7 +178,7 @@ class StochasticHardRoutingGradient : public OpKernel { const Tensor point = input_data.Slice(i, i + 1); // Traverses the tree from the bottom up. 
- for (int j = tree_depth_-1; j > -1; j--) { + for (int j = tree_depth_ - 1; j > -1; j--) { int32 node = path(i, j); CHECK_LT(node, num_nodes); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc index cacad03e274..25825a78a14 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc @@ -64,8 +64,7 @@ REGISTER_OP("UnpackPath") class UnpackPath : public OpKernel { public: - explicit UnpackPath(OpKernelConstruction* context) - : OpKernel(context) {} + explicit UnpackPath(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { VLOG(1) << "unpack start"; @@ -73,8 +72,8 @@ class UnpackPath : public OpKernel { const Tensor& path_values_tensor = context->input(1); const int32 num_data = static_cast(path_tensor.shape().dim_size(0)); - const int32 tree_depth = static_cast( - path_tensor.shape().dim_size(1)); + const int32 tree_depth = + static_cast(path_tensor.shape().dim_size(1)); const int32 num_nodes = MathUtil::IPow(2, tree_depth) - 1; @@ -107,7 +106,6 @@ class UnpackPath : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU), - UnpackPath); +REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU), UnpackPath); } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc index c091a73c4e4..34388fe1aab 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc @@ -25,9 +25,7 @@ namespace tensorforest { using tensorflow::Tensor; -float LeftProbability(const Tensor& point, - const Tensor& weight, - float bias, +float LeftProbability(const Tensor& point, const Tensor& weight, float bias, int num_features) { const auto p = point.unaligned_flat(); const auto w = weight.unaligned_flat(); @@ -41,11 +39,8 @@ float LeftProbability(const Tensor& point, return 1.0 / (1.0 + exp(-dot_product + bias)); } -float LeftProbabilityK(const Tensor& point, - std::vector feature_set, - const Tensor& weight, - float bias, - int num_features, +float LeftProbabilityK(const Tensor& point, std::vector feature_set, + const Tensor& weight, float bias, int num_features, int k) { const auto p = point.unaligned_flat(); const auto w = weight.unaligned_flat(); diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h index c5902184f95..69a0143a4e3 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h +++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h @@ -24,16 +24,11 @@ namespace tensorflow { namespace tensorforest { // Returns the probability that the point falls to the left. 
-float LeftProbability(const Tensor& point, - const Tensor& weight, - float bias, +float LeftProbability(const Tensor& point, const Tensor& weight, float bias, int num_features); -float LeftProbabilityK(const Tensor& point, - std::vector feature_set, - const Tensor& weight, - float bias, - int num_features, +float LeftProbabilityK(const Tensor& point, std::vector feature_set, + const Tensor& weight, float bias, int num_features, int k); // Returns a random set of num_features_to_pick features in the @@ -49,5 +44,3 @@ void GetFeatureSet(int32 tree_num, int32 node_num, int32 random_seed, } // namespace tensorflow #endif // LEARNING_LIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_ - - diff --git a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc index 47b49a379c4..b21a9179777 100644 --- a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc +++ b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc @@ -30,15 +30,13 @@ namespace tensorflow { using tensorforest::CheckTensorBounds; - float Convert(const string& in) { const std::size_t intval = std::hash()(in); return static_cast(intval); } - -void Evaluate(const Tensor& input_data, Tensor output_data, - int32 start, int32 end) { +void Evaluate(const Tensor& input_data, Tensor output_data, int32 start, + int32 end) { auto out_data = output_data.unaligned_flat(); const auto in_data = input_data.unaligned_flat(); @@ -59,9 +57,8 @@ class ReinterpretStringToFloat : public OpKernel { if (!CheckTensorBounds(context, input_data)) return; Tensor* output_data = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0, input_data.shape(), - &output_data)); + OP_REQUIRES_OK( + context, context->allocate_output(0, input_data.shape(), &output_data)); // Evaluate input data in parallel. 
const int32 num_data = static_cast(input_data.NumElements()); @@ -73,8 +70,8 @@ class ReinterpretStringToFloat : public OpKernel { auto work = [&input_data, output_data, num_data](int64 start, int64 end) { CHECK(start <= end); CHECK(end <= num_data); - Evaluate(input_data, *output_data, - static_cast(start), static_cast(end)); + Evaluate(input_data, *output_data, static_cast(start), + static_cast(end)); }; Shard(num_threads, worker_threads->workers, num_data, 100, work); } diff --git a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc index dd2a98b08cd..60740c2be37 100644 --- a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc +++ b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc @@ -22,7 +22,6 @@ #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/platform/logging.h" - namespace tensorflow { using tensorforest::CheckTensorBounds; @@ -38,20 +37,19 @@ class ScatterAddNdim : public OpKernel { if (indices_tensor.shape().dim_size(0) > 0) { OP_REQUIRES(context, indices_tensor.shape().dims() == 2, - errors::InvalidArgument( - "indices should be two-dimensional")); + errors::InvalidArgument("indices should be two-dimensional")); const int32 delta_dims = deltas_tensor.shape().dims(); OP_REQUIRES( context, indices_tensor.shape().dim_size(1) + delta_dims == - input_tensor.shape().dims() + 1, + input_tensor.shape().dims() + 1, errors::InvalidArgument( "Number of indices dimensions should be the same as input " "rank.")); OP_REQUIRES( context, indices_tensor.shape().dim_size(0) == - deltas_tensor.shape().dim_size(0), + deltas_tensor.shape().dim_size(0), errors::InvalidArgument( "Number of updates should be same as number of indices.")); } else { @@ -68,8 +66,8 @@ class ScatterAddNdim : public OpKernel { const auto indices = indices_tensor.tensor(); const auto deltas = deltas_tensor.unaligned_flat(); - const int32 num_dims = static_cast( - indices_tensor.shape().dim_size(1)); + const int32 num_dims = + static_cast(indices_tensor.shape().dim_size(1)); // Figure out if indices don't specify a complete position in the // input tensor. @@ -80,10 +78,9 @@ class ScatterAddNdim : public OpKernel { // Calculate index multipliers. std::vector multipliers; - OP_REQUIRES( - context, input.size() < std::numeric_limits::max(), - errors::InvalidArgument( - "Input must contain less than 2^31 total elements")); + OP_REQUIRES(context, input.size() < std::numeric_limits::max(), + errors::InvalidArgument( + "Input must contain less than 2^31 total elements")); int32 last_size = static_cast(input.size()); for (int32 j = 0; j < num_dims; j++) { diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc index 94e12cea5a0..44997ec5d6d 100644 --- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc +++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc @@ -65,8 +65,8 @@ void GetTwoBest(int max, const std::function& score_fn, float ClassificationSplitScore( const Eigen::Tensor& splits, - const Eigen::Tensor& rights, - int32 num_classes, int i) { + const Eigen::Tensor& rights, int32 num_classes, + int i) { Eigen::array offsets; // Class counts are stored with the total in [0], so the length of each // count vector is num_classes + 1. 
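As a small hypothetical illustration of the layout this comment describes: with num_classes = 3 each per-split count vector holds four floats, [total, c1, c2, c3], so the slice of length num_classes taken for the Gini score starts one past the stored total.

    num_classes = 3
    counts = [10.0, 4.0, 3.0, 3.0]           # [total, c1, c2, c3] for one split
    per_class = counts[1:1 + num_classes]    # what the (offsets, extents) slice keeps
    assert sum(per_class) == counts[0]       # the leading entry is just the total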
@@ -74,7 +74,7 @@ float ClassificationSplitScore( Eigen::array extents; extents[0] = num_classes; return WeightedGiniImpurity(splits.slice(offsets, extents)) + - WeightedGiniImpurity(rights.slice(offsets, extents)); + WeightedGiniImpurity(rights.slice(offsets, extents)); } void GetTwoBestClassification(const Tensor& total_counts, @@ -90,29 +90,28 @@ void GetTwoBestClassification(const Tensor& total_counts, // in seg faults, so we have to go with flat views of these tensors. However, // it is still pretty efficient because we put off evaluation until the // score is actually returned. - const auto tc = total_counts.Slice( - accumulator, accumulator + 1).unaligned_flat(); + const auto tc = + total_counts.Slice(accumulator, accumulator + 1).unaligned_flat(); // TODO(gilberth): See if we can delay evaluation here by templating the // arguments to ClassificationSplitScore. - const Eigen::Tensor splits = split_counts.Slice( - accumulator, accumulator + 1).unaligned_flat(); + const Eigen::Tensor splits = + split_counts.Slice(accumulator, accumulator + 1).unaligned_flat(); Eigen::array bcast; bcast[0] = num_splits; const Eigen::Tensor rights = tc.broadcast(bcast) - splits; - std::function score_fn = std::bind( - ClassificationSplitScore, splits, rights, num_classes, - std::placeholders::_1); + std::function score_fn = + std::bind(ClassificationSplitScore, splits, rights, num_classes, + std::placeholders::_1); GetTwoBest(num_splits, score_fn, best_score, best_index, second_best_score, second_best_index); } -int32 BestFeatureClassification( - const Tensor& total_counts, const Tensor& split_counts, - int32 accumulator) { +int32 BestFeatureClassification(const Tensor& total_counts, + const Tensor& split_counts, int32 accumulator) { float best_score; float second_best_score; int best_feature_index; @@ -130,8 +129,7 @@ float RegressionSplitScore( const Eigen::Tensor& splits_square, const Eigen::Tensor& right_sums, const Eigen::Tensor& right_squares, - int32 accumulator, - int32 num_regression_dims, int i) { + int32 accumulator, int32 num_regression_dims, int i) { Eigen::array offsets = {i * num_regression_dims + 1}; Eigen::array extents = {num_regression_dims - 1}; float left_count = splits_count_accessor(accumulator, i, 0); @@ -141,15 +139,15 @@ float RegressionSplitScore( // Guard against divide-by-zero. if (left_count > 0) { - score += WeightedVariance( - splits_sum.slice(offsets, extents), - splits_square.slice(offsets, extents), left_count); + score += + WeightedVariance(splits_sum.slice(offsets, extents), + splits_square.slice(offsets, extents), left_count); } if (right_count > 0) { - score += WeightedVariance(right_sums.slice(offsets, extents), - right_squares.slice(offsets, extents), - right_count); + score += + WeightedVariance(right_sums.slice(offsets, extents), + right_squares.slice(offsets, extents), right_count); } return score; } @@ -159,20 +157,20 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares, int32 accumulator, float* best_score, int* best_index, float* second_best_score, int* second_best_index) { const int32 num_splits = static_cast(split_sums.shape().dim_size(1)); - const int32 num_regression_dims = static_cast( - split_sums.shape().dim_size(2)); + const int32 num_regression_dims = + static_cast(split_sums.shape().dim_size(2)); // Ideally, Eigen::Tensor::chip would be best to use here but it results // in seg faults, so we have to go with flat views of these tensors. 
However, // it is still pretty efficient because we put off evaluation until the // score is actually returned. - const auto tc_sum = total_sums.Slice( - accumulator, accumulator + 1).unaligned_flat(); - const auto tc_square = total_squares.Slice( - accumulator, accumulator + 1).unaligned_flat(); - const auto splits_sum = split_sums.Slice( - accumulator, accumulator + 1).unaligned_flat(); - const auto splits_square = split_squares.Slice( - accumulator, accumulator + 1).unaligned_flat(); + const auto tc_sum = + total_sums.Slice(accumulator, accumulator + 1).unaligned_flat(); + const auto tc_square = + total_squares.Slice(accumulator, accumulator + 1).unaligned_flat(); + const auto splits_sum = + split_sums.Slice(accumulator, accumulator + 1).unaligned_flat(); + const auto splits_square = + split_squares.Slice(accumulator, accumulator + 1).unaligned_flat(); // Eigen is infuriating to work with, usually resulting in all kinds of // unhelpful compiler errors when trying something that seems sane. This // helps us do a simple thing like access the first element (the counts) @@ -193,10 +191,10 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares, best_score, best_index, second_best_score, second_best_index); } -int32 BestFeatureRegression( - const Tensor& total_sums, const Tensor& total_squares, - const Tensor& split_sums, const Tensor& split_squares, - int32 accumulator) { +int32 BestFeatureRegression(const Tensor& total_sums, + const Tensor& total_squares, + const Tensor& split_sums, + const Tensor& split_squares, int32 accumulator) { float best_score; float second_best_score; int best_feature_index; @@ -207,10 +205,11 @@ int32 BestFeatureRegression( return best_feature_index; } -bool BestSplitDominatesRegression( - const Tensor& total_sums, const Tensor& total_squares, - const Tensor& split_sums, const Tensor& split_squares, - int32 accumulator) { +bool BestSplitDominatesRegression(const Tensor& total_sums, + const Tensor& total_squares, + const Tensor& split_sums, + const Tensor& split_squares, + int32 accumulator) { // TODO(thomaswc): Implement this, probably as part of v3. return false; } @@ -599,7 +598,6 @@ bool Decide(float value, float bias, DataColumnTypes type) { } } - void GetParentWeightedMean(float leaf_sum, const float* leaf_data, float parent_sum, const float* parent_data, float valid_leaf_threshold, int num_outputs, diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h index dad9df48988..edbac670067 100644 --- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h +++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h @@ -45,13 +45,10 @@ const int32 LEAF_NODE = -1; const int32 FREE_NODE = -2; // Used to indicate column types, e.g. categorical vs. float -enum DataColumnTypes { - kDataFloat = 0, - kDataCategorical = 1 -}; +enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 }; // Calculates the sum of a tensor. -template +template T Sum(Tensor counts) { Eigen::Tensor count_sum = counts.unaligned_flat().sum(); @@ -97,7 +94,7 @@ float WeightedGiniImpurity(const T& counts) { return RawWeightedGiniImpurity(smoothed); } -template +template float WeightedVariance(const T1& sums, const T2& squares, float count) { const auto e_x = sums / count; const auto e_x2 = squares / count; @@ -120,10 +117,11 @@ int32 BestFeatureRegression(const Tensor& total_sums, // Returns true if the best split's variance is sufficiently smaller than // that of the next best split. 
-bool BestSplitDominatesRegression( - const Tensor& total_sums, const Tensor& total_squares, - const Tensor& split_sums, const Tensor& split_squares, - int32 accumulator); +bool BestSplitDominatesRegression(const Tensor& total_sums, + const Tensor& total_squares, + const Tensor& split_sums, + const Tensor& split_squares, + int32 accumulator); // Performs booststrap_samples bootstrap samples of the best split's class // counts and the second best splits's class counts, and returns true if at @@ -178,10 +176,8 @@ bool DecideNode(const GetFeatureFnType& get_dense, // isn't present in sparse_input_indices. sparse_input_indices is assumed // to be sorted. template -float FindSparseValue( - const T1& sparse_input_indices, - const T2& sparse_input_values, - int32 i, int32 j) { +float FindSparseValue(const T1& sparse_input_indices, + const T2& sparse_input_values, int32 i, int32 j) { int32 low = 0; int32 high = sparse_input_values.dimension(0); while (low < high) { @@ -273,7 +269,6 @@ int32 GetNumSparseFeatures(const T1& indices, int32 input_index, // categorical data, it is value != bias. bool Decide(float value, float bias, DataColumnTypes type = kDataFloat); - // Returns true if all the splits are initialized. Since they get initialized // in order, we can simply infer this from the last split. // This should only be called for a single allocator's candidate features diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc index 7485a695dfb..08553545502 100644 --- a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc @@ -44,11 +44,13 @@ TEST(TestWeightedVariance, Basic) { Tensor squares = test::AsTensor({29, 12}, {2}); EXPECT_FLOAT_EQ(WeightedVariance(sums.unaligned_flat(), - squares.unaligned_flat(), 3), 2.0); + squares.unaligned_flat(), 3), + 2.0); Tensor zero = test::AsTensor({0}, {1}); EXPECT_FLOAT_EQ(WeightedVariance(zero.unaligned_flat(), - zero.unaligned_flat(), 1), 0); + zero.unaligned_flat(), 1), + 0); } TEST(TestInitialize, Basic) { @@ -94,17 +96,16 @@ TEST(BestFeatureClassification, Basic) { const int32 num_accumulators = 4; const int32 num_splits = 3; const int32 num_classes = 4; - Tensor totals = test::AsTensor({1, 5, 6, 7, - 0, 0, 0, 0, - 30, 10, 10, 10, // this one - -1, -1, -1, -1}, - {num_accumulators, num_classes}); - Tensor splits = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 30, 10, 10, 10, 10, 0, 0, 10, 19, 5, 6, 8, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor totals = test::AsTensor( + {1, 5, 6, 7, 0, 0, 0, 0, 30, 10, 10, 10, // this one + -1, -1, -1, -1}, + {num_accumulators, num_classes}); + Tensor splits = + test::AsTensor({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 10, + 10, 10, 10, 0, 0, 10, 19, 5, 6, 8, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 1); } @@ -114,17 +115,16 @@ TEST(BestFeatureClassification, NoWinner) { const int32 num_splits = 3; const int32 num_classes = 4; // When counts are all the same, the most reasonable thing to do is pick 0. 
- Tensor totals = test::AsTensor({1, 5, 6, 7, - 0, 0, 0, 0, - 18, 6, 6, 6, // this one - -1, -1, -1, -1}, - {num_accumulators, num_classes}); - Tensor splits = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 9, 3, 3, 3, 9, 3, 3, 3, 9, 3, 3, 3, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor totals = + test::AsTensor({1, 5, 6, 7, 0, 0, 0, 0, 18, 6, 6, 6, // this one + -1, -1, -1, -1}, + {num_accumulators, num_classes}); + Tensor splits = + test::AsTensor({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 3, + 3, 3, 9, 3, 3, 3, 9, 3, 3, 3, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 0); } @@ -133,36 +133,34 @@ TEST(BestFeatureRegression, Basic) { const int32 num_accumulators = 4; const int32 num_splits = 3; const int32 num_classes = 4; - Tensor total_sums = test::AsTensor( - {1, 5, 6, 7, - 0, 0, 0, 0, - 10, 8, 6, 9, // this one - -1, -1, -1, -1}, - {num_accumulators, num_classes}); + Tensor total_sums = + test::AsTensor({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9, // this one + -1, -1, -1, -1}, + {num_accumulators, num_classes}); Tensor total_squares = test::AsTensor( - {1, 5, 6, 7, - 0, 0, 0, 0, - 100, 50, 40, 45, // this one + {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45, // this one -1, -1, -1, -1}, {num_accumulators, num_classes}); - Tensor split_sums = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 10, 8, 6, 9, 9, 8, 5, 9, 0, 0, 0, 0, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor split_sums = + test::AsTensor({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 8, + 6, 9, 9, 8, 5, 9, 0, 0, 0, 0, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); // lower the variance by lowering one of the squares just a little. - Tensor split_squares = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 100, 50, 40, 45, 100, 50, 40, 43, 0, 0, 0, 0, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor split_squares = + test::AsTensor( + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100, 50, 40, 45, 100, 50, 40, 43, 0, 0, 0, 0, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums, - split_squares, 2), 1); + split_squares, 2), + 1); } TEST(BestFeatureRegression, NoWinner) { @@ -170,37 +168,33 @@ TEST(BestFeatureRegression, NoWinner) { const int32 num_splits = 3; const int32 num_classes = 4; // when counts are all the same, the most reasonable thing to do is pick 0. 
- Tensor total_sums = test::AsTensor( - {1, 5, 6, 7, - 0, 0, 0, 0, - 10, 8, 6, 9, // this one - -1, -1, -1, -1}, - {num_accumulators, num_classes}); + Tensor total_sums = + test::AsTensor({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9, // this one + -1, -1, -1, -1}, + {num_accumulators, num_classes}); Tensor total_squares = test::AsTensor( - {1, 5, 6, 7, - 0, 0, 0, 0, - 100, 50, 40, 45, // this one + {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45, // this one -1, -1, -1, -1}, {num_accumulators, num_classes}); - Tensor split_sums = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 10, 8, 6, 9, 10, 8, 6, 9, 10, 8, 6, 9, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, - {num_accumulators, num_splits, num_classes}); + Tensor split_sums = + test::AsTensor({1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 8, + 6, 9, 10, 8, 6, 9, 10, 8, 6, 9, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {num_accumulators, num_splits, num_classes}); Tensor split_squares = test::AsTensor( - {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45, // this one - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45, // this one + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, {num_accumulators, num_splits, num_classes}); EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums, - split_squares, 2), 0); + split_squares, 2), + 0); } } // namespace tensorforest } // namespace tensorflow - diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc index 81e2a1b2a1b..f4a7058ddb8 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc @@ -14,8 +14,8 @@ // ============================================================================= #include "tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h" -#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" namespace tensorflow { @@ -58,8 +58,7 @@ CandidateGraphRunner::CandidateGraphRunner( // Features don't change, store them in a tensor. 
const auto& oblique = split.inequality_left_child_test().oblique(); const int32 feat_size = oblique.features_size(); - features_.reset( - new Tensor(tensorflow::DT_INT32, TensorShape({feat_size}))); + features_.reset(new Tensor(tensorflow::DT_INT32, TensorShape({feat_size}))); auto feat = features_->flat(); int i = 0; for (const auto& id : oblique.features()) { @@ -67,10 +66,10 @@ CandidateGraphRunner::CandidateGraphRunner( } } -void CandidateGraphRunner::RunOp( - const string& name, const TensorNameValueList& inputs, - const std::vector& output_tensor_names, - std::vector* outputs) { +void CandidateGraphRunner::RunOp(const string& name, + const TensorNameValueList& inputs, + const std::vector& output_tensor_names, + std::vector* outputs) { std::vector op_name; if (name != kNoOp) { op_name.push_back(name); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h index cced26b9036..328af28725a 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h @@ -26,7 +26,6 @@ namespace tensorflow { namespace tensorforest { - // Keep a tree ensemble in memory for efficient evaluation and mutation. class DecisionTreeResource : public ResourceBase { public: @@ -35,15 +34,12 @@ class DecisionTreeResource : public ResourceBase { string DebugString() override { return strings::StrCat("DecisionTree[size=", - decision_tree_->decision_tree().nodes_size(), - "]"); + decision_tree_->decision_tree().nodes_size(), "]"); } void MaybeInitialize(); - const decision_trees::Model& decision_tree() const { - return *decision_tree_; - } + const decision_trees::Model& decision_tree() const { return *decision_tree_; } decision_trees::Model* mutable_decision_tree() { return decision_tree_.get(); @@ -59,9 +55,7 @@ class DecisionTreeResource : public ResourceBase { // Resets the resource and frees the proto. // Caller needs to hold the mutex lock while calling this. - void Reset() { - decision_tree_.reset(new decision_trees::Model()); - } + void Reset() { decision_tree_.reset(new decision_trees::Model()); } mutex* get_mutex() { return &mu_; } @@ -84,7 +78,6 @@ class DecisionTreeResource : public ResourceBase { std::vector> node_evaluators_; }; - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h index 85ce7b825b1..bf2b2aaa3c8 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h @@ -22,7 +22,6 @@ namespace tensorflow { namespace tensorforest { - // Base class for evaluators of decision nodes that effectively copy proto // contents into C++ structures for faster execution. 
class DecisionNodeEvaluator { diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc index 5c49b87443e..af5cf72a3c0 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc @@ -20,11 +20,11 @@ namespace tensorflow { namespace { +using tensorflow::decision_trees::InequalityTest; +using tensorflow::decision_trees::MatchingValuesTest; using tensorflow::tensorforest::InequalityDecisionNodeEvaluator; using tensorflow::tensorforest::MatchingValuesDecisionNodeEvaluator; using tensorflow::tensorforest::ObliqueInequalityDecisionNodeEvaluator; -using tensorflow::decision_trees::InequalityTest; -using tensorflow::decision_trees::MatchingValuesTest; TEST(InequalityDecisionNodeEvaluatorTest, TestLessOrEqual) { InequalityTest test; @@ -124,4 +124,3 @@ TEST(ObliqueDecisionNodeEvaluatorTest, Basic) { } // namespace } // namespace tensorflow - diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h index 0d6712e9e55..eea0be27caf 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h @@ -40,9 +40,7 @@ class FertileStatsResource : public ResourceBase { model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_); } - string DebugString() override { - return "FertileStats"; - } + string DebugString() override { return "FertileStats"; } void ExtractFromProto(const FertileStats& stats); @@ -50,8 +48,7 @@ class FertileStatsResource : public ResourceBase { // Resets the resource and frees the proto. // Caller needs to hold the mutex lock while calling this. - void Reset() { - } + void Reset() {} // Reset the stats for a node, but leave the leaf_stats intact. void ResetSplitStats(int32 node_id, int32 depth) { @@ -84,7 +81,6 @@ class FertileStatsResource : public ResourceBase { // was found. 
bool BestSplit(int32 node_id, SplitCandidate* best, int32* depth); - private: mutex mu_; std::shared_ptr model_op_; @@ -94,7 +90,6 @@ class FertileStatsResource : public ResourceBase { void AllocateNode(int32 node_id, int32 depth); }; - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc index 3ce630e3a96..da600d34eac 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc @@ -20,7 +20,6 @@ #include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h" #include "tensorflow/core/lib/random/distribution_sampler.h" - namespace tensorflow { namespace tensorforest { @@ -454,14 +453,14 @@ void DenseClassificationGrowStats::PackToProto(FertileSlot* slot) const { class_stats->add_value()->set_float_value(total_counts_[i]); } - for (int split_num = 0; split_num < num_splits(); ++split_num) { + for (int split_num = 0; split_num < num_splits(); ++split_num) { auto* cand = slot->add_candidates(); *cand->mutable_split() = splits_[split_num]; auto* left_stats = cand->mutable_left_stats() ->mutable_classification() ->mutable_dense_counts(); for (int i = 0; i < num_outputs_; ++i) { - left_stats->add_value()->set_float_value(left_count(split_num, i)); + left_stats->add_value()->set_float_value(left_count(split_num, i)); } } } @@ -546,7 +545,7 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const { (*class_stats)[entry.first] = val; } - for (int split_num = 0; split_num < num_splits(); ++split_num) { + for (int split_num = 0; split_num < num_splits(); ++split_num) { auto* cand = slot->add_candidates(); *cand->mutable_split() = splits_[split_num]; auto* left_stats = cand->mutable_left_stats() @@ -561,8 +560,8 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const { } } -float SparseClassificationGrowStats::GiniScore( - int split, float* left_sum, float* right_sum) const { +float SparseClassificationGrowStats::GiniScore(int split, float* left_sum, + float* right_sum) const { float left_square = 0, right_square = 0; *left_sum = 0; *right_sum = 0; @@ -844,12 +843,11 @@ void LeastSquaresRegressionGrowStats::PackToProto(FertileSlot* slot) const { total_squares->add_value()->set_float_value(total_sum_squares_[i]); } - for (int split_num = 0; split_num < num_splits(); ++split_num) { + for (int split_num = 0; split_num < num_splits(); ++split_num) { auto* cand = slot->add_candidates(); *cand->mutable_split() = splits_[split_num]; - auto* sums = cand->mutable_left_stats() - ->mutable_regression() - ->mutable_mean_output(); + auto* sums = + cand->mutable_left_stats()->mutable_regression()->mutable_mean_output(); auto* squares = cand->mutable_left_stats() ->mutable_regression() ->mutable_mean_output_squares(); @@ -891,20 +889,17 @@ float LeastSquaresRegressionGrowStats::SplitVariance(int split) const { float total_variance = 0; for (int i = 0; i < params_.num_outputs(); ++i) { // Left side - const float le_x = - left_sum(split, i) / left_counts_[split]; + const float le_x = left_sum(split, i) / left_counts_[split]; - const float le_x2 = - left_square(split, i) / left_counts_[split]; + const float le_x2 = left_square(split, i) / left_counts_[split]; total_variance += le_x2 - le_x * le_x; // Right side const float re_x = (total_sum_[i] - left_sum(split, i)) / (weight_sum_ - left_counts_[split]); - const float re_x2 = - (total_sum_squares_[i] - left_square(split, i)) 
/ - (weight_sum_ - left_counts_[split]); + const float re_x2 = (total_sum_squares_[i] - left_square(split, i)) / + (weight_sum_ - left_counts_[split]); total_variance += re_x2 - re_x * re_x; } return total_variance; @@ -937,8 +932,7 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const { left->set_weight_sum(left_counts_[best_index]); auto* left_output_sum = left_reg_stats->mutable_mean_output(); for (int i = 0; i < num_outputs; ++i) { - left_output_sum->add_value()->set_float_value( - left_sum(best_index, i)); + left_output_sum->add_value()->set_float_value(left_sum(best_index, i)); } // Right @@ -947,8 +941,8 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const { right->set_weight_sum(weight_sum_ - left_counts_[best_index]); auto* right_output_sum = right_reg_stats->mutable_mean_output(); for (int i = 0; i < num_outputs; ++i) { - right_output_sum->add_value()->set_float_value( - total_sum_[i] - left_sum(best_index, i)); + right_output_sum->add_value()->set_float_value(total_sum_[i] - + left_sum(best_index, i)); } return true; } diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h index 02c0fc687ff..04e6b0a7353 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h @@ -73,21 +73,15 @@ class GrowStats { const InputTarget* target, int example) {} void RemoveSplit(int split_num); - int num_splits() const { - return splits_.size(); - } + int num_splits() const { return splits_.size(); } - float weight_sum() const { - return weight_sum_; - } + float weight_sum() const { return weight_sum_; } virtual bool IsInitialized() const { return weight_sum_ > 0 || splits_.size() == num_splits_to_consider_; } - int32 depth() const { - return depth_; - } + int32 depth() const { return depth_; } protected: GrowStats(const TensorForestParams& params, int32 depth); @@ -206,8 +200,8 @@ class ClassificationStats : public GrowStats { virtual float left_count(int split, int class_num) const = 0; virtual float right_count(int split, int class_num) const = 0; - virtual void ClassificationAddLeftExample( - int split, int64 int_label, float weight) = 0; + virtual void ClassificationAddLeftExample(int split, int64 int_label, + float weight) = 0; virtual void ClassificationAddRightExample(int split, int64 int_label, float weight) { // Does nothing by default, but sub-classes can override. 
@@ -375,9 +369,7 @@ class SparseClassificationGrowStats : public ClassificationStats { SparseClassificationGrowStats(const TensorForestParams& params, int32 depth) : ClassificationStats(params, depth) {} - void Initialize() override { - Clear(); - } + void Initialize() override { Clear(); } void ExtractFromProto(const FertileSlot& slot) override; void PackToProto(FertileSlot* slot) const override; @@ -562,9 +554,9 @@ class LeastSquaresRegressionGrowStats : public GrowStats { } void RemoveSplitStats(int split_num) override { left_sums_.erase(left_sums_.begin() + num_outputs_ * split_num, - left_sums_.begin() + num_outputs_ * (split_num + 1)); + left_sums_.begin() + num_outputs_ * (split_num + 1)); left_squares_.erase(left_squares_.begin() + num_outputs_ * split_num, - left_squares_.begin() + num_outputs_ * (split_num + 1)); + left_squares_.begin() + num_outputs_ * (split_num + 1)); left_counts_.erase(left_counts_.begin() + split_num, left_counts_.begin() + (split_num + 1)); } @@ -605,7 +597,6 @@ class LeastSquaresRegressionGrowStats : public GrowStats { std::vector left_counts_; }; - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc index ceb58d2ead5..26e989928e0 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc @@ -24,21 +24,21 @@ namespace tensorflow { namespace { -using tensorflow::tensorforest::GrowStats; -using tensorflow::tensorforest::TestableInputTarget; -using tensorflow::tensorforest::FertileSlot; +using tensorflow::decision_trees::BinaryNode; +using tensorflow::decision_trees::FeatureId; +using tensorflow::decision_trees::InequalityTest; using tensorflow::tensorforest::DenseClassificationGrowStats; -using tensorflow::tensorforest::SparseClassificationGrowStats; +using tensorflow::tensorforest::FertileSlot; using tensorflow::tensorforest::FixedSizeClassStats; using tensorflow::tensorforest::FixedSizeSparseClassificationGrowStats; +using tensorflow::tensorforest::GrowStats; using tensorflow::tensorforest::LeastSquaresRegressionGrowStats; -using tensorflow::tensorforest::TensorForestParams; +using tensorflow::tensorforest::SparseClassificationGrowStats; using tensorflow::tensorforest::SPLIT_FINISH_BASIC; using tensorflow::tensorforest::SPLIT_FINISH_DOMINATE_HOEFFDING; using tensorflow::tensorforest::SPLIT_PRUNE_HOEFFDING; -using tensorflow::decision_trees::BinaryNode; -using tensorflow::decision_trees::InequalityTest; -using tensorflow::decision_trees::FeatureId; +using tensorflow::tensorforest::TensorForestParams; +using tensorflow::tensorforest::TestableInputTarget; BinaryNode MakeSplit(const string& feat, float val) { BinaryNode split; @@ -52,8 +52,7 @@ BinaryNode MakeSplit(const string& feat, float val) { return split; } -void RunBatch(GrowStats* stats, - const TestableInputTarget* target) { +void RunBatch(GrowStats* stats, const TestableInputTarget* target) { std::unique_ptr dataset( new tensorflow::tensorforest::TestableDataSet( {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, 2)); @@ -102,18 +101,10 @@ class TestableRunningStats : public DenseClassificationGrowStats { TestableRunningStats(const TensorForestParams& params, int32 depth) : DenseClassificationGrowStats(params, depth) {} - float test_left_sum(int split) { - return get_left_gini()->sum(split); - } - float test_left_square(int split) { - return get_left_gini()->square(split); - } - float 
test_right_sum(int split) { - return get_right_gini()->sum(split); - } - float test_right_square(int split) { - return get_right_gini()->square(split); - } + float test_left_sum(int split) { return get_left_gini()->sum(split); } + float test_left_square(int split) { return get_left_gini()->square(split); } + float test_right_sum(int split) { return get_right_gini()->sum(split); } + float test_right_square(int split) { return get_right_gini()->square(split); } }; TEST(GrowStatsDenseClassificationTest, BasicRunningStats) { @@ -166,9 +157,7 @@ class TestableFinishEarly : public DenseClassificationGrowStats { int num_times_called_; protected: - void CheckFinishEarlyHoeffding() override { - ++num_times_called_; - } + void CheckFinishEarlyHoeffding() override { ++num_times_called_; } }; TEST(GrowStatsDenseClassificationTest, TestFinishEarly) { @@ -212,7 +201,6 @@ TEST(GrowStatsDenseClassificationTest, TestFinishEarly) { ASSERT_EQ(stat->num_times_called_, 9); } - TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) { TensorForestParams params; params.set_num_outputs(2); @@ -224,7 +212,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) { finish->set_type(SPLIT_FINISH_BASIC); finish->mutable_check_every_steps()->set_constant_value(100); params.mutable_pruning_type()->set_type(SPLIT_PRUNE_HOEFFDING); - params.mutable_pruning_type()->mutable_prune_every_samples() + params.mutable_pruning_type() + ->mutable_prune_every_samples() ->set_constant_value(1); // On each iteration, we add two examples, one of class 0 and one @@ -234,8 +223,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) { std::vector weights = {1, 1}; TestableInputTarget target(labels, weights, 1); std::unique_ptr dataset( - new tensorflow::tensorforest::TestableDataSet( - {-1.0, -1.0, 1.0, -1.0}, 2)); + new tensorflow::tensorforest::TestableDataSet({-1.0, -1.0, 1.0, -1.0}, + 2)); DenseClassificationGrowStats stats(params, 1); stats.Initialize(); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc index bf0fb924504..d43884481af 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc @@ -109,10 +109,10 @@ void TensorDataSet::set_input_tensors(const Tensor& dense, dense_data_.reset(new DenseStorageType(dense.tensor())); } if (sparse_indices.shape().dims() == 2) { - sparse_indices_.reset(new SparseIndicesStorageType( - sparse_indices.tensor())); - sparse_values_.reset(new SparseValuesStorageType( - sparse_values.tensor())); + sparse_indices_.reset( + new SparseIndicesStorageType(sparse_indices.tensor())); + sparse_values_.reset( + new SparseValuesStorageType(sparse_values.tensor())); sparse_batch_size_ = sparse_shape.tensor()(0); } original_dense_tensor_ = dense; diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h index eafad6b5916..c544a8c75e9 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h @@ -93,9 +93,7 @@ class TensorDataSet { // an int32 you can avoid the atoi32. 
virtual float GetExampleValue(int example, int32 feature_id) const; - int num_features() { - return available_features_.size(); - } + int num_features() { return available_features_.size(); } const Tensor& original_tensor() const { return original_dense_tensor_; } diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h index 44ec09c50ef..d4402b6055a 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h @@ -79,9 +79,7 @@ class TensorInputTarget : public StoredInputTarget { return (*target_)(example_index * num_targets_ + target_index); } - const Tensor& original_tensor() const { - return original_tensor_; - } + const Tensor& original_tensor() const { return original_tensor_; } protected: Tensor original_tensor_; diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc index d43c068e462..83614a25314 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc @@ -160,6 +160,5 @@ void RegressionLeafModelOperator::ExportModel( } } - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc index ffd92c01f9a..ab4191809b6 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc @@ -26,19 +26,19 @@ namespace { using tensorflow::decision_trees::Leaf; using tensorflow::tensorforest::DenseClassificationLeafModelOperator; using tensorflow::tensorforest::LeafModelOperator; -using tensorflow::tensorforest::SparseClassificationLeafModelOperator; -using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator; using tensorflow::tensorforest::LeafStat; using tensorflow::tensorforest::RegressionLeafModelOperator; -using tensorflow::tensorforest::TestableInputTarget; +using tensorflow::tensorforest::SparseClassificationLeafModelOperator; +using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator; using tensorflow::tensorforest::TensorForestParams; +using tensorflow::tensorforest::TestableInputTarget; const int32 kNumClasses = 3; constexpr char kRegressionStatProto[] = - "weight_sum: 3 " - "regression { " - "mean_output { " + "weight_sum: 3 " + "regression { " + "mean_output { " "value { " " float_value: 27 " "} " @@ -48,8 +48,8 @@ constexpr char kRegressionStatProto[] = "value { " " float_value: 10 " "} " - "} " - "mean_output_squares { " + "} " + "mean_output_squares { " "value {" " float_value: 245" "}" @@ -59,8 +59,8 @@ constexpr char kRegressionStatProto[] = "value {" " float_value: 46" "}" - "}" -"}"; + "}" + "}"; void TestClassificationNormalUse(const std::unique_ptr& op) { Leaf l; @@ -83,7 +83,6 @@ void TestClassificationNormalUse(const std::unique_ptr& op) { EXPECT_FLOAT_EQ(op->GetOutputValue(l, 1), 3.4); } - TEST(DenseLeafModelOperatorsTest, NormalUse) { TensorForestParams params; params.set_num_outputs(kNumClasses); @@ -182,7 +181,7 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) { std::unique_ptr leaf(new Leaf); - op->ExportModel( *stat, leaf.get()); + op->ExportModel(*stat, leaf.get()); // Make sure it was initialized correctly. 
EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 1.1); @@ -194,7 +193,6 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) { EXPECT_EQ(leaf->sparse_vector().sparse_value().size(), kNumClasses); } - TEST(RegressionLeafModelOperatorsTest, NormalUse) { TensorForestParams params; params.set_num_outputs(kNumClasses); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params.h b/tensorflow/contrib/tensor_forest/kernels/v4/params.h index b0ed9494247..7583e3d0402 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/params.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/params.h @@ -24,7 +24,6 @@ namespace tensorforest { // Return the value of the given depth-dependent parameter given a leaf's depth. float ResolveParam(const DepthDependentParam& param, int32 depth); - } // namespace tensorforest } // namespace tensorflow diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc index 801881af136..4010a71006d 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc @@ -71,5 +71,3 @@ TEST(ParamsTest, TestThreshold) { } } // namespace - - diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc index cdb1d80a4bb..b7b60d0ab8c 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc @@ -52,8 +52,8 @@ std::unique_ptr SplitCollectionOperator::CreateGrowStats( new SparseClassificationGrowStats(params_, depth)); case STATS_LEAST_SQUARES_REGRESSION: - return std::unique_ptr(new LeastSquaresRegressionGrowStats( - params_, depth)); + return std::unique_ptr( + new LeastSquaresRegressionGrowStats(params_, depth)); case STATS_FIXED_SIZE_SPARSE_GINI: return std::unique_ptr( @@ -136,8 +136,7 @@ void SplitCollectionOperator::CreateAndInitializeCandidateWithExample( stats_.at(node_id)->AddSplit(split, input_data, target, example); } -bool SplitCollectionOperator::BestSplit(int32 node_id, - SplitCandidate* best, +bool SplitCollectionOperator::BestSplit(int32 node_id, SplitCandidate* best, int32* depth) const { auto* slot = stats_.at(node_id).get(); *depth = slot->depth(); diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h index ad52f89fadd..c606ff98c67 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h @@ -71,9 +71,7 @@ class SplitCollectionOperator { } // Perform any necessary cleanup for any tracked state for the slot. - virtual void ClearSlot(int32 node_id) { - stats_.erase(node_id); - } + virtual void ClearSlot(int32 node_id) { stats_.erase(node_id); } // Return true if slot is fully initialized. 
virtual bool IsInitialized(int32 node_id) const; diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc index 0bec198e97e..c749fbe69e1 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc @@ -32,9 +32,9 @@ namespace tensorforest { // smoothed_sum = stats.sum() + #_classes float GiniImpurity(const LeafStat& stats, int32 num_classes) { const float smoothed_sum = num_classes + stats.weight_sum(); - return 1.0 - ( - (stats.classification().gini().square() - + 2 * stats.weight_sum() + num_classes) / (smoothed_sum * smoothed_sum)); + return 1.0 - ((stats.classification().gini().square() + + 2 * stats.weight_sum() + num_classes) / + (smoothed_sum * smoothed_sum)); } float WeightedGiniImpurity(const LeafStat& stats, int32 num_classes) { @@ -46,21 +46,20 @@ void UpdateGini(LeafStat* stats, float old_val, float weight) { // Equivalent to stats->square() - old_val * old_val + new_val * new_val, // (for new_val = old_val + weight), but more numerically stable. stats->mutable_classification()->mutable_gini()->set_square( - stats->classification().gini().square() - + weight * weight + 2 * old_val * weight); + stats->classification().gini().square() + weight * weight + + 2 * old_val * weight); } - float Variance(const LeafStat& stats, int output) { if (stats.weight_sum() == 0) { return 0; } const float e_x = - stats.regression().mean_output().value(output).float_value() - / stats.weight_sum(); + stats.regression().mean_output().value(output).float_value() / + stats.weight_sum(); const auto e_x2 = - stats.regression().mean_output_squares().value(output).float_value() - / stats.weight_sum(); + stats.regression().mean_output_squares().value(output).float_value() / + stats.weight_sum(); return e_x2 - e_x * e_x; } @@ -75,8 +74,7 @@ float TotalVariance(const LeafStat& stats) { float SmoothedGini(float sum, float square, int num_classes) { // See comments for GiniImpurity above. 
const float smoothed_sum = num_classes + sum; - return 1.0 - - (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum); + return 1.0 - (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum); } float WeightedSmoothedGini(float sum, float square, int num_classes) { diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h index 289c81e9d51..38deb3e3cd8 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h +++ b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h @@ -27,9 +27,7 @@ class TestableInputTarget : public StoredInputTarget> { : StoredInputTarget(new std::vector(t), new std::vector(w), num_t) {} - int NumItems() const { - return target_->size(); - } + int NumItems() const { return target_->size(); } int32 GetTargetAsClassIndex(int example_index, int target_index) const override { @@ -51,7 +49,6 @@ class TestableInputTarget : public StoredInputTarget> { } }; - class TestableDataSet : public TensorDataSet { public: TestableDataSet(const std::vector& data, int num_features) diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD index 6ff5a9e2b18..4175d8adb58 100644 --- a/tensorflow/contrib/tensorboard/db/BUILD +++ b/tensorflow/contrib/tensorboard/db/BUILD @@ -40,7 +40,6 @@ cc_library( hdrs = ["summary_db_writer.h"], copts = tf_copts(), deps = [ - ":schema", ":summary_converter", "//tensorflow/core:framework", "//tensorflow/core:lib", diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py index c7193cef691..c834430b95d 100644 --- a/tensorflow/contrib/timeseries/examples/lstm.py +++ b/tensorflow/contrib/timeseries/examples/lstm.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools from os import path import numpy @@ -80,18 +81,19 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel): input_statistics: A math_utils.InputStatistics object. """ super(_LSTMModel, self).initialize_graph(input_statistics=input_statistics) - self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units) - # Create templates so we don't have to worry about variable reuse. - self._lstm_cell_run = tf.make_template( - name_="lstm_cell", - func_=self._lstm_cell, - create_scope_now_=True) - # Transforms LSTM output into mean predictions. - self._predict_from_lstm_output = tf.make_template( - name_="predict_from_lstm_output", - func_= - lambda inputs: tf.layers.dense(inputs=inputs, units=self.num_features), - create_scope_now_=True) + with tf.variable_scope("", use_resource=True): + # Use ResourceVariables to avoid race conditions. + self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units) + # Create templates so we don't have to worry about variable reuse. + self._lstm_cell_run = tf.make_template( + name_="lstm_cell", + func_=self._lstm_cell, + create_scope_now_=True) + # Transforms LSTM output into mean predictions. 
+ self._predict_from_lstm_output = tf.make_template( + name_="predict_from_lstm_output", + func_=functools.partial(tf.layers.dense, units=self.num_features), + create_scope_now_=True) def get_start_state(self): """Return initial state for the time series model.""" diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 0199313bc8d..a7d54d8a0cc 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -43,6 +43,7 @@ py_library( deps = [ ":tpu_lib", ":tpu_py", + "//tensorflow/contrib/summary:summary_ops", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc index 7373d0e17ce..b1ef9fde37f 100644 --- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/contrib/tpu/profiler/version.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/util/command_line_flags.h" @@ -47,6 +48,19 @@ string GetCurrentTimeStampAsString() { return s; } +Status ValidateHostPortPair(const string& host_port) { + uint32 port; + std::vector parts = str_util::Split(host_port, ':'); + // Must be host:port, port must be a number, host must not contain a '/', + // host also must not be empty. + if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) || + parts[0].find("/") != string::npos || parts[0].empty()) { + return errors::InvalidArgument("Could not interpret \"", host_port, + "\" as a host-port pair."); + } + return Status::OK(); +} + ProfileResponse Profile(const string& service_addr, int duration_ms, const ProfileOptions& opts) { ProfileRequest request; @@ -60,11 +74,14 @@ ProfileResponse Profile(const string& service_addr, int duration_ms, ::grpc::ClientContext context; ::grpc::ChannelArguments channel_args; // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available. + // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their + // `ValidateHostPortPair` checks for empty host string case. channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits::max()); std::unique_ptr stub = TPUProfiler::NewStub(::grpc::CreateCustomChannel( - service_addr, ::grpc::InsecureChannelCredentials(), channel_args)); + "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(), + channel_args)); ProfileResponse response; TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response))); return response; @@ -78,14 +95,19 @@ int main(int argc, char** argv) { tensorflow::string FLAGS_service_addr; tensorflow::string FLAGS_logdir; int FLAGS_duration_ms = 2000; + int FLAGS_num_tracing_attempts = 3; bool FLAGS_include_dataset_ops = true; std::vector flag_list = { tensorflow::Flag("service_addr", &FLAGS_service_addr, "Address of TPU profiler service e.g. localhost:8466"), tensorflow::Flag("logdir", &FLAGS_logdir, - "Path of TensorBoard log directory e.g. /tmp/tb_log"), + "Path of TensorBoard log directory e.g. /tmp/tb_log, " + "gs://tb_bucket"), tensorflow::Flag("duration_ms", &FLAGS_duration_ms, "Duration of tracing in ms. 
Default is 2000ms."), + tensorflow::Flag("num_tracing_attempts", &FLAGS_num_tracing_attempts, + "Automatically retry N times when no trace event " + "is collected. Default is 3."), tensorflow::Flag("include_dataset_ops", &FLAGS_include_dataset_ops, "Set to false to profile longer TPU device traces."), }; @@ -96,16 +118,46 @@ int main(int argc, char** argv) { tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list); bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); if (!parse_ok || FLAGS_service_addr.empty() || FLAGS_logdir.empty()) { - std::printf("%s", usage.c_str()); + std::cout << usage.c_str() << std::endl; + return 2; + } + tensorflow::Status status = + tensorflow::tpu::ValidateHostPortPair(FLAGS_service_addr); + if (!status.ok()) { + std::cout << status.error_message() << std::endl; + std::cout << usage.c_str() << std::endl; return 2; } tensorflow::port::InitMain(argv[0], &argc, &argv); - int duration_ms = FLAGS_duration_ms; + // Sets the minimum duration_ms and tracing attempts to one. + int duration_ms = std::max(FLAGS_duration_ms, 1); + int remaining_attempts = std::max(FLAGS_num_tracing_attempts, 1); tensorflow::ProfileOptions opts; opts.set_include_dataset_ops(FLAGS_include_dataset_ops); - tensorflow::ProfileResponse response = - tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts); + tensorflow::ProfileResponse response; + + while (true) { + std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. " + << "Remaining attempt(s): " << remaining_attempts-- << std::endl; + response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts); + if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break; + std::cout << "No trace event is collected. Automatically retrying." + << std::endl + << std::endl; + } + + if (response.encoded_trace().empty()) { + std::cout << "No trace event is collected after " + << FLAGS_num_tracing_attempts << " attempt(s). " + << "Perhaps, you want to try again (with more attempts?)." + << std::endl + << "Tip: increase number of attempts with --num_tracing_attempts." + << std::endl; + // Don't dump profile data if no trace is collected. + return 0; + } + // Use the current timestamp as the run name. tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString(); TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile( diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc index b842951eb2c..ebd6185faad 100644 --- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc @@ -151,10 +151,7 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run, TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir)); // Ignore computation_graph for now. - const bool empty_trace = response.encoded_trace().empty(); - if (empty_trace) { - *os << "No trace event is collected." 
<< std::endl; - } else { + if (!response.encoded_trace().empty()) { LOG(INFO) << "Converting trace events to TraceViewer JSON."; TF_RETURN_IF_ERROR( DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os)); @@ -165,11 +162,9 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run, TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir, response.op_profile(), os)); } - if (!empty_trace && !response.tool_data().empty()) { - for (const auto& tool_data : response.tool_data()) { - TF_RETURN_IF_ERROR( - DumpToolDataToLogDirectory(profile_run_dir, tool_data, os)); - } + for (const auto& tool_data : response.tool_data()) { + TF_RETURN_IF_ERROR( + DumpToolDataToLogDirectory(profile_run_dir, tool_data, os)); } return Status::OK(); diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h index 25b958bcfea..29ef977bacf 100644 --- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h +++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h @@ -27,7 +27,10 @@ namespace tpu { // The following tools are supported: // - Trace viewer // - Op profile -// - HLO computation graph +// - Input pipeline analyzer +// - Overview page +// Note: this function creates a directory even when all fields in +// ProfileResponse are unset/empty. Status WriteTensorboardTPUProfile(const string& logdir, const string& run, const ProfileResponse& response, std::ostream* os); diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py index 846db133299..78d237e6a20 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl import flags import os import subprocess @@ -24,13 +25,21 @@ import sys import tensorflow as tf -tf.flags.DEFINE_string('service_addr', '', - 'Address of TPU profiler service e.g. localhost:8466') -tf.flags.DEFINE_string('logdir', '', - 'Path of TensorBoard log directory e.g. /tmp/tb_log') -tf.flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.') +flags.DEFINE_string( + 'service_addr', None, 'Address of TPU profiler service e.g. ' + 'localhost:8466') +flags.DEFINE_string( + 'logdir', None, 'Path of TensorBoard log directory e.g. 
/tmp/tb_log, ' + 'gs://tb_bucket') +flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.') +flags.DEFINE_integer( + 'num_tracing_attempts', 3, 'Automatically retry N times when no trace ' + 'event is collected.') +flags.DEFINE_boolean( + 'include_dataset_ops', True, 'Set to false to profile longer TPU ' + 'device traces.') -FLAGS = tf.flags.FLAGS +FLAGS = flags.FLAGS EXECUTABLE = 'data/capture_tpu_profile' @@ -47,6 +56,8 @@ def main(unused_argv=None): cmd.append('--logdir='+logdir) cmd.append('--service_addr='+FLAGS.service_addr) cmd.append('--duration_ms='+str(FLAGS.duration_ms)) + cmd.append('--num_tracing_attempts='+str(FLAGS.num_tracing_attempts)) + cmd.append('--include_dataset_ops='+str(FLAGS.include_dataset_ops).lower()) subprocess.call(cmd) diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py index 92196638318..3dffebe6680 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py @@ -20,7 +20,7 @@ from __future__ import print_function from setuptools import setup -_VERSION = '1.4.3-a2' +_VERSION = '1.5.0-rc1' CONSOLE_SCRIPTS = [ 'capture_tpu_profile=cloud_tpu_profiler.main:run_main', diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h index 0f645a54929..dc6a9348911 100644 --- a/tensorflow/contrib/tpu/profiler/version.h +++ b/tensorflow/contrib/tpu/profiler/version.h @@ -16,6 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ -#define TPU_PROFILER_VERSION "1.4.3" +#define TPU_PROFILER_VERSION "1.5.0" #endif // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index 8fec379aad8..d5f54ff4fd2 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -153,10 +153,11 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): raise NotImplementedError( "Non-resource Variables are not supported inside TPU computations " "(operator name: %s)" % op.name) - # pylint: enable=protected-access if _TPU_REPLICATE_ATTR in op.node_def.attr: raise ValueError("TPU computations cannot be nested") - op.node_def.attr[_TPU_REPLICATE_ATTR].s = compat.as_bytes(self._name) + op._set_attr(_TPU_REPLICATE_ATTR, + attr_value_pb2.AttrValue(s=compat.as_bytes(self._name))) + # pylint: enable=protected-access op.graph.prevent_feeding(op) op.graph.prevent_fetching(op) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py index 0c2580211ab..188db6e2f0d 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py @@ -53,7 +53,8 @@ class TPUConfig( num_shards: The number of TPU shards in the system. per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host rather than Per-Core. With Per-Host input pipeline deployment, `input_fn` - is invoked once on each host. To be precise, with a global batch size + is invoked once on each host. With Per-Core input pipeline deployment, it + is invoked once for each core. To be precise, with a global batch size `train_batch_size` in `TPUEstimator` constructor, the batch size for each shard is `train_batch_size` // #hosts. With Per-Core input pipeline deployment, the shard batch size is `train_batch_size` // #cores. 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 2ae3a26a853..56793f11d9c 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -21,6 +21,7 @@ from __future__ import print_function import collections from contextlib import contextmanager import copy +import signal import threading import time import traceback @@ -29,6 +30,7 @@ import six from six.moves import queue as Queue # pylint: disable=redefined-builtin from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.contrib.summary import summary_ops as contrib_summary from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import tpu from tensorflow.contrib.tpu.python.tpu import tpu_config @@ -39,11 +41,13 @@ from tensorflow.contrib.tpu.python.tpu import util as util_lib from tensorflow.core.framework.summary_pb2 import Summary from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.estimator import estimator as estimator_lib from tensorflow.python.estimator import model_fn as model_fn_lib from tensorflow.python.estimator import util from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -59,6 +63,7 @@ from tensorflow.python.training import evaluation from tensorflow.python.training import session_run_hook from tensorflow.python.training import training from tensorflow.python.training import training_util +from tensorflow.python.util import tf_inspect _INITIAL_LOSS = 1e7 _ZERO_LOSS = 0. @@ -68,7 +73,12 @@ _BATCH_SIZE_KEY = 'batch_size' _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum' _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY] -# TODO(b/65703635): Flip the value and remove all dead code. + +# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is +# only used for per-core based deployments. For per-host based pipelines, if a +# user returns a Dataset instance it will be automatically wrapped in a +# tf.while_loop (This can be disabled by returning features and labels +# explicitly). _WRAP_INPUT_FN_INTO_WHILE_LOOP = False @@ -162,10 +172,12 @@ class _TPUContext(object): ``` """ - def __init__(self, config, train_batch_size, eval_batch_size, use_tpu): + def __init__(self, config, train_batch_size, eval_batch_size, + predict_batch_size, use_tpu): self._config = config self._train_batch_size = train_batch_size self._eval_batch_size = eval_batch_size + self._predict_batch_size = predict_batch_size self._use_tpu = use_tpu self._num_shards_or_none = self._config.tpu_config.num_shards self._mode = None @@ -210,39 +222,66 @@ class _TPUContext(object): return (self._mode == model_fn_lib.ModeKeys.TRAIN and not self._config.tpu_config.per_host_input_for_training) - def is_running_on_cpu(self): - """Determines whether the input_fn and model_fn should be invoked on CPU.""" + def is_running_on_cpu(self, is_export_mode=False): + """Determines whether the input_fn and model_fn should be invoked on CPU. + + Args: + is_export_mode: Indicates whether the current mode is for exporting the + model, when mode == PREDICT. 
Only with this bool, we could + tell whether user is calling the Estimator.predict or + Estimator.export_savedmodel, which are running on TPU and CPU + respectively. Parent class Estimator does not distingush these two. + + Returns: + bool, whether current input_fn or model_fn should be running on CPU. + + Raises: + ValueError: any configuration is invalid. + """ mode = self._assert_mode() - return ((not self._use_tpu) or mode == model_fn_lib.ModeKeys.PREDICT or - (mode == model_fn_lib.ModeKeys.EVAL and - self._eval_batch_size is None)) + + if not self._use_tpu: + return True + + if mode != model_fn_lib.ModeKeys.PREDICT: + return False + + # There are actually 2 use cases when running with mode.PREDICT: prediction + # and saving the model. We run actual predictions on the TPU, but + # model export is run on the CPU. + if is_export_mode: + return True + + if self._predict_batch_size is None: + raise ValueError( + 'predict_batch_size in TPUEstimator constructor should not be ' + '`None` if .predict is running on TPU.') + if self.num_hosts > 1: + raise ValueError( + 'TPUEstimator.predict should be running on single host.') + + return False @property def global_batch_size(self): mode = self._assert_mode() - if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None: - raise RuntimeError('Internal error, EVAL on TPU is not enabled, but ' - '`global_batch_size` is called.') - return (self._train_batch_size - if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size) + if mode == model_fn_lib.ModeKeys.TRAIN: + return self._train_batch_size + elif mode == model_fn_lib.ModeKeys.EVAL: + return self._eval_batch_size + elif mode == model_fn_lib.ModeKeys.PREDICT: + return self._predict_batch_size + else: + return None @property def batch_size_for_input_fn(self): """Returns the shard batch size for `input_fn`.""" - mode = self._assert_mode() - # Special case for eval. - if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None: - return None - if self.is_running_on_cpu(): - if mode == model_fn_lib.ModeKeys.TRAIN: - return self._train_batch_size - if mode == model_fn_lib.ModeKeys.EVAL: - return self._eval_batch_size - return None + global_batch_size = self.global_batch_size + + if self.is_running_on_cpu(): + return global_batch_size - global_batch_size = ( - self._train_batch_size - if mode == model_fn_lib.ModeKeys.TRAIN else self._eval_batch_size) # On TPU if self.is_input_sharded_per_core(): return global_batch_size // self.num_cores @@ -252,22 +291,13 @@ class _TPUContext(object): @property def batch_size_for_model_fn(self): """Returns the shard batch size for `model_fn`.""" - mode = self._assert_mode() - # Special case for eval. - if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None: - return None + global_batch_size = self.global_batch_size + if self.is_running_on_cpu(): - if mode == model_fn_lib.ModeKeys.TRAIN: - return self._train_batch_size - if mode == model_fn_lib.ModeKeys.EVAL: - return self._eval_batch_size - return None + return global_batch_size # On TPU. always sharded per core. - if mode == model_fn_lib.ModeKeys.TRAIN: - return self._train_batch_size // self.num_cores - else: - return self._eval_batch_size // self.num_cores + return global_batch_size // self.num_cores @property def master_job(self): @@ -384,7 +414,8 @@ class TPUEstimatorSpec( 'train_op', 'eval_metrics', 'export_outputs', - 'scaffold_fn' + 'scaffold_fn', + 'host_call' ])): """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`. 
@@ -410,6 +441,15 @@ class TPUEstimatorSpec( `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This function should not capture any Tensors in `model_fn`. + + `host_call` is a tuple of a `function` and a list or dictionary of `tensors` + to pass to that function and returns a list of Tensors. `host_call` currently + works for train() and evaluate(). The Tensors returned by the function is + executed on the CPU on every step, so there is communication overhead when + sending tensors from TPU to CPU. To reduce the overhead, try reducing the + size of the tensors. The `tensors` are concatenated along their major (batch) + dimension, and so must be >= rank 1. The `host_call` is useful for writing + summaries with @{tf.contrib.summary.create_file_writer}. """ def __new__(cls, @@ -419,10 +459,15 @@ class TPUEstimatorSpec( train_op=None, eval_metrics=None, export_outputs=None, - scaffold_fn=None): + scaffold_fn=None, + host_call=None): """Creates a validated `TPUEstimatorSpec` instance.""" + host_calls = {} if eval_metrics is not None: - _EvalMetrics.validate(eval_metrics) + host_calls['eval_metrics'] = eval_metrics + if host_call is not None: + host_calls['host_call'] = host_call + _OutfeedHostCall.validate(host_calls) return super(TPUEstimatorSpec, cls).__new__( cls, mode=mode, @@ -431,12 +476,23 @@ class TPUEstimatorSpec( train_op=train_op, eval_metrics=eval_metrics, export_outputs=export_outputs, - scaffold_fn=scaffold_fn) + scaffold_fn=scaffold_fn, + host_call=host_call) def as_estimator_spec(self): """Creates an equivalent `EstimatorSpec` used by CPU train/eval.""" - eval_metric_ops = _EvalMetrics.to_metric_metric_ops_for_cpu( - self.eval_metrics) + host_calls = {} + if self.eval_metrics is not None: + host_calls['eval_metrics'] = self.eval_metrics + if self.host_call is not None: + host_calls['host_call'] = wrap_hostcall_with_global_step(self.host_call) + host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls) + eval_metric_ops = None + if self.eval_metrics is not None: + eval_metric_ops = host_call_ret['eval_metrics'] + hooks = None + if self.host_call is not None: + hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])] scaffold = self.scaffold_fn() if self.scaffold_fn else None return model_fn_lib.EstimatorSpec( mode=self.mode, @@ -445,7 +501,10 @@ class TPUEstimatorSpec( train_op=self.train_op, eval_metric_ops=eval_metric_ops, export_outputs=self.export_outputs, - scaffold=scaffold) + scaffold=scaffold, + training_hooks=hooks, + evaluation_hooks=hooks, + prediction_hooks=hooks) class _OpQueueContext(object): @@ -467,12 +526,12 @@ class _OpQueueContext(object): def read_iteration_counts(self): while True: - signal = self._queue.get(block=True) - logging.debug('%s read signal %s', self._name, signal) - if signal == _SIGNAL.STOP: - logging.info('%s received signal, stopping.', self._name) + iterations = self._queue.get(block=True) + logging.debug('%s read iterations %s', self._name, iterations) + if iterations == _SIGNAL.STOP: + logging.info('%s received shutdown signal, stopping.', self._name) return - yield signal + yield iterations def join(self): logging.info('Shutting down %s thread.' % self._name) @@ -480,6 +539,22 @@ class _OpQueueContext(object): self._thread.join() +class _OpSignalOnceQueueContext(_OpQueueContext): + """Manages work queue and thread for a infeed/outfeed thread. + + This subclass only signals once. 
+ """ + + def __init__(self, name, target, args): + super(_OpSignalOnceQueueContext, self).__init__(name, target, args) + self._has_signaled = False + + def send_next_batch_signal(self, iterations): + if not self._has_signaled: + self._queue.put(iterations) + self._has_signaled = True + + class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): """A Session hook setting up the TPU initialization, infeed, and outfeed. @@ -489,12 +564,19 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): dequeue. """ - def __init__(self, ctx, enqueue_ops, dequeue_ops=None): + def __init__(self, + ctx, + enqueue_ops, + dequeue_ops, + run_infeed_loop_on_coordinator=True): self._master_job = ctx.master_job self._enqueue_ops = enqueue_ops self._dequeue_ops = dequeue_ops + + self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator self._initial_infeed_sleep_secs = ( ctx.config.tpu_config.initial_infeed_sleep_secs) + self._session_cancel_timer = None self._feed_error = None @@ -503,8 +585,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): def begin(self): logging.info('TPU job name %s', self._master_job) self._iterations_per_loop_var = _create_or_get_iterations_per_loop() - self._init_op = [tpu.initialize_system(job=self._master_job)] - self._finalize_op = [tpu.shutdown_system(job=self._master_job)] + self._init_ops = [tpu.initialize_system(job=self._master_job)] + self._finalize_ops = [tpu.shutdown_system(job=self._master_job)] + + summary_writer_init_ops = contrib_summary.summary_writer_initializer_op() + self._init_ops.extend(summary_writer_init_ops) + # Get all the writer resources from the initializer, so we know what to + # flush. + for op in summary_writer_init_ops: + self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0])) def _log_error(self, session, error): """Log an infeed or outfeed error. @@ -516,8 +605,9 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): emitting a stack trace for the infeed. Args: - session: `tf.Session`, session to be terminated - error: exception that triggered logging. + session: `tf.Session`, session to be terminated error: exception that + triggered logging. + error: the Exception to log. 
""" logging.warning( '\n\n' @@ -569,15 +659,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): logging.info('%s thread starting after sleep', self._name) try: - if _WRAP_INPUT_FN_INTO_WHILE_LOOP: - for _ in queue_ctx.read_iteration_counts(): - session.run(self._enqueue_ops) - else: + if self._run_infeed_loop_on_coordinator: for count, steps in enumerate(queue_ctx.read_iteration_counts()): for i in xrange(steps): logging.debug('Infeed enqueue for iteration (%d, %d)', count, i) session.run(self._enqueue_ops) - logging.debug('Infeed thread finished, shutting down.') + else: + for _ in queue_ctx.read_iteration_counts(): + session.run(self._enqueue_ops) + logging.info('Infeed thread finished, shutting down.') except Exception as e: # pylint: disable=broad-except self._log_error(session, e) @@ -588,23 +678,25 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): for i in xrange(steps): logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i) session.run(self._dequeue_ops) + logging.info('Outfeed thread finished, shutting down.') except Exception as e: # pylint: disable=broad-except self._log_error(session, e) + def _create_infeed_controller(self, name, target, args): + return _OpQueueContext(name=name, target=target, args=args) + def after_create_session(self, session, coord): logging.info('Init TPU system') - session.run( - self._init_op, - options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000)) + session.run(self._init_ops, + options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000)) logging.info('Start infeed thread controller') - self._infeed_controller = _OpQueueContext( + self._infeed_controller = self._create_infeed_controller( name='InfeedController', target=self._run_infeed, args=(session,)) - if self._dequeue_ops is not None: - logging.info('Start outfeed thread controller') - self._outfeed_controller = _OpQueueContext( - name='OutfeedController', target=self._run_outfeed, args=(session,)) + logging.info('Start outfeed thread controller') + self._outfeed_controller = _OpQueueContext( + name='OutfeedController', target=self._run_outfeed, args=(session,)) def before_run(self, run_context): if self._feed_error: @@ -617,11 +709,9 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations) self._infeed_controller.send_next_batch_signal(iterations) - if self._dequeue_ops is not None: - # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop. 
- logging.info('Dequeue next (%d) batch(es) of data from outfeed.', - iterations) - self._outfeed_controller.send_next_batch_signal(iterations) + logging.info('Dequeue next (%d) batch(es) of data from outfeed.', + iterations) + self._outfeed_controller.send_next_batch_signal(iterations) def end(self, session): if self._session_cancel_timer: @@ -632,12 +722,21 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): logging.info('Stop infeed thread controller') self._infeed_controller.join() - if self._dequeue_ops is not None: - logging.info('Stop output thread controller') - self._outfeed_controller.join() + logging.info('Stop output thread controller') + self._outfeed_controller.join() logging.info('Shutdown TPU system.') - session.run(self._finalize_op) + session.run(self._finalize_ops) + + +class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook): + + def __init__(self, ctx, enqueue_ops, dequeue_ops): + super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__( + ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False) + + def _create_infeed_controller(self, name, target, args): + return _OpSignalOnceQueueContext(name=name, target=target, args=args) class _TPUStopAtStepHook(session_run_hook.SessionRunHook): @@ -727,6 +826,47 @@ class _SetEvalIterationsHook(session_run_hook.SessionRunHook): self._iterations_per_loop_var.load(self._num_steps, session=session) +class _StoppingPredictHook(session_run_hook.SessionRunHook): + """Hook that requests stop according to the stopping signal in prediction.""" + + def __init__(self, scalar_stopping_signal): + self._scalar_stopping_signal = scalar_stopping_signal + + def begin(self): + self._iterations_per_loop_var = _create_or_get_iterations_per_loop() + + def after_create_session(self, session, coord): + # This is not necessary as we do not run infeed enqueue and outfeed dequeue + # in side threads for prediction model. But it makes the + # TPUInfeedOutfeedSessionHook prints nice message. + self._iterations_per_loop_var.load(1, session=session) + + def before_run(self, run_context): + return session_run_hook.SessionRunArgs(self._scalar_stopping_signal) + + def after_run(self, run_context, run_values): + _ = run_context + scalar_stopping_signal = run_values.results + if _StopSignals.should_stop(scalar_stopping_signal): + # NOTE(xiejw): In prediction, stopping signals are inserted for each + # batch. And we append one more batch to signal the system it should stop. + # The data flow might look like + # + # batch 0: images, labels, stop = 0 (user provideded) + # batch 1: images, labels, stop = 0 (user provideded) + # ... + # batch 99: images, labels, stop = 0 (user provideded) + # batch 100: images, labels, stop = 1 (TPUEstimator appended) + # + # where the final batch (id = 100) is appended by TPUEstimator, so we + # should drop it before returning the predictions to user. + # To achieve that, we throw the OutOfRangeError in after_run. 
Once + # Monitored Session sees this error in SessionRunHook.after_run, the + # "current" prediciton, i.e., batch with id=100, will be discarded + # immediately + raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.') + + def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn, inputs_structure_recorder): """Generates infeed enqueue ops for per-core input_fn on a single host.""" @@ -738,11 +878,14 @@ def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn, per_host_sharded_inputs = [] for core_ordinal in range(num_cores_per_host): with ops.name_scope('ordinal_%d' % (core_ordinal)): - inputs = input_fn() - if isinstance(inputs, tuple): - features, labels = inputs - else: - features, labels = inputs, None + inputs = _Inputs.from_input_fn(input_fn()) + if inputs.is_dataset: + raise TypeError( + '`input_fn` returning `Dataset` is not yet supported in ' + 'per-Core input pipeline deployment yet. Please set ' + 'TPUConfig.per_host_input_for_training to True or return ' + '`features` and `labels` from `input_fn`') + features, labels = inputs.features_and_labels() inputs_structure_recorder.validate_and_record_structure( features, labels) @@ -769,18 +912,37 @@ def generate_per_host_enqueue_ops_fn_for_host( """Generates infeed enqueue ops for per-host input_fn on a single host.""" captured_infeed_queue = _CapturedObject() + hooks = [] + + with ops.device(device): + inputs = _Inputs.from_input_fn(input_fn()) + + is_dataset = inputs.is_dataset + if ctx.mode == model_fn_lib.ModeKeys.PREDICT: + if not is_dataset: + raise TypeError( + 'For mode PREDICT, `input_fn` must return `Dataset` instead of ' + '`features` and `labels`.') + inputs = _InputsWithStoppingSignals( + dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn) + + if is_dataset: + hooks.append(inputs.dataset_initializer_hook()) + def enqueue_ops_fn(): with ops.device(device): num_cores_per_host = ctx.num_of_cores_per_host - inputs = input_fn() - if isinstance(inputs, tuple): - features, labels = inputs - else: - features, labels = inputs, None - inputs_structure_recorder.validate_and_record_structure(features, labels) + # Convert user input to features and labels. If the user returns a + # dataset, it is initialized and the features and labels extracted via + # `dataset.iterator.get_next()` + features, labels = inputs.features_and_labels() + signals = inputs.signals() + + inputs_structure_recorder.validate_and_record_structure( + features, labels, signals) unsharded_tensor_list = ( inputs_structure_recorder.flatten_features_and_labels( - features, labels)) + features, labels, signals)) infeed_queue = tpu_feed.InfeedQueue( tuple_types=[t.dtype for t in unsharded_tensor_list], @@ -792,9 +954,15 @@ def generate_per_host_enqueue_ops_fn_for_host( per_host_enqueue_ops = ( infeed_queue.split_inputs_and_generate_enqueue_ops( unsharded_tensor_list, placement_function=lambda x: device)) - return per_host_enqueue_ops + if signals is None: + return per_host_enqueue_ops + else: + return { + 'ops': per_host_enqueue_ops, + 'signals': signals, + } - return enqueue_ops_fn, captured_infeed_queue + return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset class _InputPipeline(object): @@ -834,6 +1002,7 @@ class _InputPipeline(object): self._feature_names = [] self._label_names = [] self._has_labels = False + self._signals_helper = None # Internal state. 
self._initialized = False @@ -841,7 +1010,7 @@ class _InputPipeline(object): def has_labels(self): return self._has_labels - def validate_and_record_structure(self, features, labels): + def validate_and_record_structure(self, features, labels, signals=None): """Validates and records the structure of features` and `labels`.""" def _extract_key_names(tensor_or_dict): @@ -854,6 +1023,10 @@ class _InputPipeline(object): feature_names = _extract_key_names(features) label_names = _extract_key_names(labels) + if signals is not None and self._signals_helper is None: + # Record signals helper. + self._signals_helper = _SignalsHelper(signals) + if self._initialized: # Verify the structure is same. The following should never happen. assert feature_names == self._feature_names, 'feature keys mismatched' @@ -866,7 +1039,7 @@ class _InputPipeline(object): self._label_names = label_names self._has_labels = has_labels - def flatten_features_and_labels(self, features, labels): + def flatten_features_and_labels(self, features, labels, signals=None): """Flattens the `features` and `labels` to a single tensor list.""" flattened_inputs = [] if self._feature_names: @@ -882,6 +1055,9 @@ class _InputPipeline(object): flattened_inputs.extend([labels[name] for name in self._label_names]) else: flattened_inputs.append(labels) + + if signals is not None: + flattened_inputs.extend(_SignalsHelper.as_tensor_list(signals)) return flattened_inputs def unflatten_features_and_labels(self, flattened_inputs): @@ -907,7 +1083,11 @@ class _InputPipeline(object): else: expected_num_labels = 0 - expected_num_tensors = expected_num_features + expected_num_labels + expected_num_signals = ( + self._signals_helper.num_signals if self._signals_helper else 0) + + expected_num_tensors = ( + expected_num_features + expected_num_labels + expected_num_signals) if expected_num_tensors != len(flattened_inputs): raise ValueError( @@ -924,13 +1104,20 @@ class _InputPipeline(object): if expected_num_labels == 0: unflattened_label = None elif self._label_names: - unflattened_label = dict( - zip(self._label_names, flattened_inputs[expected_num_features:])) + label_list = flattened_inputs[ + expected_num_features:expected_num_features + expected_num_labels] + unflattened_label = dict(zip(self._label_names, label_list)) else: # Single tensor case. unflattened_label = flattened_inputs[expected_num_features] - return unflattened_features, unflattened_label + signals = None + if expected_num_signals != 0: + tensor_list_for_signals = flattened_inputs[ + expected_num_features + expected_num_labels:] + signals = self._signals_helper.unflatten(tensor_list_for_signals) + + return _Inputs(unflattened_features, unflattened_label, signals=signals) def __init__(self, input_fn, batch_axis, ctx): """Constructor. @@ -958,7 +1145,8 @@ class _InputPipeline(object): # While tf.while_loop is called, the body function, which invokes # `enqueue_fn` passed in, is called to construct the graph. So, input_fn # structure is recorded. 
- enqueue_ops = self._invoke_input_fn_and_record_structure() + enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = ( + self._invoke_input_fn_and_record_structure()) self._validate_input_pipeline() @@ -969,14 +1157,18 @@ class _InputPipeline(object): return self._inputs_structure_recorder.unflatten_features_and_labels( values) - return (enqueue_ops, dequeue_fn) + return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator) def _invoke_input_fn_and_record_structure(self): """Deploys the input pipeline and record input structure.""" enqueue_ops = [] infeed_queues = [] + all_hooks = [] num_hosts = self._ctx.num_hosts tpu_host_placement_fn = self._ctx.tpu_host_placement_function + + run_infeed_loop_on_coordinator = True + if self._sharded_per_core: # Per-Core input pipeline deployment. # Invoke input pipeline for each core and placed on the corresponding @@ -990,6 +1182,7 @@ class _InputPipeline(object): self._ctx, self._input_fn, self._inputs_structure_recorder)) if _WRAP_INPUT_FN_INTO_WHILE_LOOP: + run_infeed_loop_on_coordinator = False enqueue_ops.append( _wrap_computation_in_while_loop( device=host_device, op_fn=enqueue_ops_fn)) @@ -1003,15 +1196,32 @@ class _InputPipeline(object): host_device = tpu_host_placement_fn(host_id=host_id) with ops.device(host_device): with ops.name_scope('input_pipeline_task%d' % (host_id)): - enqueue_ops_fn, captured_infeed_queue = ( + enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = ( generate_per_host_enqueue_ops_fn_for_host( self._ctx, self._input_fn, self._inputs_structure_recorder, self._batch_axis, host_device)) + all_hooks.extend(hooks) - if _WRAP_INPUT_FN_INTO_WHILE_LOOP: + # NOTE(xiejw): We dispatch here based on the return type of the + # users `input_fn`. + # + # 1. If input_fn returns a Dataset instance, we initialize the + # iterator outside of tf.while_loop, and call the iterator.get_next + # inside tf.while_loop. This should be always safe. + # + # 2. If input_fn returns (features, labels), it is too late to wrap + # them inside tf.while_loop, as resource initialization cannot be + # handled in TF control flow properly. In this case, we will use + # python loop to enqueue the data into TPU system. This may be + # slow compared to the previous case. + if is_dataset: + run_infeed_loop_on_coordinator = False + wrap_fn = ( + _wrap_computation_in_while_loop + if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else + _wrap_computation_in_while_loop_with_stopping_signals) enqueue_ops.append( - _wrap_computation_in_while_loop( - device=host_device, op_fn=enqueue_ops_fn)) + wrap_fn(device=host_device, op_fn=enqueue_ops_fn)) else: enqueue_ops.append(enqueue_ops_fn()) infeed_queues.append(captured_infeed_queue.get()) @@ -1019,7 +1229,7 @@ class _InputPipeline(object): # dequeue is dtypes and types. So, any one can be used. Here, grab the # first one. self._infeed_queue = infeed_queues[0] - return enqueue_ops + return enqueue_ops, all_hooks, run_infeed_loop_on_coordinator def _validate_input_pipeline(self): # Perform some sanity checks to log user friendly information. We should @@ -1076,29 +1286,38 @@ class _ModelFnWrapper(object): infeed dequeue channel. Returns: - A Fn representing the train step for TPU. + A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn + representing the train step for TPU. 
""" + host_call = _OutfeedHostCall(self._ctx) captured_scaffold_fn = _CapturedObject() def train_step(loss): """Training step function for use inside a while loop.""" del loss # unused; required in function signature. - features, labels = dequeue_fn() + inputs = dequeue_fn() + features, labels = inputs.features_and_labels() estimator_spec = self._verify_estimator_spec( self._call_model_fn(features, labels)) loss, train_op = estimator_spec.loss, estimator_spec.train_op + host_call_outfeed_ops = [] if isinstance(estimator_spec, TPUEstimatorSpec): captured_scaffold_fn.capture(estimator_spec.scaffold_fn) + if estimator_spec.host_call is not None: + host_call.record({ + 'host_call': wrap_hostcall_with_global_step( + estimator_spec.host_call)}) + host_call_outfeed_ops = host_call.create_enqueue_op() else: captured_scaffold_fn.capture(None) - with ops.control_dependencies([train_op]): + with ops.control_dependencies([train_op] + host_call_outfeed_ops): return array_ops.identity(loss) - return train_step, captured_scaffold_fn + return train_step, host_call, captured_scaffold_fn def convert_to_single_tpu_eval_step(self, dequeue_fn): """Converts user provided model_fn` as a single eval step on TPU. @@ -1123,15 +1342,16 @@ class _ModelFnWrapper(object): infeed dequeue channel. Returns: - A tuple of eval_fn and eval_metrics. The eval_fn representing the eval - step for TPU. and eval_metrics is an `_EvalMetrics` instance. + A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn + representing the eval step for TPU. """ - eval_metrics = _EvalMetrics(self._ctx) + host_calls = _OutfeedHostCall(self._ctx) captured_scaffold_fn = _CapturedObject() def eval_step(total_loss): """Evaluation step function for use inside a while loop.""" - features, labels = dequeue_fn() + inputs = dequeue_fn() + features, labels = inputs.features_and_labels() tpu_estimator_spec = self._call_model_fn(features, labels) if not isinstance(tpu_estimator_spec, TPUEstimatorSpec): @@ -1141,15 +1361,68 @@ class _ModelFnWrapper(object): loss = tpu_estimator_spec.loss captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn) - eval_metrics.record(tpu_estimator_spec) - outfeed_ops = tpu_ops.outfeed_enqueue_tuple(eval_metrics.outfeed_tensors) + to_record = {} + to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics + if tpu_estimator_spec.host_call is not None: + # We assume that evaluate won't update global step, so we don't wrap + # this host_call. + to_record['host_call'] = tpu_estimator_spec.host_call + host_calls.record(to_record) - with ops.control_dependencies([outfeed_ops]): + with ops.control_dependencies(host_calls.create_enqueue_op()): return math_ops.add(total_loss, loss) - return eval_step, eval_metrics, captured_scaffold_fn + return eval_step, host_calls, captured_scaffold_fn - def _call_model_fn(self, features, labels): + def convert_to_single_tpu_predict_step(self, dequeue_fn): + """Converts user provided model_fn` as a single predict step on TPU. + + Args: + dequeue_fn: The function to retrieve inputs, features and labels, from TPU + infeed dequeue channel. + + Returns: + A tuple of predict_fn, host_calls, and captured scaffold_fn. The + predict_fn representing the predict step for TPU. 
+ """ + host_calls = _OutfeedHostCall(self._ctx) + captured_scaffold_fn = _CapturedObject() + + def predict_step(unused_scalar_stopping_signal): + """Evaluation step function for use inside a while loop.""" + inputs = dequeue_fn() + features, labels = inputs.features_and_labels() + stopping_signals = inputs.signals() + + assert stopping_signals is not None, ( + 'Internal Error: `signals` is missing.') + + tpu_estimator_spec = self._call_model_fn( + features, labels, is_export_mode=False) + if not isinstance(tpu_estimator_spec, TPUEstimatorSpec): + raise RuntimeError( + 'estimator_spec used by TPU prediction must have type' + '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec))) + + captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn) + to_record = {} + identity_fn = lambda **kwargs: kwargs + # TODO(xiejw): Adds validation for prediction dictionrary. + # TODO(xiejw): Adds support for single tensor as predictions. + if not isinstance(tpu_estimator_spec.predictions, dict): + raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.') + to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions] + to_record['signals'] = [identity_fn, stopping_signals] + if tpu_estimator_spec.host_call is not None: + to_record['host_call'] = tpu_estimator_spec.host_call + host_calls.record(to_record) + + with ops.control_dependencies(host_calls.create_enqueue_op()): + return _StopSignals.as_scalar_stopping_signal(stopping_signals) + + return predict_step, host_calls, captured_scaffold_fn + + def _call_model_fn(self, features, labels, is_export_mode=True): """Calls the model_fn with required parameters.""" model_fn_args = util.fn_args(self._model_fn) kwargs = {} @@ -1180,7 +1453,7 @@ class _ModelFnWrapper(object): params[_BATCH_SIZE_KEY] = batch_size_for_model_fn estimator_spec = self._model_fn(features=features, **kwargs) - if (self._ctx.is_running_on_cpu() and + if (self._ctx.is_running_on_cpu(is_export_mode) and isinstance(estimator_spec, TPUEstimatorSpec)): # The estimator_spec will be passed to `Estimator` directly, which expects # type `EstimatorSpec`. @@ -1207,158 +1480,241 @@ class _ModelFnWrapper(object): return estimator_spec -class _EvalMetrics(object): - """Class wraps TPUEstimator.eval_metrics.""" +class _OutfeedHostCall(object): + """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec.""" def __init__(self, ctx): self._ctx = ctx - self._metric_fn = None - self._is_dict = False - self._tensor_keys = [] - self._tensors = [] - self._tensor_dtypes = [] - self._tensor_shapes = [] - self._recorded = False + self._names = [] + # All of these are dictionaries of lists keyed on the name. 
+ self._host_fns = {} + self._tensor_keys = collections.defaultdict(list) + self._tensors = collections.defaultdict(list) + self._tensor_dtypes = collections.defaultdict(list) + self._tensor_shapes = collections.defaultdict(list) @staticmethod - def validate(eval_metrics): - """Validates the `eval_metrics` in `TPUEstimatorSpec`.""" + def validate(host_calls): + """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`.""" - if not isinstance(eval_metrics, (tuple, list)): - raise ValueError('eval_metrics should be tuple or list') - if len(eval_metrics) != 2: - raise ValueError('eval_metrics should have two elements.') - if not callable(eval_metrics[0]): - raise TypeError('eval_metrics[0] should be callable.') - if not isinstance(eval_metrics[1], (tuple, list, dict)): - raise ValueError('eval_metrics[1] should be tuple or list, or dict.') + for name, host_call in host_calls.items(): + if not isinstance(host_call, (tuple, list)): + raise ValueError('{} should be tuple or list'.format(name)) + if len(host_call) != 2: + raise ValueError('{} should have two elements.'.format(name)) + if not callable(host_call[0]): + raise TypeError('{}[0] should be callable.'.format(name)) + if not isinstance(host_call[1], (tuple, list, dict)): + raise ValueError('{}[1] should be tuple or list, or dict.'.format(name)) - if isinstance(eval_metrics[1], (tuple, list)): - fn_args = util.fn_args(eval_metrics[0]) - if len(eval_metrics[1]) != len(fn_args): - raise RuntimeError( - 'In TPUEstimatorSpec.eval_metrics, length of tensors does not ' - 'match method args of metric_fn.') + if isinstance(host_call[1], (tuple, list)): + fullargspec = tf_inspect.getfullargspec(host_call[0]) + fn_args = util.fn_args(host_call[0]) + # wrapped_hostcall_with_global_step uses varargs, so we allow that. + if fullargspec.varargs is None and len(host_call[1]) != len(fn_args): + raise RuntimeError( + 'In TPUEstimatorSpec.{}, length of tensors {} does not match ' + 'method args of the function, which takes {}.'.format( + name, len(host_call[1]), len(fn_args))) @staticmethod - def to_metric_metric_ops_for_cpu(eval_metrics): - """Converts `TPUEstimatorSpec.eval_metrics` to `eval_metric_ops` for CPU.""" - if not eval_metrics: - return None + def create_cpu_hostcall(host_calls): + """Runs on the host_call on CPU instead of TPU when use_tpu=False.""" - _EvalMetrics.validate(eval_metrics) + _OutfeedHostCall.validate(host_calls) + ret = {} + for name, host_call in host_calls.items(): + host_fn, tensors = host_call + if isinstance(tensors, (tuple, list)): + ret[name] = host_fn(*tensors) + else: + # Must be dict. + try: + ret[name] = host_fn(**tensors) + except TypeError as e: + logging.warning( + 'Exception while calling %s: %s. It is likely the tensors ' + '(%s[1]) do not match the ' + 'function\'s arguments', name, e, name) + raise e + return ret - metric_fn, tensors = eval_metrics + def record(self, host_calls): + """Records the host_call structure.""" - if isinstance(tensors, (tuple, list)): - return metric_fn(*tensors) - else: - # Must be dict. - try: - return metric_fn(**tensors) - except TypeError as e: - logging.warning( - 'Exception while calling metric_fn for evalution: %s. 
' - 'It is likely the tensors (eval_metrics[1]) do not match the ' - 'metric_fn arguments', e) - raise e + for name, host_call in host_calls.items(): + host_fn, tensor_list_or_dict = host_call + self._names.append(name) + self._host_fns[name] = host_fn - def record(self, spec): - """Records the eval_metrics structure in `spec`.""" - if self._recorded: - raise RuntimeError('Eval metrics have been recorded already.') + if isinstance(tensor_list_or_dict, dict): + for (key, tensor) in six.iteritems(tensor_list_or_dict): + self._tensor_keys[name].append(key) + self._tensors[name].append(tensor) + self._tensor_dtypes[name].append(tensor.dtype) + self._tensor_shapes[name].append(tensor.shape) + else: + # List or tuple. + self._tensor_keys[name] = None + for tensor in tensor_list_or_dict: + self._tensors[name].append(tensor) + self._tensor_dtypes[name].append(tensor.dtype) + self._tensor_shapes[name].append(tensor.shape) - self._metric_fn, tensor_list_or_dict = spec.eval_metrics - - if isinstance(tensor_list_or_dict, dict): - self._is_dict = True - for (key, tensor) in six.iteritems(tensor_list_or_dict): - self._tensor_keys.append(key) - self._tensors.append(tensor) - self._tensor_dtypes.append(tensor.dtype) - self._tensor_shapes.append(tensor.shape) - else: - # List or tuple. - self._is_dict = False - self._tensors = tensor_list_or_dict - for tensor in tensor_list_or_dict: - self._tensor_dtypes.append(tensor.dtype) - self._tensor_shapes.append(tensor.shape) - self._recorded = True - - @property - def outfeed_tensors(self): - if not self._recorded: - raise RuntimeError('Eval metrics have not been recorded yet') - return self._tensors - - def to_metric_metric_ops_for_tpu(self, dummy_update_op): - """Creates the eval_metric_ops now based on the TPU outfeed. - - `eval_metric_ops` is defined in `EstimatorSpec`. From all shards, tensors - are dequeued from outfeed and then concatenated (along batch size dimension) - to form global-like tensors. All global-like tensors are passed to the - metric fn. - - Args: - dummy_update_op: A dummy update op. + def create_enqueue_op(self): + """Create the op to enqueue the recorded host_calls. Returns: - A tuple of (`eval_metric_ops` and `update_ops`), where `update_ops` should - be invoked in Outfeed dequeue thread, which drive the outfeed dequeue and - update the state of metrics. + A list of enqueue ops, which is empty if there are no host calls. + """ + if not self._names: + return [] + + tensors = [] + # TODO(jhseu): Consider deduping tensors. + for name in self._names: + tensors.extend(self._tensors[name]) + return [tpu_ops.outfeed_enqueue_tuple(tensors)] + + def create_tpu_hostcall(self): + """Sends the tensors through outfeed and runs the host_fn on CPU. + + The tensors are concatenated along dimension 0 to form a global tensor + across all shards. The concatenated function is passed to the host_fn and + executed on the first host. + + Returns: + A dictionary mapping name to the return type of the host_call by that + name. Raises: RuntimeError: If outfeed tensor is scalar. """ + if not self._names: + return [] - num_cores = self._ctx.num_cores - + ret = {} # For each i, dequeue_ops[i] is a list containing the tensors from all # shards. This list is concatenated later. 
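    # Illustration (hypothetical shapes): with 2 shards and a recorded tensor of
    # per-shard shape [batch, d], dequeue_ops[i] collects one [batch, d] piece
    # from each shard, and the concat below produces a single [2 * batch, d]
    # tensor that is then passed to the corresponding host_fn.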
dequeue_ops = [] - for i in xrange(len(self._tensors)): - dequeue_ops.append([]) + tensor_dtypes = [] + tensor_shapes = [] + for name in self._names: + for _ in self._tensors[name]: + dequeue_ops.append([]) + for dtype in self._tensor_dtypes[name]: + tensor_dtypes.append(dtype) + for shape in self._tensor_shapes[name]: + tensor_shapes.append(shape) - # Outfeed ops execute on each JF node. + # Outfeed ops execute on each JF node. Note: we must constraint it such that + # we have at most one outfeed dequeue and enqueue. tpu_device_placement_fn = self._ctx.tpu_device_placement_function - for i in xrange(num_cores): + for i in xrange(self._ctx.num_cores): with ops.device(tpu_device_placement_fn(i)): outfeed_tensors = tpu_ops.outfeed_dequeue_tuple( - dtypes=self._tensor_dtypes, shapes=self._tensor_shapes) + dtypes=tensor_dtypes, shapes=tensor_shapes) for j, item in enumerate(outfeed_tensors): dequeue_ops[j].append(item) - # It is assumed evaluation always happends on single host TPU system. So, + # Deconstruct dequeue ops. + dequeue_ops_by_name = {} + pos = 0 + for name in self._names: + dequeue_ops_by_name[name] = dequeue_ops[pos:pos+len(self._tensors[name])] + pos += len(self._tensors[name]) + + # It is assumed evaluation always happens on single host TPU system. So, # place all ops on tpu host if possible. + # + # TODO(jhseu): Evaluate whether this is right for summaries. with ops.device(self._ctx.tpu_host_placement_function(core_id=0)): - for i, item in enumerate(dequeue_ops): - if dequeue_ops[i][0].shape.ndims == 0: - raise RuntimeError( - 'All tensors outfed from TPU should preseve batch size ' - 'dimension, but got scalar {}'.format(dequeue_ops[i][0])) - # TODO(xiejw): Allow users to specify the axis for batch size dimension. - dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0) + for name in self._names: + dequeue_ops = dequeue_ops_by_name[name] + for i, item in enumerate(dequeue_ops): + if dequeue_ops[i][0].shape.ndims == 0: + raise RuntimeError( + 'All tensors outfed from TPU should preserve batch size ' + 'dimension, but got scalar {}'.format(dequeue_ops[i][0])) + # TODO(xiejw): Allow users to specify the axis for batch size + # dimension. + dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0) - if self._is_dict: - dequeue_ops = dict(zip(self._tensor_keys, dequeue_ops)) - try: - eval_metric_ops = self._metric_fn(**dequeue_ops) - except TypeError as e: - logging.warning( - 'Exception while calling metric_fn for evalution: %s. ' - 'It is likely the tensors (eval_metrics[1]) do not match the ' - 'metric_fn arguments', e) - raise e - else: - eval_metric_ops = self._metric_fn(*dequeue_ops) + if self._tensor_keys[name] is not None: + # The user-provided eval_metrics[1] is a dict. + dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops)) + try: + ret[name] = self._host_fns[name](**dequeue_ops) + except TypeError as e: + logging.warning( + 'Exception while calling %s: %s. 
It is likely the tensors ' + '(%s[1]) do not match the ' + 'function\'s arguments', name, e, name) + raise e + else: + ret[name] = self._host_fns[name](*dequeue_ops) - eval_update_ops = [] - for k, v in eval_metric_ops.items(): - eval_metric_ops[k] = (v[0], dummy_update_op) - eval_update_ops.append(v[1]) + return ret - return eval_metric_ops, eval_update_ops + +def wrap_hostcall_with_global_step(hostcall): + """Wrap the hostcall so that we update the global step upon every call.""" + if hostcall is None: + return None + host_fn, tensors = hostcall + + def global_step_host_fn(_global_step, *args, **kwargs): # pylint: disable=invalid-name + # Note that we don't have any ordering here, so the graph may see a + # global_step that's off by 1. + state_ops.assign( + training.get_global_step(), + math_ops.cast(_global_step[0], dtypes.int64)) + return host_fn(*args, **kwargs) + # Give the global step tensor a batch dimension. Reshape is not supported for + # int64, so we cast it to int32. + # TODO(jhseu): Remove the cast once int64 is supported. + global_step_tensor = array_ops.reshape( + math_ops.cast(training.get_global_step(), dtypes.int32), [1]) + if isinstance(tensors, dict): + outfeed_tensors = {'_global_step': global_step_tensor} + outfeed_tensors.update(tensors) + return global_step_host_fn, outfeed_tensors + else: + fn_args = util.fn_args(host_fn) + if len(tensors) != len(fn_args): + raise RuntimeError( + 'In TPUEstimatorSpec.host_call, length of tensors {} does not match ' + 'method args of the function, which takes {}.'.format( + len(tensors), len(fn_args))) + return global_step_host_fn, [global_step_tensor] + list(tensors) + + +class _OutfeedHostCallHook(session_run_hook.SessionRunHook): + """Hook to run host calls when use_tpu=False.""" + + def __init__(self, tensors): + self._tensors = tensors + + def begin(self): + # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than + # create a separate hook to guarantee execution order, because summaries + # need to be initialized before the outfeed thread starts. + # TODO(jhseu): Make a wrapper hook instead? + self._init_ops = contrib_summary.summary_writer_initializer_op() + # Get all the writer resources from the initializer, so we know what to + # flush. + self._finalize_ops = [] + for op in self._init_ops: + self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0])) + + def after_create_session(self, session, coord): + session.run(self._init_ops) + + def before_run(self, run_context): + return basic_session_run_hooks.SessionRunArgs(self._tensors) + + def end(self, session): + session.run(self._finalize_ops) class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook): @@ -1387,6 +1743,23 @@ class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook): logging.info('examples/sec: %g', examples_per_sec) +class InstallSignalHandlerHook(session_run_hook.SessionRunHook): + """Change SIGINT (CTRL^C) handler to force quit the process. + + The default behavior often results in hanging processes. + The original handler is restored after training/evaluation. + """ + + def __init__(self): + self._signal_fn = signal.getsignal(signal.SIGINT) + + def before_run(self, run_context): + signal.signal(signal.SIGINT, signal.SIG_DFL) + + def end(self, session): + signal.signal(signal.SIGINT, self._signal_fn) + + class TPUEstimator(estimator_lib.Estimator): """Estimator with TPU support. 
@@ -1394,30 +1767,28 @@ class TPUEstimator(estimator_lib.Estimator): replicating inputs and models for each core, and returning to host periodically to run hooks. - If `use_tpu` is false, all training, evaluation, and predict are executed on - CPU. - - For training, TPUEstimator transforms a global batch size in params to a - per-shard batch size when calling the `input_fn` and `model_fn`. Users should - specify `train_batch_size` in constructor, and then get the batch size for - each shard in `input_fn` and `model_fn` by `params['batch_size']`. If - `TPUConfig.per_host_input_for_training` is `True`, `input_fn` is invoked per - host rather than per core. In this case, a global batch size is transformed a - per-host batch size in params for `input_fn`, but `model_fn` still gets - per-core batch size. - - For evaluation, if `eval_batch_size` is None, it is executed on CPU, even if - `use_tpu` is `True`. If `eval_batch_size` is not `None`, it is executed on - TPU, which is an experimental feature. In this case, `model_fn` should return - `TPUEstimatorSpec` instead of `EstimatorSpec`, which expects the - `eval_metrics` for TPU evaluation. + TPUEstimator transforms a global batch size in params to a per-shard batch + size when calling the `input_fn` and `model_fn`. Users should specify + global batch size in constructor, and then get the batch size for each shard + in `input_fn` and `model_fn` by `params['batch_size']`. + For training, `model_fn` gets per-core batch size; `input_fn` may get + per-core or per-host batch size depending on + `per_host_input_for_training` in `TPUConfig`. + For evaluation, `model_fn` gets per-core batch size and `input_fn` get + per-host batch size. + `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics` + for TPU evaluation. `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. (See `TPUEstimatorSpec` for details). `metric_fn` takes the `tensors` and returns a dict from metric string name to the result of calling a metric function, namely a `(metric_tensor, update_op)` tuple. + One can set `use_tpu` to `False` for testing. All training, evaluation, and + predict will be executed on CPU. `input_fn` and `model_fn` will receive + `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`. + Current limitations: 1. TPU evaluation only works on single host. @@ -1472,6 +1843,7 @@ class TPUEstimator(estimator_lib.Estimator): use_tpu=True, train_batch_size=None, eval_batch_size=None, + predict_batch_size=None, batch_axis=None): """Constructs an `TPUEstimator` instance. @@ -1490,18 +1862,17 @@ class TPUEstimator(estimator_lib.Estimator): basic python types. There are reserved keys for `TPUEstimator`, including 'batch_size'. use_tpu: A bool indicating whether TPU support is enabled. Currently, - - TPU training respects this bit. - - If true, see `eval_batch_size` for evaluate support. + - TPU training and evaluation respect this bit. - Predict still happens on CPU. train_batch_size: An int representing the global training batch size. TPUEstimator transforms this global batch size to a per-shard batch size, as params['batch_size'], when calling `input_fn` and `model_fn`. Cannot be `None` if `use_tpu` is `True`. Must be divisible by `config.tpu_config.num_shards`. - eval_batch_size: An int representing the global training batch size. - Currently, if `None`, evaluation is still executed on CPU (even when - `use_tpu` is True). 
In near future, `use_tpu` will be the only option to - switch between TPU/CPU evaluation. + eval_batch_size: An int representing evaluation batch size. + Must be divisible by `config.tpu_config.num_shards`. + predict_batch_size: An int representing the prediction batch size. + Must be divisible by `config.tpu_config.num_shards`. batch_axis: A python tuple of int values describing how each tensor produced by the Estimator `input_fn` should be split across the TPU compute shards. For example, if your input_fn produced (images, labels) @@ -1541,15 +1912,25 @@ class TPUEstimator(estimator_lib.Estimator): .format(train_batch_size, config.tpu_config.num_shards)) if eval_batch_size is not None: - if config.tpu_config.num_shards > 8: - raise NotImplementedError( - 'TPU evaluation is only supported with one host.') - + if not isinstance(eval_batch_size, int): + raise ValueError('`eval_batch_size` must be an int') + if eval_batch_size < 1: + raise ValueError('`eval_batch_size` must be positive') if eval_batch_size % config.tpu_config.num_shards != 0: raise ValueError( 'eval batch size {} must be divisible by number of shards {}' .format(eval_batch_size, config.tpu_config.num_shards)) + if predict_batch_size is not None: + if not isinstance(predict_batch_size, int): + raise ValueError('`predict_batch_size` must be an int') + if predict_batch_size < 1: + raise ValueError('`predict_batch_size` must be positive') + if predict_batch_size % config.tpu_config.num_shards != 0: + raise ValueError( + 'predict batch size {} must be divisible by number of shards {}' + .format(predict_batch_size, config.tpu_config.num_shards)) + # Verifies the model_fn signature according to Estimator framework. estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access # We cannot store config and params in this constructor as parent @@ -1569,7 +1950,7 @@ class TPUEstimator(estimator_lib.Estimator): # All properties passed to _TPUContext are immutable. self._ctx = _TPUContext(self._config, train_batch_size, eval_batch_size, - use_tpu) + predict_batch_size, use_tpu) def _create_global_step(self, graph): """Creates a global step suitable for TPUs. @@ -1617,6 +1998,14 @@ class TPUEstimator(estimator_lib.Estimator): util_lib.check_positive_integer(steps, 'Eval steps') + if self._config.tpu_config.num_shards > 8: + raise NotImplementedError( + 'TPU evaluation is only supported with one host.') + + if self._ctx._eval_batch_size is None: # pylint: disable=protected-access + raise ValueError('`eval_batch_size` cannot be `None`' + 'if evaluate() is called on TPU.') + return [ evaluation._StopAfterNEvalsHook( # pylint: disable=protected-access num_evals=steps), @@ -1657,7 +2046,9 @@ class TPUEstimator(estimator_lib.Estimator): if batch_size_for_input_fn is not None: kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn - if ctx.is_running_on_cpu(): + # For export_savedmodel, input_fn is never passed to Estimator. So, + # `is_export_mode` must be False. + if ctx.is_running_on_cpu(is_export_mode=False): with ops.device('/device:CPU:0'): return input_fn(**kwargs) @@ -1684,8 +2075,13 @@ class TPUEstimator(estimator_lib.Estimator): with self._ctx.with_mode(mode) as ctx: model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx) - # TODO(jhseu): Move to PREDICT to TPU. - if ctx.is_running_on_cpu(): + # For export_savedmodel, input_fn is never passed to Estimator. So, + # if features is callable, it means it is the input_fn passed by + # TPUEstimator._call_input_fn. 
Then we can know if the mode == PREDICT, + # it implies, it is the .predict API, not export_savedmodel API. + is_export_mode = not callable(features) + + if ctx.is_running_on_cpu(is_export_mode=is_export_mode): logging.info('Running %s on CPU', mode) return model_fn_wrapper.call_without_tpu(features, labels) @@ -1695,22 +2091,31 @@ class TPUEstimator(estimator_lib.Estimator): input_fn = features input_holders = _InputPipeline(input_fn, batch_axis, ctx) - enqueue_ops, dequeue_fn = ( + enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = ( input_holders.generate_infeed_enqueue_ops_and_dequeue_fn()) if mode == model_fn_lib.ModeKeys.TRAIN: - loss, scaffold = ( + loss, host_call, scaffold = ( _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)) + host_ops = host_call.create_tpu_hostcall() + if host_ops is None: + host_ops = [] hooks = [ - TPUInfeedOutfeedSessionHook(ctx, enqueue_ops), + TPUInfeedOutfeedSessionHook( + ctx, + enqueue_ops, + host_ops, + run_infeed_loop_on_coordinator=( + run_infeed_loop_on_coordinator)), ExamplesPerSecondHook(ctx.global_batch_size), + InstallSignalHandlerHook(), training.LoggingTensorHook( { 'loss': array_ops.identity(loss), 'step': training.get_global_step() }, every_n_secs=30) - ] + ] + input_hooks summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss) with ops.control_dependencies([loss]): update_ops = _sync_variables_ops() @@ -1725,40 +2130,114 @@ class TPUEstimator(estimator_lib.Estimator): train_op=control_flow_ops.group(*update_ops), scaffold=scaffold) - # Now eval. - total_loss, eval_metric_ops, scaffold = _eval_on_tpu_system( + if mode == model_fn_lib.ModeKeys.EVAL: + total_loss, host_calls, scaffold = _eval_on_tpu_system( + ctx, model_fn_wrapper, dequeue_fn) + iterations_per_loop_var = _create_or_get_iterations_per_loop() + mean_loss = math_ops.div(total_loss, + math_ops.cast( + iterations_per_loop_var, + dtype=total_loss.dtype)) + + # Creates a dummy metric update_op for all metrics. Estimator expects + # all metrics in eval_metric_ops have update_op and calls them one by + # one. The real metric update_ops are invoked in a separated thread. + # So, here give Estimator the dummy op for all metrics. 
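        # Illustration (hypothetical metric name): if the user's metric_fn
        # returns {'accuracy': (acc_tensor, acc_update_op)}, the EstimatorSpec
        # built below carries
        #   eval_metric_ops['accuracy'] = (acc_tensor, dummy_update_op)
        # while acc_update_op goes into eval_update_ops and is driven by the
        # outfeed dequeue thread of TPUInfeedOutfeedSessionHook.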
+ with ops.control_dependencies([mean_loss]): + # After TPU evaluation computation is done (the mean_loss tensor), + # reads all variables back from TPU and updates the eval step + # counter properly + internal_ops_to_run = _sync_variables_ops() + internal_ops_to_run.append( + _increase_eval_step_op(iterations_per_loop_var)) + with ops.control_dependencies(internal_ops_to_run): + dummy_update_op = control_flow_ops.no_op() + + host_call_ret = host_calls.create_tpu_hostcall() + eval_metric_ops = {} + eval_update_ops = [] + for k, v in host_call_ret['eval_metrics'].items(): + eval_metric_ops[k] = (v[0], dummy_update_op) + eval_update_ops.append(v[1]) + + if 'host_call' not in host_call_ret: + host_ops = [] + else: + host_ops = host_call_ret['host_call'] + hooks = [ + TPUInfeedOutfeedSessionHook( + ctx, + enqueue_ops, + eval_update_ops + host_ops, + run_infeed_loop_on_coordinator=( + run_infeed_loop_on_coordinator)), + ] + input_hooks + + return model_fn_lib.EstimatorSpec( + mode, + loss=mean_loss, + evaluation_hooks=hooks, + eval_metric_ops=eval_metric_ops, + scaffold=scaffold) + + # Predict + assert mode == model_fn_lib.ModeKeys.PREDICT + + dummy_predict_op, host_calls, scaffold = _predict_on_tpu_system( ctx, model_fn_wrapper, dequeue_fn) - iterations_per_loop_var = _create_or_get_iterations_per_loop() - mean_loss = math_ops.div(total_loss, - math_ops.cast( - iterations_per_loop_var, - dtype=total_loss.dtype)) - - # Creates a dummy metric update_op for all metrics. Estimator expects - # all metrics in eval_metric_ops have update_op and calls them one by - # one. The real metric update_ops are invoked in a separated thread. So, - # here give Estimator the dummy op for all metrics. - with ops.control_dependencies([mean_loss]): - # After TPU evaluation computation is done (the mean_loss tensor), - # reads all variables back from TPU and updates the eval step counter - # properly + with ops.control_dependencies([dummy_predict_op]): internal_ops_to_run = _sync_variables_ops() - internal_ops_to_run.append( - _increase_eval_step_op(iterations_per_loop_var)) with ops.control_dependencies(internal_ops_to_run): - dummy_update_op = control_flow_ops.no_op() + dummy_predict_op = control_flow_ops.no_op() + + # In train and evaluation, the main TPU program is passed to monitored + # training session to run. Infeed enqueue and outfeed dequeue are + # executed in side threads. This is not the configuration for + # prediction mode. + # + # For prediction, the Estimator executes the EstimatorSpec.predictions + # directly and yield the element (via generator) to call site. So, the + # outfeed based prediction must be passed to MonitoredSession directly. + # Other parts of the TPU execution are organized as follows. + # + # 1. All outfeed based Tensors must be grouped with predictions Tensors + # to form a single invocation. This avoid the issue we might trigger + # multiple outfeeds incorrectly. To achieve this, `host_call` is + # placed in control_dependencies of `stopping_signals`, and + # `stopping_signals` is passed into _StoppingPredictHook, which sets + # the `stopping_signals` as SessionRunArgs. MonitoredSession merges + # all SessionRunArgs with the fetch in session.run together. + # + # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue) + # are grouped together. They will be launched once and only once in + # side threads and they quit naturally according to the SAME stopping + # condition. 
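      # From the user's side this wiring is not visible. A minimal usage sketch
      # (hypothetical model_fn/input_fn; for prediction the input_fn must
      # return a Dataset) would look like:
      #   estimator = TPUEstimator(model_fn=my_model_fn, config=run_config,
      #                            train_batch_size=1024, predict_batch_size=8,
      #                            use_tpu=True)
      #   for prediction in estimator.predict(input_fn=my_predict_input_fn):
      #     ...  # one predictions dict per example, read from the outfeed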
+ enqueue_ops.append(dummy_predict_op) + + host_call_ret = host_calls.create_tpu_hostcall() + if 'host_call' not in host_call_ret: + host_ops = [] + else: + host_ops = host_call_ret['host_call'] + + predictions = host_call_ret['predictions'] + stopping_signals = host_call_ret['signals'] + + with ops.control_dependencies(host_ops): + host_ops = [] # Empty, we do do not need it anymore. + scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal( + stopping_signals) - eval_metric_ops, eval_update_ops = ( - eval_metric_ops.to_metric_metric_ops_for_tpu(dummy_update_op)) hooks = [ - TPUInfeedOutfeedSessionHook(ctx, enqueue_ops, eval_update_ops), - ] + _StoppingPredictHook(scalar_stopping_signal), + TPUInfeedOutfeedSessionHookForPrediction(ctx, enqueue_ops, + host_ops), + ] + input_hooks return model_fn_lib.EstimatorSpec( mode, - loss=mean_loss, - evaluation_hooks=hooks, - eval_metric_ops=eval_metric_ops, + prediction_hooks=hooks, + predictions=predictions, scaffold=scaffold) return _model_fn @@ -1769,7 +2248,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): num_cores = ctx.num_cores iterations_per_loop_var = _create_or_get_iterations_per_loop() - single_tpu_eval_step, eval_metric_ops, captured_scaffold_fn = ( + single_tpu_eval_step, host_calls, captured_scaffold_fn = ( model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)) def multi_tpu_eval_steps_on_single_shard(): @@ -1785,7 +2264,7 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): outputs_from_all_shards=False) scaffold = _get_scaffold(captured_scaffold_fn) - return loss, eval_metric_ops, scaffold + return loss, host_calls, scaffold def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): @@ -1793,7 +2272,7 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): num_cores = ctx.num_cores iterations_per_loop_var = _create_or_get_iterations_per_loop() - single_tpu_train_step, captured_scaffold_fn = ( + single_tpu_train_step, host_call, captured_scaffold_fn = ( model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn)) def multi_tpu_train_steps_on_single_shard(): @@ -1809,7 +2288,35 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): outputs_from_all_shards=False) scaffold = _get_scaffold(captured_scaffold_fn) - return loss, scaffold + return loss, host_call, scaffold + + +def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): + """Executes `model_fn_wrapper` multiple times on all TPU shards.""" + num_cores = ctx.num_cores + + single_tpu_predict_step, host_calls, captured_scaffold_fn = ( + model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)) + + def multi_tpu_predict_steps_on_single_shard(): + + def cond(scalar_stopping_signal): + return math_ops.logical_not( + _StopSignals.should_stop(scalar_stopping_signal)) + + inputs = [_StopSignals.NON_STOPPING_SIGNAL] + outputs = training_loop.while_loop( + cond, single_tpu_predict_step, inputs=inputs, name=b'loop') + return outputs + + (dummy_predict_op,) = tpu.shard( + multi_tpu_predict_steps_on_single_shard, + inputs=[], + num_shards=num_cores, + outputs_from_all_shards=False) + + scaffold = _get_scaffold(captured_scaffold_fn) + return dummy_predict_op, host_calls, scaffold def _wrap_computation_in_while_loop(device, op_fn): @@ -1830,6 +2337,29 @@ def _wrap_computation_in_while_loop(device, op_fn): parallel_iterations=1) +def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn): + """Wraps the ops generated by `op_fn` in tf.while_loop.""" + + def cond(scalar_stopping_signal): + return 
math_ops.logical_not( + _StopSignals.should_stop(scalar_stopping_signal)) + + def computation(unused_scalar_stopping_signal): + return_value = op_fn() + execute_ops = return_value['ops'] + signals = return_value['signals'] + with ops.control_dependencies(execute_ops): + return _StopSignals.as_scalar_stopping_signal(signals) + + # By setting parallel_iterations=1, the parallel execution in while_loop is + # basically turned off. + with ops.device(device): + return control_flow_ops.while_loop( + cond, + computation, [_StopSignals.NON_STOPPING_SIGNAL], + parallel_iterations=1) + + def _validate_tpu_training_graph(): """Validate graph before running distributed training. @@ -1920,3 +2450,194 @@ class _CapturingContext(control_flow_ops.ControlFlowContext): def __exit__(self, _, __, ___): # pylint: disable=invalid-name self._g._set_control_flow_context(self._old) # pylint: disable=protected-access + + +class _Inputs(object): + """A data structure representing the input_fn returned values. + + This also supports the returned value from input_fn as `Dataset`. + """ + + def __init__(self, features=None, labels=None, dataset=None, signals=None): + if dataset is not None and (features is not None or labels is not None or + signals is not None): + raise RuntimeError('Internal Error: Either (features and labels) or ' + 'dataset should be provided, not both. Please file ' + 'bug') + + self._features = features + self._labels = labels + self._signals = signals + + self._dataset = dataset + self._iterator = None + + @staticmethod + def from_input_fn(return_values): + """Returns an `_Inputs` instance according to `input_fn` return value.""" + if isinstance(return_values, dataset_ops.Dataset): + dataset = return_values + return _Inputs(dataset=dataset) + + features, labels = _Inputs._parse_inputs(return_values) + return _Inputs(features, labels) + + @staticmethod + def _parse_inputs(return_values): + if isinstance(return_values, tuple): + features, labels = return_values + else: + features, labels = return_values, None + return features, labels + + @property + def is_dataset(self): + """Returns True if the return value from input_fn is Dataset.""" + return self._dataset is not None + + def dataset_initializer_hook(self): + """Returns a `SessionRunHook` to initialize this dataset. + + This must be called before `features_and_labels`. + """ + iterator = self._dataset.make_initializable_iterator() + # pylint: disable=protected-access + hook = estimator_lib._DatasetInitializerHook(iterator) + self._iterator = iterator + return hook + + def features_and_labels(self): + """Gets `features` and `labels`.""" + if self.is_dataset: + return _Inputs._parse_inputs(self._iterator.get_next()) + + return (self._features, self._labels) + + def signals(self): + return self._signals + + @property + def dataset(self): + return self._dataset + + +# TODO(xiejw): Extend this to support final partial batch. 
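# An illustrative end-to-end sketch of how these input helpers compose
# (hypothetical user input_fn and tensors; not part of this module):
#
#   def predict_input_fn(params):
#     batch_size = params['batch_size']
#     dataset = tf.data.Dataset.from_tensor_slices({'x': features_array})
#     return dataset.batch(batch_size)
#
#   inputs = _Inputs.from_input_fn(predict_input_fn(params))
#   assert inputs.is_dataset
#   inputs = _InputsWithStoppingSignals(dataset=inputs.dataset,
#                                       batch_size=params['batch_size'])
#   hook = inputs.dataset_initializer_hook()         # initializes the iterator
#   features, labels = inputs.features_and_labels()  # from iterator.get_next()
#   signals = inputs.signals()  # {'stopping': [batch_size, 1] float32 tensor}
#
# The extra batch appended by _InputsWithStoppingSignals carries
# signals['stopping'] == 1.0; _StopSignals.as_scalar_stopping_signal reduces it
# to the scalar used as the tf.while_loop stopping condition, and
# _StoppingPredictHook uses the same signal to end the predict() generator.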
+class _InputsWithStoppingSignals(_Inputs): + """Inputs with `_StopSignals` inserted into the dataset.""" + + def __init__(self, dataset, batch_size): + + assert dataset is not None + + user_provided_dataset = dataset.map( + _InputsWithStoppingSignals.insert_stopping_signal( + stop=False, batch_size=batch_size)) + final_batch_dataset = dataset.take(1).map( + _InputsWithStoppingSignals.insert_stopping_signal( + stop=True, batch_size=batch_size)) + dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2) + + super(_InputsWithStoppingSignals, self).__init__(dataset=dataset) + self._current_inputs = None + + def features_and_labels(self): + if self._current_inputs is not None: + raise RuntimeError( + 'Internal Error: The previous inputs have not been properly ' + 'consumed. First call features_and_labels, then call signals.') + + inputs_with_signals = self._iterator.get_next() + features = inputs_with_signals['features'] + labels = inputs_with_signals.get('labels') + + self._current_inputs = inputs_with_signals + return features, labels + + def signals(self): + """Returns the `Signals` from `_Inputs`.""" + if self._current_inputs is None: + raise RuntimeError( + 'Internal Error: The current inputs have not been properly ' + 'generated. First call features_and_labels, then call signals.') + signals = self._current_inputs['signals'] + self._current_inputs = None + return signals + + @staticmethod + def insert_stopping_signal(stop, batch_size): + """Inserts stopping_signal into dataset via _map_fn. + + Here we change the data structure in the dataset, such that the return value + is a dictionary now and `features`, `labels`, and `signals` are three + distinguished keys in that dict. This provides a better structure, which + eases the process to decompose the inputs (see `features_and_labels`). + + Args: + stop: bool, state of current stopping signals. + batch_size: int, batch size. + + Returns: + A map_fn passed to dataset.map API. 
+ """ + + def _map_fn(*args): + features, labels = _Inputs._parse_inputs(args) + new_input_dict = {} + new_input_dict['features'] = features + if labels is not None: + new_input_dict['labels'] = labels + new_input_dict['signals'] = _StopSignals( + stop=stop, batch_size=batch_size).as_dict() + return new_input_dict + + return _map_fn + + +class _StopSignals(object): + """Signals class holding all logic to handle TPU stopping condition.""" + + NON_STOPPING_SIGNAL = 0.0 + STOPPING_SIGNAL = 1.0 + + def __init__(self, stop, batch_size): + self._stop = stop + self._batch_size = batch_size + + def as_dict(self): + shape = [self._batch_size, 1] + dtype = dtypes.float32 + + if self._stop: + stopping = array_ops.ones(shape=shape, dtype=dtype) + else: + stopping = array_ops.zeros(shape=shape, dtype=dtype) + + return {'stopping': stopping} + + @staticmethod + def as_scalar_stopping_signal(signals): + return array_ops.identity(signals['stopping'][0][0]) + + @staticmethod + def should_stop(scalar_stopping_signal): + return scalar_stopping_signal >= _StopSignals.STOPPING_SIGNAL + + +class _SignalsHelper(object): + """A general helper class to handle common signals manipulation.""" + + def __init__(self, signals): + self._signal_keys = [] + for key in sorted(signals.iterkeys()): + self._signal_keys.append(key) + + @property + def num_signals(self): + return len(self._signal_keys) + + def unflatten(self, tensor_list): + return dict(zip(self._signal_keys, tensor_list)) + + @staticmethod + def as_tensor_list(signals): + return [signals[key] for key in sorted(signals.iterkeys())] diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index cccaa2b833e..6db373d2d5e 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -26,6 +26,7 @@ py_library( "python/training/resample.py", "python/training/sampling_ops.py", "python/training/sequence_queueing_state_saver.py", + "python/training/tensor_queue_dataset.py", "python/training/training.py", "python/training/tuner.py", ], @@ -285,6 +286,28 @@ py_test( ], ) +py_test( + name = "tensor_queue_dataset_test", + size = "large", + srcs = ["python/training/tensor_queue_dataset_test.py"], + srcs_version = "PY2AND3", + tags = ["notsan"], + deps = [ + ":training_py", + "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:gradients", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform", + "//tensorflow/python:random_seed", + "//tensorflow/python:training", + "//tensorflow/python:variables", + "//tensorflow/python/data", + "//third_party/py/numpy", + ], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py new file mode 100644 index 00000000000..409aba817c1 --- /dev/null +++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py @@ -0,0 +1,200 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python wrappers for Datasets and Iterators.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.util import nest +from tensorflow.python.data.util import sparse +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.util import nest as tf_nest + + +class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.Dataset): + """A `Dataset` that prepends a queue to another `Dataset`. + + A vector of handles to the queue is returned as the first component of + the associated iterator. This vector can be passed to + `enqueue_in_queue_dataset` to add new elements to the queue. + """ + + def __init__(self, input_dataset, batch_size, padded_shapes, padding_values): + """Initialize `PrependFromQueueAndPaddedBatchDataset`.""" + super(_PrependFromQueueAndPaddedBatchDataset, self).__init__() + if sparse.any_sparse(input_dataset.output_classes): + raise TypeError( + "Batching of padded sparse tensors is not currently supported") + self._input_dataset = input_dataset + self._batch_size = ops.convert_to_tensor( + batch_size, dtype=dtypes.int64, name="batch_size") + # pylint: disable=protected-access + if padded_shapes is None: + self._padded_shapes = nest.map_structure( + dataset_ops._partial_shape_to_tensor, input_dataset.output_shapes) + else: + self._padded_shapes = nest.map_structure_up_to( + input_dataset.output_shapes, dataset_ops._partial_shape_to_tensor, + padded_shapes) + padding_values = ( + padding_values if padding_values is not None else + dataset_ops._default_padding(input_dataset)) + self._padding_values = nest.map_structure_up_to( + input_dataset.output_shapes, dataset_ops._padding_value_to_tensor, + padding_values, input_dataset.output_types) + # pylint: enable=protected-access + + def _as_variant_tensor(self): + # pylint: disable=protected-access + return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset( + self._input_dataset._as_variant_tensor(), + batch_size=self._batch_size, + padded_shapes=[ + ops.convert_to_tensor(s, dtype=dtypes.int64) + for s in nest.flatten(self._padded_shapes) + ], + padding_values=nest.flatten(self._padding_values), + output_shapes=nest.flatten( + sparse.as_dense_shapes(self.output_shapes, self.output_classes))) + # pylint: enable=protected-access + + @property + def output_classes(self): + return (ops.Tensor, self._input_dataset.output_classes) + + def _as_batch_shape(self, shape_like): + return tensor_shape.vector(None).concatenate( + tensor_util.constant_value_as_shape(shape_like)) + + @property + def output_shapes(self): + # First output is a variant representing the Queue + return (tensor_shape.vector(None), + nest.map_structure(self._as_batch_shape, self._padded_shapes)) + + @property + def 
output_types(self): + # First output is a variant representing the Queue + return (dtypes.variant, self._input_dataset.output_types) + + +def prepend_from_queue_and_padded_batch_dataset(batch_size, + padding_values=None, + padded_shapes=None): + """A transformation that prepends a queue to a `Dataset` and batches results. + + A vector of handles to the queue is returned as the first component of the + associated iterator. This vector can be passed to `enqueue_in_queue_dataset` + to add new elements to the queue. + + Below is an example of how this dataset might be used to split incoming + variable-length sequences into "head" and "rest" parts, where "rest" parts + are re-enqueued back into the dataset. A more realistic example would + perform some calculation on the "head" and modify some components of "rest" + with the result (before re-enqueueing). + + ```python + dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)]) + # Make a dataset of variable-length vectors and their lengths. + dataset = dataset.map(lambda count: (count, tf.ones((count,)))) + # Emit a queue we can prepend to, and counts/values as padded batch. + dataset = dataset.apply( + tf.contrib.training.prepend_from_queue_and_padded_batch_dataset( + batch_size=10)) + dataset = dataset.prefetch(1) + + iterator = dataset.make_one_shot_iterator() + queue, (count, padded_value) = iterator.get_next() + + # Split the padded_value into two pieces: head and rest + rest_indices = tf.squeeze(tf.where(count > 3), axis=1) + bound = tf.minimum(3, tf.reduce_max(count)) + value_head = padded_value[:, :bound] + count_rest = tf.gather(count - 3, rest_indices) + value_rest = tf.gather(padded_value[:, bound:], rest_indices) + queue_rest = tf.gather(queue, rest_indices) + enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset( + queue_rest, (count_rest, value_rest)) + with tf.control_dependencies([enqueue_rest_op]): + calculation = fn(value_head) + + while True: # Will raise OutOfRange when finished with all pieces. + session.run(calculation) + ``` + + Args: + batch_size: `int64` scalar tensor. The batch size to use when performing + padded batching. + padding_values: (optional) Nested tuple of scalar tensors. If provided, + the structure and dtypes of padding_values should match that of + incoming dataset's `output_types`. + padded_shapes: (optional) Nested tuple of `int64` vector tensors. + If provided, the structure must match that of the incoming dataset's + `output_types`. If not provided, the incoming dataset's `output_shapes` + is used. Any unknown (`None` or `-1`) dimensions in the shapes are + treated as being unique per-batch: for each batch time, an unknown + dimension is replaced with the maximum given value of this dimension + across all tensors for the given component in the batch. + + Returns: + A `Dataset` transformation function, which can be passed to + @{tf.data.Dataset.apply}. + """ + + def _apply_fn(dataset): + return _PrependFromQueueAndPaddedBatchDataset( + dataset, + batch_size=batch_size, + padding_values=padding_values, + padded_shapes=padded_shapes) + + return _apply_fn + + +def enqueue_in_queue_dataset(queue, components): + """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`. + + The components' dtypes and shapes must be compatible with the `output_shapes` + attribute of the `dataset` created by + `prepend_from_queue_and_padded_batch_dataset`. This operation supports both + non-batched and batched modes. 
+ + For more details, see the example in the docstring for + `prepend_from_queue_and_padded_batch_dataset`. + + Args: + queue: `variant` scalar or vector tensor. + The tensor emitted by the first component of the iterator associated with + `prepend_from_queue_and_padded_batch_dataset`. If this is a scalar, + then the `components` input tensors should not have a prepended batch + dimension. + components: Nested tuple of tensors, each with a leading batch dimension + if `queue` is a vector. The structure, dtypes, and shapes + (excluding batch dimension) must match the nested tuples + `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue + output types and shapes) of the `dataset` emitted by + the original `prepend_from_queue_and_padded_batch_dataset` call. + + Returns: + An `Operation` that enqueues `components` into the dataset(s) associated + with entries of `queue`. + """ + return gen_dataset_ops.enqueue_in_queue_dataset( + queue=queue, components=tf_nest.flatten(components)) diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py new file mode 100644 index 00000000000..0338f409a20 --- /dev/null +++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py @@ -0,0 +1,355 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for TensorQueueDataset.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base +from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import string_ops +from tensorflow.python.platform import test + + +class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase): + + def testNoEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types) + self.assertAllEqual(([None],) * 2, + [x.as_list() for x in dataset.output_shapes]) + iterator = dataset.make_one_shot_iterator() + _, value = iterator.get_next() + self.assertEqual([0], self.evaluate(value)) + self.assertEqual([1], self.evaluate(value)) + self.assertEqual([2], self.evaluate(value)) + with self.assertRaisesOpError("End of sequence"): + self.evaluate(value) + + def testBatchedNoEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2)) + iterator = dataset.make_one_shot_iterator() + _, value = iterator.get_next() + self.assertAllEqual([0, 1], self.evaluate(value)) + self.assertAllEqual([2], self.evaluate(value)) + with self.assertRaisesOpError("End of sequence"): + self.evaluate(value) + + def testBatchedWithBiggerPaddingNoEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=2, padded_shapes=[3])) + iterator = dataset.make_one_shot_iterator() + _, value = iterator.get_next() + self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value)) + self.assertAllEqual([[2, 0, 0]], self.evaluate(value)) + with self.assertRaisesOpError("End of sequence"): + self.evaluate(value) + + def testBatchedWithBiggerPaddingOneEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=1, padded_shapes=[3])) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) + with self.test_session() as sess: + self.assertAllEqual([[0, 0, 0]], sess.run(value)) + value_1, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([[1, 0, 0]], value_1) + value_2, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([[-1, 0, 0]], value_2) + value_3 = sess.run(value) + self.assertAllEqual([[1, 0, 0]], value_3) + value_4, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([[2, 0, 0]], value_4) + value_5 = sess.run(value) + self.assertAllEqual([[-2, 0, 0]], value_5) + with self.assertRaisesOpError("End of sequence"): + sess.run(value) + + def testOneEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = 
dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) + with self.test_session() as sess: + self.assertEqual([0], sess.run(value)) + value_1, _ = sess.run([value, enqueue_negative]) + self.assertEqual([1], value_1) + value_2, _ = sess.run([value, enqueue_negative]) + self.assertEqual([-1], value_2) + value_3 = sess.run(value) + self.assertEqual([1], value_3) + value_4, _ = sess.run([value, enqueue_negative]) + self.assertEqual([2], value_4) + value_5 = sess.run(value) + self.assertEqual([-2], value_5) + with self.assertRaisesOpError("End of sequence"): + sess.run(value) + + def testBatchedOneEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2)) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value) + enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]], + array_ops.expand_dims( + value[0], axis=0)) + with self.test_session() as sess: + value_0, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([0, 1], value_0) + value_1, _ = sess.run([value, enqueue_zeroth]) + self.assertAllEqual([0, -1], value_1) + value_2, _ = sess.run([value, enqueue_negative]) + self.assertAllEqual([0, 2], value_2) + self.assertAllEqual([0, -2], sess.run(value)) + with self.assertRaisesOpError("End of sequence"): + sess.run(value) + + def testManyEnqueue(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue_many_more = [ + tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i) + for i in range(1000) + ] + with self.test_session() as sess: + value_0, _ = sess.run((value, enqueue_many_more)) + self.assertEqual([0], value_0) + rest = [] + for _ in range(1000): + rest.append(sess.run(value)) + self.assertEquals([[100 + i] for i in range(1000)], sorted(rest)) + # Going back to the original input. + value_1, _ = sess.run((value, enqueue_many_more)) + self.assertEqual(1, value_1) + rest = [] + for _ in range(1000): + rest.append(sess.run(value)) + self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest)) + with self.assertRaisesOpError("End of sequence"): + sess.run(value) + + def testEnqueueWithPrefetch(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + # Prefetching will request additional values before they are + # available to the queue. 
+ dataset = dataset.prefetch(buffer_size=3) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1) + with self.test_session() as sess: + i = 0 + while i < 4: + received, _ = sess.run((value, enqueue)) + if received.size > 0: + self.assertAllEqual([i], received) + i += 1 + received_last = False + while True: + try: + received = sess.run(value) + if received.size > 0: + self.assertAllEqual([4], received) + received_last = True + except errors.OutOfRangeError: + break + self.assertTrue(received_last) + + def testDatasetWithPaddedShapeSmallerThanInputFails(self): + dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=1, padded_shapes=[2])) + iterator = dataset.make_one_shot_iterator() + _, value = iterator.get_next() + with self.test_session() as sess: + with self.assertRaisesOpError( + r"Incompatible input shapes at component 0 between " + r"input dataset this dataset: \[3\] vs. \[2\]"): + sess.run(value) + + def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + iterator = dataset.make_one_shot_iterator() + queue_handle, value = iterator.get_next() + + enqueue_bad_structure = tqd.enqueue_in_queue_dataset( + queue_handle, (value, value)) + enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle, + np.array( + [1.0], + dtype=np.float32)) + enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset( + queue_handle, ([1],)) + enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle, + np.array( + [[1]], dtype=np.int32)) + + with self.test_session() as sess: + with self.assertRaisesOpError( + "mismatched number of tensors. Queue expects 1 tensors but " + "tried to insert 2"): + sess.run(enqueue_bad_structure) + with self.assertRaisesOpError(r"Expected component 0 to have batched " + r"shape \[1,...\], but saw shape: \[\]"): + sess.run(enqueue_bad_shape_no_batch_dim) + with self.assertRaisesOpError( + r"mismatched shapes at component 0. Attempted to insert tensor " + r"with shape \[1\] but queue expected shape: \[\]"): + sess.run(enqueue_bad_shape) + with self.assertRaisesOpError( + r"mismatched dtypes at component 0. Attempted to insert tensor " + r"of type float but queue expected type: int32"): + sess.run(enqueue_bad_dtype) + + def testEnqueueWithPaddedBatchFailsWithInformativeError(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2]) + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1)) + with self.assertRaisesRegexp( + TypeError, r"Unable to create padding for field of type 'variant'"): + dataset.padded_batch(batch_size=10, padded_shapes=[1]) + + def testOneEnqueueWithPadding(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6]) + # Make a dataset of variable-length vectors and their lengths. + dataset = dataset.map( + lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype))) + # Emit a queue we can prepend to, and counts/values as padded + # batch. 
+ dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3)) + + iterator = dataset.make_one_shot_iterator() + queue, (count, padded_value) = iterator.get_next() + + # Split the padded_value into two pieces: head and rest + rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1) + bound = math_ops.minimum(2, math_ops.reduce_max(count)) + value_head = padded_value[:, :bound] + count_rest = array_ops.gather(count - 2, rest_indices) + value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:] + queue_rest = array_ops.gather(queue, rest_indices) + enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest, + (count_rest, value_rest)) + with ops.control_dependencies([enqueue_rest_op]): + calc = array_ops.identity(value_head) + + with self.test_session() as sess: + self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc)) + self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc)) + self.assertAllEqual([[6, 6]], sess.run(calc)) + self.assertAllEqual([[6, 6]], sess.run(calc)) + # Get some final batches due to prefetching. + for _ in range(3): + try: + self.assertAllEqual( + np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc)) + except errors.OutOfRangeError as e: + self.assertTrue(str(e).startswith("End of sequence")) + + def testNonstandardPadding(self): + dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6]) + # Make a dataset of variable-length vectors and their lengths. + dataset = dataset.map( + lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype))) + # Emit a queue we can prepend to, and counts/values as padded + # batch. + dataset = dataset.apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=3, padding_values=( + 0, + -1, + ))) + + iterator = dataset.make_one_shot_iterator() + _, (unused_count, padded_value) = iterator.get_next() + + with self.test_session() as sess: + self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]], + sess.run(padded_value)) + self.assertAllEqual([[6] * 6], sess.run(padded_value)) + with self.assertRaisesOpError("End of sequence"): + sess.run(padded_value) + + +# TODO(ebrevdo): Figure out how to use run_core_tests to test state +# saving of an iterator that's had some tensors enqueued into its queue. 
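For orientation, the enqueue-and-recirculate pattern that the tests above exercise reduces to the following minimal sketch. It is an illustration only (not part of the patch) and assumes TensorFlow 1.5 with the same contrib module imported at the top of this test file:

```python
# Minimal usage sketch of the queue-prepended dataset exercised above.
import tensorflow as tf
from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd

dataset = tf.data.Dataset.from_tensor_slices([0, 1, 2])
dataset = dataset.apply(
    tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
queue_handle, value = dataset.make_one_shot_iterator().get_next()
# Feed a transformed value back into the queue; it is dequeued ahead of the
# remaining input elements on later get_next() calls.
enqueue = tqd.enqueue_in_queue_dataset(queue_handle, -value)

with tf.Session() as sess:
  print(sess.run(value))             # [0]
  v, _ = sess.run([value, enqueue])  # returns [1] and enqueues [-1]
  print(sess.run(value))             # [-1]: the enqueued element comes first
```

As in `testOneEnqueue`, elements pushed with `enqueue_in_queue_dataset` take priority over the underlying dataset, which is what the serialization tests below must also preserve across checkpoints.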
+class PrependFromQueueAndPaddedBatchDatasetSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase): + + def testPrependFromQueueAndPaddedBatch(self): + + def build_dataset(seq_lens): + return dataset_ops.Dataset.from_tensor_slices(seq_lens).map( + lambda x: array_ops.fill([x], x)).apply( + tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4)) + + seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32) + seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32) + self.run_core_tests(lambda: build_dataset(seq_lens1), + lambda: build_dataset(seq_lens2), 8) + + def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self): + + def build_dataset(seq_lens): + + def fill_tuple(x): + filled = array_ops.fill([x], x) + return (filled, string_ops.as_string(filled)) + + padded_shape = [-1] + return dataset_ops.Dataset.from_tensor_slices(seq_lens).map( + fill_tuple).apply( + tqd.prepend_from_queue_and_padded_batch_dataset( + batch_size=4, + padded_shapes=(padded_shape, padded_shape), + padding_values=(-1, ""))) + + seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32) + seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32) + self.run_core_tests(lambda: build_dataset(seq_lens1), + lambda: build_dataset(seq_lens2), 8) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc index 2992a61ea81..9675428e56e 100644 --- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc +++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc @@ -142,9 +142,9 @@ Status ConvertConstantsToImmutable(const string& in_graph_filename, const auto load_graph_status = ReadBinaryProto(default_env, in_graph_filename, &graph_def); if (!load_graph_status.ok()) { - return tensorflow::errors::NotFound("Failed to load graph at '", - in_graph_filename, "' : ", - load_graph_status.error_message()); + return tensorflow::errors::NotFound( + "Failed to load graph at '", in_graph_filename, + "' : ", load_graph_status.error_message()); } NodeConverter node_converter; diff --git a/tensorflow/contrib/util/inspect_checkpoint.cc b/tensorflow/contrib/util/inspect_checkpoint.cc index 39088aeaad6..9b578ceb075 100644 --- a/tensorflow/contrib/util/inspect_checkpoint.cc +++ b/tensorflow/contrib/util/inspect_checkpoint.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/util/tensor_slice_reader.h" namespace tensorflow { diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc index 47ed83f521c..1a0b5028feb 100644 --- a/tensorflow/contrib/verbs/verbs_server_lib.cc +++ b/tensorflow/contrib/verbs/verbs_server_lib.cc @@ -49,8 +49,8 @@ VerbsServer::~VerbsServer() { Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def, GrpcChannelCache** channel_cache) { string name_prefix = - strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:", - server_def.task_index()); + strings::StrCat("/job:", server_def.job_name(), "/replica:0", + "/task:", server_def.task_index()); GrpcChannelSpec channel_spec; TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec)); diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 29c515121e7..4ba84b420f2 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -381,7 +381,6 @@ cc_library( srcs = ["platform/stacktrace_handler.cc"], hdrs = ["platform/stacktrace_handler.h"], deps = [ - ":abi", ":lib", ":lib_platform", ], @@ -434,6 +433,7 @@ tf_cuda_library( "framework/common_shape_fns.h", "framework/control_flow.h", # TODO(josh11b): Make internal? "framework/dataset.h", + "framework/dataset_stateful_op_whitelist.h", "framework/device_base.h", "framework/function.h", "framework/graph_def_util.h", @@ -611,6 +611,7 @@ tf_gen_op_libs( "list_ops", "lookup_ops", "logging_ops", + "manip_ops", "math_ops", "nn_ops", "no_op", @@ -693,6 +694,7 @@ cc_library( ":list_ops_op_lib", ":logging_ops_op_lib", ":lookup_ops_op_lib", + ":manip_ops_op_lib", ":math_ops_op_lib", ":nn_ops_op_lib", ":no_op_op_lib", @@ -830,6 +832,7 @@ cc_library( "//tensorflow/core/kernels:list_kernels", "//tensorflow/core/kernels:lookup", "//tensorflow/core/kernels:logging", + "//tensorflow/core/kernels:manip", "//tensorflow/core/kernels:math", "//tensorflow/core/kernels:multinomial_op", "//tensorflow/core/kernels:nn", @@ -1152,6 +1155,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], alwayslink = 1, @@ -1352,6 +1356,13 @@ tf_pyclif_proto_library( visibility = ["//visibility:public"], ) +tf_pyclif_proto_library( + name = "protobuf/device_properties_pyclif", + proto_lib = ":protos_all_cc", + proto_srcfile = "protobuf/device_properties.proto", + visibility = ["//visibility:public"], +) + # ----------------------------------------------------------------------------- # Internal targets @@ -1899,6 +1910,7 @@ cc_library( tf_cuda_library( name = "cuda_device_functions", hdrs = ["util/cuda_device_functions.h"], + cuda_deps = ["//third_party_gpus/cuda:cuda_headers"], visibility = ["//visibility:public"], deps = [":framework_lite"], ) @@ -2413,7 +2425,6 @@ cc_library( deps = [ ":lib", ":lib_internal", - ":stacktrace_handler", ":test", # buildcleaner: keep "//tensorflow/core/platform/default/build_config:test_main", ], diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD index 81187ff6b77..58dbac4e8ed 100644 --- a/tensorflow/core/api_def/BUILD +++ b/tensorflow/core/api_def/BUILD @@ -96,6 +96,7 @@ tf_cc_test( srcs = ["api_test.cc"], data = [ 
":base_api_def", + ":python_api_def", ], deps = [ ":excluded_ops_lib", diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc index 112c55ccc3b..477a0b670e4 100644 --- a/tensorflow/core/api_def/api_test.cc +++ b/tensorflow/core/api_def/api_test.cc @@ -41,8 +41,9 @@ namespace tensorflow { namespace { constexpr char kDefaultApiDefDir[] = "tensorflow/core/api_def/base_api"; +constexpr char kPythonApiDefDir[] = + "tensorflow/core/api_def/python_api"; constexpr char kApiDefFilePattern[] = "api_def_*.pbtxt"; -} // namespace // Reads golden ApiDef files and returns a map from file name to ApiDef file // contents. @@ -66,9 +67,93 @@ void GetGoldenApiDefs(Env* env, const string& api_files_dir, } } -class ApiTest : public ::testing::Test { +void TestAllApiDefsHaveCorrespondingOp( + const OpList& ops, const std::unordered_map& api_defs_map) { + std::unordered_set op_names; + for (const auto& op : ops.op()) { + op_names.insert(op.name()); + } + for (const auto& name_and_api_def : api_defs_map) { + ASSERT_TRUE(op_names.find(name_and_api_def.first) != op_names.end()) + << name_and_api_def.first << " op has ApiDef but missing from ops. " + << "Does api_def_" << name_and_api_def.first << " need to be deleted?"; + } +} + +void TestAllApiDefInputArgsAreValid( + const OpList& ops, const std::unordered_map& api_defs_map) { + for (const auto& op : ops.op()) { + const auto api_def_iter = api_defs_map.find(op.name()); + if (api_def_iter == api_defs_map.end()) { + continue; + } + const auto& api_def = api_def_iter->second; + for (const auto& api_def_arg : api_def.in_arg()) { + bool found_arg = false; + for (const auto& op_arg : op.input_arg()) { + if (api_def_arg.name() == op_arg.name()) { + found_arg = true; + break; + } + } + ASSERT_TRUE(found_arg) + << "Input argument " << api_def_arg.name() + << " (overwritten in api_def_" << op.name() + << ".pbtxt) is not defined in OpDef for " << op.name(); + } + } +} + +void TestAllApiDefOutputArgsAreValid( + const OpList& ops, const std::unordered_map& api_defs_map) { + for (const auto& op : ops.op()) { + const auto api_def_iter = api_defs_map.find(op.name()); + if (api_def_iter == api_defs_map.end()) { + continue; + } + const auto& api_def = api_def_iter->second; + for (const auto& api_def_arg : api_def.out_arg()) { + bool found_arg = false; + for (const auto& op_arg : op.output_arg()) { + if (api_def_arg.name() == op_arg.name()) { + found_arg = true; + break; + } + } + ASSERT_TRUE(found_arg) + << "Output argument " << api_def_arg.name() + << " (overwritten in api_def_" << op.name() + << ".pbtxt) is not defined in OpDef for " << op.name(); + } + } +} + +void TestAllApiDefAttributeNamesAreValid( + const OpList& ops, const std::unordered_map& api_defs_map) { + for (const auto& op : ops.op()) { + const auto api_def_iter = api_defs_map.find(op.name()); + if (api_def_iter == api_defs_map.end()) { + continue; + } + const auto& api_def = api_def_iter->second; + for (const auto& api_def_attr : api_def.attr()) { + bool found_attr = false; + for (const auto& op_attr : op.attr()) { + if (api_def_attr.name() == op_attr.name()) { + found_attr = true; + } + } + ASSERT_TRUE(found_attr) + << "Attribute " << api_def_attr.name() << " (overwritten in api_def_" + << op.name() << ".pbtxt) is not defined in OpDef for " << op.name(); + } + } +} +} // namespace + +class BaseApiTest : public ::testing::Test { protected: - ApiTest() { + BaseApiTest() { OpRegistry::Global()->Export(false, &ops_); const std::vector multi_line_fields = {"description"}; @@ -80,7 +165,7 
@@ class ApiTest : public ::testing::Test { }; // Check that all ops have an ApiDef. -TEST_F(ApiTest, AllOpsAreInApiDef) { +TEST_F(BaseApiTest, AllOpsAreInApiDef) { auto* excluded_ops = GetExcludedOps(); for (const auto& op : ops_.op()) { if (excluded_ops->find(op.name()) != excluded_ops->end()) { @@ -94,16 +179,8 @@ TEST_F(ApiTest, AllOpsAreInApiDef) { } // Check that ApiDefs have a corresponding op. -TEST_F(ApiTest, AllApiDefsHaveCorrespondingOp) { - std::unordered_set op_names; - for (const auto& op : ops_.op()) { - op_names.insert(op.name()); - } - for (const auto& name_and_api_def : api_defs_map_) { - ASSERT_TRUE(op_names.find(name_and_api_def.first) != op_names.end()) - << name_and_api_def.first << " op has ApiDef but missing from ops. " - << "Does api_def_" << name_and_api_def.first << " need to be deleted?"; - } +TEST_F(BaseApiTest, AllApiDefsHaveCorrespondingOp) { + TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_); } string GetOpDefHasDocStringError(const string& op_name) { @@ -117,7 +194,7 @@ string GetOpDefHasDocStringError(const string& op_name) { // Check that OpDef's do not have descriptions and summaries. // Descriptions and summaries must be in corresponding ApiDefs. -TEST_F(ApiTest, OpDefsShouldNotHaveDocs) { +TEST_F(BaseApiTest, OpDefsShouldNotHaveDocs) { auto* excluded_ops = GetExcludedOps(); for (const auto& op : ops_.op()) { if (excluded_ops->find(op.name()) != excluded_ops->end()) { @@ -143,62 +220,56 @@ TEST_F(ApiTest, OpDefsShouldNotHaveDocs) { // Checks that input arg names in an ApiDef match input // arg names in corresponding OpDef. -TEST_F(ApiTest, AllApiDefInputArgsAreValid) { - for (const auto& op : ops_.op()) { - const auto& api_def = api_defs_map_[op.name()]; - for (const auto& api_def_arg : api_def.in_arg()) { - bool found_arg = false; - for (const auto& op_arg : op.input_arg()) { - if (api_def_arg.name() == op_arg.name()) { - found_arg = true; - break; - } - } - ASSERT_TRUE(found_arg) - << "Input argument " << api_def_arg.name() - << " (overwritten in api_def_" << op.name() - << ".pbtxt) is not defined in OpDef for " << op.name(); - } - } +TEST_F(BaseApiTest, AllApiDefInputArgsAreValid) { + TestAllApiDefInputArgsAreValid(ops_, api_defs_map_); } // Checks that output arg names in an ApiDef match output // arg names in corresponding OpDef. -TEST_F(ApiTest, AllApiDefOutputArgsAreValid) { - for (const auto& op : ops_.op()) { - const auto& api_def = api_defs_map_[op.name()]; - for (const auto& api_def_arg : api_def.out_arg()) { - bool found_arg = false; - for (const auto& op_arg : op.output_arg()) { - if (api_def_arg.name() == op_arg.name()) { - found_arg = true; - break; - } - } - ASSERT_TRUE(found_arg) - << "Output argument " << api_def_arg.name() - << " (overwritten in api_def_" << op.name() - << ".pbtxt) is not defined in OpDef for " << op.name(); - } - } +TEST_F(BaseApiTest, AllApiDefOutputArgsAreValid) { + TestAllApiDefOutputArgsAreValid(ops_, api_defs_map_); } // Checks that attribute names in an ApiDef match attribute // names in corresponding OpDef. 
-TEST_F(ApiTest, AllApiDefAttributeNamesAreValid) { - for (const auto& op : ops_.op()) { - const auto& api_def = api_defs_map_[op.name()]; - for (const auto& api_def_attr : api_def.attr()) { - bool found_attr = false; - for (const auto& op_attr : op.attr()) { - if (api_def_attr.name() == op_attr.name()) { - found_attr = true; - } - } - ASSERT_TRUE(found_attr) - << "Attribute " << api_def_attr.name() << " (overwritten in api_def_" - << op.name() << ".pbtxt) is not defined in OpDef for " << op.name(); - } - } +TEST_F(BaseApiTest, AllApiDefAttributeNamesAreValid) { + TestAllApiDefAttributeNamesAreValid(ops_, api_defs_map_); } + +class PythonApiTest : public ::testing::Test { + protected: + PythonApiTest() { + OpRegistry::Global()->Export(false, &ops_); + const std::vector multi_line_fields = {"description"}; + + Env* env = Env::Default(); + GetGoldenApiDefs(env, kPythonApiDefDir, &api_defs_map_); + } + OpList ops_; + std::unordered_map api_defs_map_; +}; + +// Check that ApiDefs have a corresponding op. +TEST_F(PythonApiTest, AllApiDefsHaveCorrespondingOp) { + TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_); +} + +// Checks that input arg names in an ApiDef match input +// arg names in corresponding OpDef. +TEST_F(PythonApiTest, AllApiDefInputArgsAreValid) { + TestAllApiDefInputArgsAreValid(ops_, api_defs_map_); +} + +// Checks that output arg names in an ApiDef match output +// arg names in corresponding OpDef. +TEST_F(PythonApiTest, AllApiDefOutputArgsAreValid) { + TestAllApiDefOutputArgsAreValid(ops_, api_defs_map_); +} + +// Checks that attribute names in an ApiDef match attribute +// names in corresponding OpDef. +TEST_F(PythonApiTest, AllApiDefAttributeNamesAreValid) { + TestAllApiDefAttributeNamesAreValid(ops_, api_defs_map_); +} + } // namespace tensorflow diff --git a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt index 5d21d7bab69..ac05b54eea9 100644 --- a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt @@ -20,10 +20,7 @@ END } summary: "Adds a value to the current value of a variable." description: < [3, 4, 0, 1, 2] + +# shifting along multiple dimensions +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]] + +# shifting along the same axis multiple times +# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] +roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]] +``` +END +} diff --git a/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt new file mode 100644 index 00000000000..97c380700a2 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt @@ -0,0 +1,32 @@ +op { + graph_op_name: "UnravelIndex" + in_arg { + name: "indices" + description: <ChunkFromHandle(prev); strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false)); diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc index 0507076c8c3..fd9c4222a7a 100644 --- a/tensorflow/core/common_runtime/device_set_test.cc +++ b/tensorflow/core/common_runtime/device_set_test.cc @@ -88,7 +88,9 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) { // D3 is prioritized below D1. 
AddDevice("d3", "/job:a/replica:0/task:0/device:d3:0"); EXPECT_EQ((std::vector{ - DeviceType("d2"), DeviceType("d1"), DeviceType("d3"), + DeviceType("d2"), + DeviceType("d1"), + DeviceType("d3"), }), types()); } diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 20c59ad42b3..df6f4b88773 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -61,7 +61,6 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/env_var.h" - namespace tensorflow { namespace { @@ -472,9 +471,9 @@ Status DirectSession::Run(const RunOptions& run_options, Executor::Args args; args.step_id = step_id_counter_.fetch_add(1); - TF_RETURN_IF_ERROR( - GetOrCreateExecutors(input_tensor_names, output_names, target_nodes, - &executors_and_keys, &run_state_args)); + TF_RETURN_IF_ERROR(GetOrCreateExecutors(input_tensor_names, output_names, + target_nodes, &executors_and_keys, + &run_state_args)); const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1); std::unique_ptr debugger_state; diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index 99b33e2ef0d..b75a4f76d94 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -436,10 +436,7 @@ TEST(DirectSessionTest, FetchMultipleTimes) { } } -REGISTER_OP("Darth") - .Input("x: float") - .Output("y: float") - .Doc(R"doc( +REGISTER_OP("Darth").Input("x: float").Output("y: float").Doc(R"doc( Darth promises one return value. x: float @@ -972,39 +969,38 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib, std::atomic num_done(0); // Runs session to compute :0 using inter_op thread pool . 
- auto add_session_run_call = [use_global_pools, &def, &options, &sessions, - &sessions_mu, - &num_done](thread::ThreadPool* tp, Node* node, - int inter_op_pool) { - auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu, - inter_op_pool, node, &num_done]() { - RunOptions run_options; - run_options.set_inter_op_thread_pool(inter_op_pool); - std::vector outputs; + auto add_session_run_call = + [use_global_pools, &def, &options, &sessions, &sessions_mu, &num_done]( + thread::ThreadPool* tp, Node* node, int inter_op_pool) { + auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu, + inter_op_pool, node, &num_done]() { + RunOptions run_options; + run_options.set_inter_op_thread_pool(inter_op_pool); + std::vector outputs; - Session* session; - if (use_global_pools) { - std::unique_ptr s(NewSession(options)); - TF_ASSERT_OK(s->Create(def)); - session = s.get(); + Session* session; + if (use_global_pools) { + std::unique_ptr s(NewSession(options)); + TF_ASSERT_OK(s->Create(def)); + session = s.get(); - mutex_lock l(sessions_mu); - sessions.emplace_back(std::move(s)); - } else { - session = sessions[0].get(); - } + mutex_lock l(sessions_mu); + sessions.emplace_back(std::move(s)); + } else { + session = sessions[0].get(); + } - Status s = session->Run(run_options, {} /* inputs */, - {node->name() + ":0"} /* output_names */, {}, - &outputs, nullptr /* run_metadata */); - TF_CHECK_OK(s); - ASSERT_EQ(1, outputs.size()); - auto flat = outputs[0].flat(); - EXPECT_FLOAT_EQ(1.2, flat(0)); - num_done.fetch_add(1); - }; - tp->Schedule(fn); - }; + Status s = session->Run(run_options, {} /* inputs */, + {node->name() + ":0"} /* output_names */, {}, + &outputs, nullptr /* run_metadata */); + TF_CHECK_OK(s); + ASSERT_EQ(1, outputs.size()); + auto flat = outputs[0].flat(); + EXPECT_FLOAT_EQ(1.2, flat(0)); + num_done.fetch_add(1); + }; + tp->Schedule(fn); + }; // For blocking states: // - Starts at 0, BlockingOp::Compute will move to 1. 
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc index df9cf0c91f1..31fb128f937 100644 --- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc +++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc @@ -161,14 +161,14 @@ static void TestHWAccelerator(bool enableHWTrace) { x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); #ifdef TENSORFLOW_USE_SYCL x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); #ifdef TENSORFLOW_USE_SYCL -y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); -#endif // TENSORFLOW_USE_SYCL + y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); +#endif // TENSORFLOW_USE_SYCL Node* y_neg = test::graph::Unary(&graph, "Neg", y); y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); @@ -181,7 +181,7 @@ y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); (*options.config.mutable_device_count())["GPU"] = 1; #ifdef TENSORFLOW_USE_SYCL (*options.config.mutable_device_count())["SYCL"] = 1; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL options.config.set_allow_soft_placement(true); options.config.mutable_graph_options()->set_build_cost_model(1); std::unique_ptr session(NewSession(options)); diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 9d03caff1e1..e3416da988c 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1609,7 +1609,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { auto done = [this, state]() { Device* device = impl_->params_.device; NodeExecStatsWrapper* stats = state->stats; // Shorthand - Entry* first_input = state->first_input; // Shorthand + Entry* first_input = state->first_input; // Shorthand nodestats::SetOpEnd(stats); EntryVector outputs; @@ -1776,6 +1776,19 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input, entry->ref_mu = nullptr; inp->tensor = entry->val.get(); + // The dtype of entry->ref could have been changed by another operation + // that ran after the operation that "produced" it executed, so + // re-validate that the type of the dereferenced tensor matches the + // expected input type. 
+ if (item.input_type(i) != inp->tensor->dtype()) { + return AttachDef( + errors::InvalidArgument( + i, "-th input expects type ", + DataTypeString(item.input_type(i)), + " but automatically dereferenced input tensor has type ", + DataTypeString(inp->tensor->dtype())), + item.kernel->def()); + } } } } diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index e9c4328f29e..e1b5404b795 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -97,12 +97,11 @@ static Node* AddNoOp(Graph* g) { static Node* AddIdentity(Graph* g, Endpoint input) { DCHECK_LT(0, input.dtype()); - DCHECK_LT(input.dtype(), DT_FLOAT_REF); NodeDef ndef; ndef.set_name(g->NewName(kNodeLabel)); ndef.set_op("Identity"); ndef.add_input(input.name()); - AddNodeAttr("T", input.dtype(), &ndef); + AddNodeAttr("T", BaseType(input.dtype()), &ndef); Status s; Node* ret = g->AddNode(ndef, &s); TF_CHECK_OK(s); @@ -205,7 +204,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { // The instantiated and transformed function is encoded as a Graph // object, and an executor is created for the graph. struct Item : public core::RefCounted { - const Graph* graph = nullptr; // Owned by exec. + const Graph* graph = nullptr; // Owned by exec. const FunctionLibraryDefinition* overlay_lib = nullptr; // Not owned. FunctionBody* func_graph = nullptr; Executor* exec = nullptr; diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 9e4b617d2bd..67caeb3495c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -154,8 +154,9 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) { a.DeallocateRaw(t3); a.DeallocateRaw(t4); } - CheckStats(&a, 4097, 0, 1024 * sizeof(float) + 1048576 * sizeof(int64) + - 2048 * sizeof(double) + 10485760 * sizeof(float), + CheckStats(&a, 4097, 0, + 1024 * sizeof(float) + 1048576 * sizeof(int64) + + 2048 * sizeof(double) + 10485760 * sizeof(float), 10485760 * sizeof(float)); // At the end, we should have coalesced all memory into one region diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 933d700f604..80a5bdbfff4 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -762,9 +762,11 @@ int64 MinSystemMemory(int64 available_memory) { // is necessary. 
min_system_memory *= 2; #endif + #if defined(ANDROID_TEGRA) - // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM and Video RAM - min_system_memory = 1<<30; + // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM + // and Video RAM + min_system_memory = 1 << 30; #endif return min_system_memory; } diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc index 7763a4f2e6f..2500425359c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc @@ -108,7 +108,8 @@ TEST_F(GpuStreamUtilTest, StreamOverrides) { ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0, "/device:GPU:0"); Output n = ops::MatMul(root, {}, {}); - ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, "/cpu:0"); + ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, + "/cpu:0"); Graph g(OpRegistry::Global()); TF_ASSERT_OK(root.ToGraph(&g)); diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc index 995fd1253fb..b195de7cbac 100644 --- a/tensorflow/core/common_runtime/gpu/process_state.cc +++ b/tensorflow/core/common_runtime/gpu/process_state.cc @@ -88,8 +88,8 @@ ProcessState::~ProcessState() { } string ProcessState::MemDesc::DebugString() { - return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ", - gpu_registered, ", nic: ", nic_registered); + return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, + ", dma: ", gpu_registered, ", nic: ", nic_registered); } ProcessState::MemDesc ProcessState::PtrType(const void* ptr) { @@ -230,8 +230,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) { // TODO(tucker): actually maintain separate CPUAllocators for // different numa_nodes. For now, just one. numa_node = 0; - mutex_lock lock(mu_); + { + // Here we optimize the most common use case where cuda_host_allocators_ + // and cuda_al_ have already been populated and since we're only reading + // these vectors, we can get by with a shared lock. In the slower case, + // we take a unique lock and populate these vectors. + tf_shared_lock lock(mu_); + + if (FLAGS_brain_gpu_record_mem_types && + static_cast(cuda_al_.size()) > 0) { + return cuda_al_[0]; + } + if (static_cast(cuda_host_allocators_.size()) > numa_node) { + return cuda_host_allocators_[0]; + } + } + + mutex_lock lock(mu_); // Find the first valid StreamExecutor to request CUDA host memory // through, since any will work. 
// diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index 3b309e915cd..33a5d60eb7e 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -340,8 +340,11 @@ Status GraphExecutionState::OptimizeGraph( std::unordered_map device_map; Device* cpu_device = nullptr; for (const auto& device : device_set_->devices()) { - device_map[device->name()] = - grappler::GetDeviceInfo(device->parsed_name()); + DeviceProperties props = grappler::GetDeviceInfo(device->parsed_name()); + if (props.type() == "UNKNOWN") { + continue; + } + device_map[device->name()] = props; if (device->parsed_name().id == 0 && StringPiece(device->parsed_name().type) == "CPU" && device->GetAllocator(AllocatorAttributes()) != nullptr) { diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h index db2686ce2c4..2312e1a89fd 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.h +++ b/tensorflow/core/common_runtime/graph_execution_state.h @@ -139,9 +139,7 @@ class GraphExecutionState { // The graph returned by BuildGraph may contain only the pruned // graph, whereas some clients may want access to the full graph. - const Graph* full_graph() { - return graph_; - } + const Graph* full_graph() { return graph_; } // Returns the node with the given name, or null if it does not exist. const Node* get_node_by_name(const string& name) const { diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc index 76b926ba400..090a16ebeb1 100644 --- a/tensorflow/core/common_runtime/memory_types.cc +++ b/tensorflow/core/common_runtime/memory_types.cc @@ -47,7 +47,7 @@ struct EndpointEq { static Status ProcessMemoryTypes( const DeviceType& device_type, const Graph* g, const std::function& fn) { - if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL ) { + if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL) { // On non-GPU and non-SYCL devices, HOST_MEMORY and DEVICE_MEMORY are always // compatible. return Status::OK(); diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc index 2a834ddca42..a0935855719 100644 --- a/tensorflow/core/common_runtime/memory_types_test.cc +++ b/tensorflow/core/common_runtime/memory_types_test.cc @@ -36,7 +36,7 @@ TEST(MemoryTypeChecker, Int32OK) { #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g)); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL delete g; } @@ -64,7 +64,7 @@ TEST(MemoryTypeChecker, Int32NotOk) { // But we can insert _HostSend/_HostRecv to ensure the invariant. TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_SYCL, "/device:SYCL:0", g)); TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g)); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL delete g; } @@ -91,7 +91,7 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) { TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type)); // int Switch's output on GPU has HOST_MEMORY constraint. 
EXPECT_EQ(memory_type, HOST_MEMORY); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL delete g; } diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h index c5b76592e1b..75dce7c7feb 100644 --- a/tensorflow/core/common_runtime/placer.h +++ b/tensorflow/core/common_runtime/placer.h @@ -88,9 +88,9 @@ class Placer { void AssignAndLog(int assigned_device, Node* node) const; void LogDeviceAssignment(const Node* node) const; - Graph* const graph_; // Not owned. - const DeviceSet* const devices_; // Not owned. - const SessionOptions* options_; // Not owned. + Graph* const graph_; // Not owned. + const DeviceSet* const devices_; // Not owned. + const SessionOptions* options_; // Not owned. const bool log_device_placement_; TF_DISALLOW_COPY_AND_ASSIGN(Placer); diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc index 5d87b1e279a..02c9cd5313e 100644 --- a/tensorflow/core/common_runtime/placer_test.cc +++ b/tensorflow/core/common_runtime/placer_test.cc @@ -619,9 +619,9 @@ TEST_F(PlacerTest, TestReferenceConnectionIgnoreInfeasible) { Node* input = ops::SourceOp( "TestDevice", b.opts().WithName("in").WithDevice("/job:a/task:0/device:fakegpu:0")); - Node* var = ops::SourceOp("TestVariable", - b.opts().WithName("var_0").WithDevice( - "/job:a/task:0/device:fakegpu:0")); + Node* var = + ops::SourceOp("TestVariable", b.opts().WithName("var_0").WithDevice( + "/job:a/task:0/device:fakegpu:0")); // This op is specified on CPU, but in practice will be ignored, // because the reference edges forces it on GPU. diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index 12947e284a3..dd4bf6a3457 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -158,7 +158,7 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext( } FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR( - const string& device_name) { + const string& device_name) const { Device* device = nullptr; if (device_name != kDefaultFLRDevice) { if (!device_mgr_->LookupDevice(device_name, &device).ok()) { diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h index a1adc4b6b35..9c9c92f1ea0 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.h +++ b/tensorflow/core/common_runtime/process_function_library_runtime.h @@ -85,7 +85,7 @@ class ProcessFunctionLibraryRuntime { static const char kDefaultFLRDevice[]; // Returns the FunctionLibraryRuntime for the corresponding device_name. - FunctionLibraryRuntime* GetFLR(const string& device_name); + FunctionLibraryRuntime* GetFLR(const string& device_name) const; // Returns the device incarnation for the given device_name. 
Status GetDeviceIncarnation(const string& device_name, int64* incarnation); diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc index 0234d4c3725..4dbe113e44e 100644 --- a/tensorflow/core/common_runtime/session_factory.cc +++ b/tensorflow/core/common_runtime/session_factory.cc @@ -60,8 +60,8 @@ const string RegisteredFactoriesErrorMessageLocked() { str_util::Join(factory_types, ", "), "}."); } string SessionOptionsToString(const SessionOptions& options) { - return strings::StrCat("target: \"", options.target, "\" config: ", - ProtoShortDebugString(options.config)); + return strings::StrCat("target: \"", options.target, + "\" config: ", ProtoShortDebugString(options.config)); } } // namespace diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc index d7e01144c9e..cb900db10af 100644 --- a/tensorflow/core/common_runtime/step_stats_collector.cc +++ b/tensorflow/core/common_runtime/step_stats_collector.cc @@ -226,22 +226,23 @@ void StepStatsCollector::BuildCostModel( if (node) { for (int i = 0; i < stats.output_size(); ++i) { const auto& output = stats.output(i); - cm->RecordMaxMemorySize(node, i, Bytes(output.tensor_description() - .allocation_description() - .allocated_bytes()), + cm->RecordMaxMemorySize(node, i, + Bytes(output.tensor_description() + .allocation_description() + .allocated_bytes()), stats.output(i).tensor_description().shape(), node->output_types()[i]); - cm->RecordAllocationId(node, i, output.tensor_description() - .allocation_description() - .allocation_id()); + cm->RecordAllocationId(node, i, + output.tensor_description() + .allocation_description() + .allocation_id()); } cm->RecordMemoryStats(node, stats.memory_stats()); // Use hardware stats to record the execution time if they're available, // otherwise use the regular (less accurate) stats string node_name = dev_stats.regular_stats->node_stats(i).node_name(); - if (dev_stats.hardware_stats && - name_to_hw_node_stats.find(node_name) != - name_to_hw_node_stats.end()) { + if (dev_stats.hardware_stats && name_to_hw_node_stats.find(node_name) != + name_to_hw_node_stats.end()) { const NodeExecStats& hw_stats = name_to_hw_node_stats[node_name]; cm->RecordMaxExecutionTime( node, Microseconds(hw_stats.op_end_rel_micros())); diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc index 9094824ee73..02bd8b8f3bc 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc @@ -80,7 +80,7 @@ void SYCLAllocator::ClearStats() override { size_t SYCLAllocator::RequestedSize(void* ptr) { mutex_lock lock(mu_); - if(!sycl_device_) { + if (!sycl_device_) { return 0; } const auto& buffer = sycl_device_->get_sycl_buffer(ptr); diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h index cca9f92c62e..550f1933322 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h +++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h @@ -20,10 +20,10 @@ limitations under the License. 
#ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_ #define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_ +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -56,14 +56,13 @@ class SYCLAllocator : public Allocator { // Clear the SYCL device used by the Allocator void ClearSYCLDevice() { mutex_lock lock(mu_); - if(sycl_device_) { + if (sycl_device_) { delete sycl_device_; sycl_device_ = nullptr; } } private: - mutable mutex mu_; Eigen::SyclDevice* sycl_device_ GUARDED_BY(mu_); // owned AllocatorStats stats_ GUARDED_BY(mu_); diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h index cc272d156ef..7c09e0b8f19 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device.h +++ b/tensorflow/core/common_runtime/sycl/sycl_device.h @@ -187,9 +187,9 @@ class GSYCLInterface { type = "Unknown"; } - return strings::StrCat("id: ", device_id, ", type: ", type, ", name: ", - name.c_str(), ", vendor: ", vendor.c_str(), - ", profile: ", profile.c_str()); + return strings::StrCat( + "id: ", device_id, ", type: ", type, ", name: ", name.c_str(), + ", vendor: ", vendor.c_str(), ", profile: ", profile.c_str()); } }; diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc index 19c14770dca..14f7727659d 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc +++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc @@ -26,7 +26,6 @@ class SYCLDeviceFactory : public DeviceFactory { public: Status CreateDevices(const SessionOptions &options, const string &name_prefix, std::vector *devices) override { - auto syclInterface = GSYCLInterface::instance(); size_t n = 1; @@ -37,13 +36,11 @@ class SYCLDeviceFactory : public DeviceFactory { for (int i = 0; i < n; i++) { string name = strings::StrCat(name_prefix, "/device:SYCL:", i); - devices->push_back( - new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality() - , syclInterface->GetShortDeviceDescription(i) - , syclInterface->GetSYCLAllocator(i) - , syclInterface->GetCPUAllocator(i) - , syclInterface->GetSYCLContext(i)) - ); + devices->push_back(new SYCLDevice( + options, name, Bytes(256 << 20), DeviceLocality(), + syclInterface->GetShortDeviceDescription(i), + syclInterface->GetSYCLAllocator(i), syclInterface->GetCPUAllocator(i), + syclInterface->GetSYCLContext(i))); } return Status::OK(); @@ -51,6 +48,6 @@ class SYCLDeviceFactory : public DeviceFactory { }; REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200); -} +} // namespace tensorflow #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/common_runtime/sycl/sycl_util.h b/tensorflow/core/common_runtime/sycl/sycl_util.h index 83016b706a5..3124ed23c92 100644 --- a/tensorflow/core/common_runtime/sycl/sycl_util.h +++ b/tensorflow/core/common_runtime/sycl/sycl_util.h @@ -20,8 +20,8 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_ -#include "tensorflow/core/common_runtime/device.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/common_runtime/device.h" // For DMA helper #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc index 616ced3d0f3..2e1aabd1cc8 100644 --- a/tensorflow/core/debug/debug_gateway.cc +++ b/tensorflow/core/debug/debug_gateway.cc @@ -24,31 +24,31 @@ limitations under the License. namespace tensorflow { DebugGateway::DebugGateway(DirectSession* session) : session_(session) { - session_->node_outputs_callback_ = [this]( - const string& node_name, const int output_slot, const Tensor* tensor, - const bool is_ref, OpKernelContext* ctx) { - if (comp_cb_ != nullptr && output_slot <= 0) { - // The node completion callback is invoked once for a node regardless - // of whether the node has zero, one or more outputs. - // The output_slot can be negative (-1, or kControlSlot) if - // node_outputs_callback_ is invoked for a node with no output. If that - // is the case, notify the callback that the node in question has no - // output. - comp_cb_(node_name, output_slot == 0); - } + session_->node_outputs_callback_ = + [this](const string& node_name, const int output_slot, + const Tensor* tensor, const bool is_ref, OpKernelContext* ctx) { + if (comp_cb_ != nullptr && output_slot <= 0) { + // The node completion callback is invoked once for a node regardless + // of whether the node has zero, one or more outputs. + // The output_slot can be negative (-1, or kControlSlot) if + // node_outputs_callback_ is invoked for a node with no output. If + // that is the case, notify the callback that the node in question has + // no output. + comp_cb_(node_name, output_slot == 0); + } - // Copy tensor values (e.g., from GPU to host) only if the - // value callback is not nullptr. - if (val_cb_ != nullptr && output_slot >= 0) { - CopyTensor( - node_name, output_slot, tensor, ctx, - [this, node_name, output_slot, is_ref](const Tensor* copied_tensor) { - val_cb_(node_name, output_slot, *copied_tensor, is_ref); - }); - } + // Copy tensor values (e.g., from GPU to host) only if the + // value callback is not nullptr. + if (val_cb_ != nullptr && output_slot >= 0) { + CopyTensor(node_name, output_slot, tensor, ctx, + [this, node_name, output_slot, + is_ref](const Tensor* copied_tensor) { + val_cb_(node_name, output_slot, *copied_tensor, is_ref); + }); + } - return Status::OK(); - }; + return Status::OK(); + }; } DebugGateway::~DebugGateway() { @@ -86,7 +86,8 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot, // Determine if the tensor is on device (GPU) or host (CPU). // The second part of the check is necessary because even an OpKernel on // may have output tensors allocated on CPU. - if ((device->name().find("GPU:") != string::npos || device->name().find("SYCL:") != string::npos) && + if ((device->name().find("GPU:") != string::npos || + device->name().find("SYCL:") != string::npos) && !ctx->output_alloc_attr(output_slot).on_host()) { // GPU tensors: Copy it to host (CPU). 
DeviceContext* device_ctxt = ctx->op_device_context(); diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc index 57583349069..b1bbd3f6980 100644 --- a/tensorflow/core/debug/debug_gateway_test.cc +++ b/tensorflow/core/debug/debug_gateway_test.cc @@ -390,9 +390,9 @@ TEST_F(SessionDebugMinusAXTest, debug_gateway.SetNodeValueCallback( [this, &mu, &val_callback_count, &a_debug_identity_node_name, &x_debug_identity_node_name, &y_debug_identity_node_name, - &debug_identity_tensor_vals, &callbacks_done, &kConcurrentRuns]( - const string& node_name, const int output_slot, - const Tensor& tensor_value, const bool is_ref) { + &debug_identity_tensor_vals, &callbacks_done, + &kConcurrentRuns](const string& node_name, const int output_slot, + const Tensor& tensor_value, const bool is_ref) { mutex_lock l(mu); if (node_name == a_debug_identity_node_name && output_slot == 0) { @@ -560,21 +560,21 @@ TEST_F(SessionDebugOutputSlotWithoutOutgoingEdgeTest, Notification callbacks_done; std::vector debug_identity_tensor_vals; - debug_gateway.SetNodeValueCallback([this, &mu, &callbacks_done, - &debug_identity_node_name, - &debug_identity_tensor_vals]( - const string& node_name, const int output_slot, - const Tensor& tensor_value, const bool is_ref) { - mutex_lock l(mu); + debug_gateway.SetNodeValueCallback( + [this, &mu, &callbacks_done, &debug_identity_node_name, + &debug_identity_tensor_vals]( + const string& node_name, const int output_slot, + const Tensor& tensor_value, const bool is_ref) { + mutex_lock l(mu); - if (node_name == debug_identity_node_name && output_slot == 0) { - debug_identity_tensor_vals.push_back(tensor_value); + if (node_name == debug_identity_node_name && output_slot == 0) { + debug_identity_tensor_vals.push_back(tensor_value); - if (!callbacks_done.HasBeenNotified()) { - callbacks_done.Notify(); - } - } - }); + if (!callbacks_done.HasBeenNotified()) { + callbacks_done.Notify(); + } + } + }); // Add DebugIdentity watch on c:0, which does not have an outgoing edge. 
RunOptions run_opts; diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc index a312f789d84..f70931e9265 100644 --- a/tensorflow/core/debug/debug_grpc_testlib.cc +++ b/tensorflow/core/debug/debug_grpc_testlib.cc @@ -30,7 +30,7 @@ namespace test { ::grpc::Status TestEventListenerImpl::SendEvents( ::grpc::ServerContext* context, - ::grpc::ServerReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>* + ::grpc::ServerReaderWriter<::tensorflow::EventReply, ::tensorflow::Event>* stream) { Event event; diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc index f81445c20bd..baa8c08fdf1 100644 --- a/tensorflow/core/debug/debug_io_utils.cc +++ b/tensorflow/core/debug/debug_io_utils.cc @@ -574,8 +574,6 @@ Status DebugIO::CloseDebugURL(const string& debug_url) { } } -static Status CloseDebugURL(const string& debug_url) { return Status::OK(); } - Status DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key, const Tensor& tensor, const uint64 wall_time_us, diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc index 2f83c2415b8..0807a85b8b3 100644 --- a/tensorflow/core/debug/debug_io_utils_test.cc +++ b/tensorflow/core/debug/debug_io_utils_test.cc @@ -57,7 +57,8 @@ class DebugIOUtilsTest : public ::testing::Test { TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) { DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2", "hidden_1/MatMul", 0, "DebugIdentity"); - EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", debug_node_key.device_name); + EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", + debug_node_key.device_name); EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name); EXPECT_EQ(0, debug_node_key.output_slot); EXPECT_EQ("DebugIdentity", debug_node_key.debug_op); diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h index d0ca2a62577..cc35264b8fe 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.h +++ b/tensorflow/core/distributed_runtime/graph_mgr.h @@ -140,7 +140,7 @@ class GraphMgr { GraphMgr* graph_mgr; }; - const WorkerEnv* worker_env_; // Not owned. + const WorkerEnv* worker_env_; // Not owned. DeviceMgr* device_mgr_; CostModelManager cost_model_manager_; diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc index d1dc622ce79..1a488303ac7 100644 --- a/tensorflow/core/distributed_runtime/master.cc +++ b/tensorflow/core/distributed_runtime/master.cc @@ -528,8 +528,8 @@ void Master::ListDevices(const ListDevicesRequest* req, auto session = FindMasterSession(req->session_handle()); if (session == nullptr) { done(errors::InvalidArgument( - "Session ", req->session_handle(), - " is not found. Possibly, this master has restarted.")); + "Session ", req->session_handle(), + " is not found. Possibly, this master has restarted.")); return; } core::ScopedUnref ref(session); diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc index 121c58762f1..f2c1f3489c3 100644 --- a/tensorflow/core/distributed_runtime/master_test.cc +++ b/tensorflow/core/distributed_runtime/master_test.cc @@ -61,7 +61,7 @@ class MasterTest : public ::testing::Test { // rpc calls. 
Status CreateSession(const GraphDef& def, string* handle, - int64* initial_version) { + int64* initial_version) { ::grpc::ClientContext ctx; CreateSessionRequest req; *(req.mutable_graph_def()) = def; @@ -77,7 +77,7 @@ class MasterTest : public ::testing::Test { } Status ExtendSession(const string& handle, const GraphDef& def, - int64 current_version, int64* new_version) { + int64 current_version, int64* new_version) { ::grpc::ClientContext ctx; ExtendSessionRequest req; req.set_session_handle(handle); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc index ac279937730..b4d18d8607e 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc @@ -185,23 +185,22 @@ class GrpcMasterService : public AsyncServiceInterface { MutableRunStepResponseWrapper* wrapped_response = new NonOwnedProtoRunStepResponse(&call->response); call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); }); - master_impl_->RunStep(call_opts, wrapped_request, wrapped_response, - [call, call_opts, wrapped_request, wrapped_response, - trace](const Status& status) { - call->ClearCancelCallback(); - delete call_opts; - delete wrapped_request; - delete trace; - if (call->request.store_errors_in_response_body() && - !status.ok()) { - call->response.set_status_code(status.code()); - call->response.set_status_error_message( - status.error_message()); - call->SendResponse(ToGrpcStatus(Status::OK())); - } else { - call->SendResponse(ToGrpcStatus(status)); - } - }); + master_impl_->RunStep( + call_opts, wrapped_request, wrapped_response, + [call, call_opts, wrapped_request, wrapped_response, + trace](const Status& status) { + call->ClearCancelCallback(); + delete call_opts; + delete wrapped_request; + delete trace; + if (call->request.store_errors_in_response_body() && !status.ok()) { + call->response.set_status_code(status.code()); + call->response.set_status_error_message(status.error_message()); + call->SendResponse(ToGrpcStatus(Status::OK())); + } else { + call->SendResponse(ToGrpcStatus(status)); + } + }); ENQUEUE_REQUEST(RunStep, true); } diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h index 4e203e260a1..6ae94b74417 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h @@ -89,9 +89,9 @@ class MasterService final { ::grpc::Status ExtendSession(::grpc::ClientContext* context, const ExtendSessionRequest& request, ExtendSessionResponse* response) override; - ::grpc::Status PartialRunSetup( - ::grpc::ClientContext* context, const PartialRunSetupRequest& request, - PartialRunSetupResponse* response) override; + ::grpc::Status PartialRunSetup(::grpc::ClientContext* context, + const PartialRunSetupRequest& request, + PartialRunSetupResponse* response) override; ::grpc::Status RunStep(::grpc::ClientContext* context, const RunStepRequest& request, RunStepResponse* response) override; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc index 70418f63686..1088e9be66c 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc @@ -69,8 +69,7 @@ class GrpcRemoteMaster : public MasterInterface { 
::grpc::ClientContext ctx; auto trace = TraceRpc("RunStep/Client", &ctx); return Call(&ctx, call_options, &request->ToProto(), - get_proto_from_wrapper(response), - &MasterServiceStub::RunStep); + get_proto_from_wrapper(response), &MasterServiceStub::RunStep); } Status CloseSession(CallOptions* call_options, @@ -114,8 +113,9 @@ class GrpcRemoteMaster : public MasterInterface { template Status Call(::grpc::ClientContext* ctx, CallOptions* call_options, const Request* request, Response* response, - ::grpc::Status (MasterServiceStub::*pfunc)( - ::grpc::ClientContext*, const Request&, Response*)) { + ::grpc::Status (MasterServiceStub::*pfunc)(::grpc::ClientContext*, + const Request&, + Response*)) { ctx->set_fail_fast(false); SetDeadline(ctx, call_options->GetTimeout()); return FromGrpcStatus((stub_.get()->*pfunc)(ctx, *request, response)); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h index dd114d39c62..730124c25e9 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h @@ -66,7 +66,7 @@ class GrpcBufferWriter final } // It's dangerous to keep an inlined grpc_slice as the backup slice, since // on a following Next() call, a reference will be returned to this slice - // via GRPC_SLICE_START_PTR, which will not be an adddress held by + // via GRPC_SLICE_START_PTR, which will not be an address held by // slice_buffer_. have_backup_ = backup_slice_.refcount != NULL; byte_count_ -= count; diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc index 373eecffcab..5597ee7a76a 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc @@ -21,11 +21,8 @@ namespace tensorflow { namespace test { // ErrorOp::Compute returns an error. -REGISTER_OP("Error") - .Input("in: T") - .Output("out: T") - .Attr("T: type") - .Attr("message: string"); +REGISTER_OP("Error").Input("in: T").Output("out: T").Attr("T: type").Attr( + "message: string"); class ErrorOp : public OpKernel { public: explicit ErrorOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -66,11 +63,8 @@ REGISTER_KERNEL_BUILDER(Name("InvalidRefType").Device(DEVICE_CPU), // DelayOp::AsyncCompute sleeps for "micros"-econd and then returns // its input. -REGISTER_OP("Delay") - .Input("in: T") - .Output("out: T") - .Attr("T: type") - .Attr("micros: int"); +REGISTER_OP("Delay").Input("in: T").Output("out: T").Attr("T: type").Attr( + "micros: int"); class DelayOp : public AsyncOpKernel { public: explicit DelayOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) { diff --git a/tensorflow/core/distributed_runtime/rpcbench_test.cc b/tensorflow/core/distributed_runtime/rpcbench_test.cc index b2668fae25a..d3af7417e61 100644 --- a/tensorflow/core/distributed_runtime/rpcbench_test.cc +++ b/tensorflow/core/distributed_runtime/rpcbench_test.cc @@ -184,8 +184,8 @@ static void BM_Helper(int iters, int width, int num_stages, int tensor_size, testing::SetLabel( strings::StrCat(def.node_size(), " nodes; ", - use_multiple_devices ? "Multi device" : "Single device", - "; tensor bytes/send: ", tensor_size * sizeof(float))); + use_multiple_devices ? 
"Multi device" : "Single device", + "; tensor bytes/send: ", tensor_size * sizeof(float))); std::vector outputs; diff --git a/tensorflow/core/distributed_runtime/scheduler.cc b/tensorflow/core/distributed_runtime/scheduler.cc index 4766f4c33b6..9dae5b3b926 100644 --- a/tensorflow/core/distributed_runtime/scheduler.cc +++ b/tensorflow/core/distributed_runtime/scheduler.cc @@ -17,9 +17,9 @@ limitations under the License. #include -#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/util/util.h" namespace tensorflow { diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h index eabcaccdd1e..ef87b9834db 100644 --- a/tensorflow/core/distributed_runtime/scheduler.h +++ b/tensorflow/core/distributed_runtime/scheduler.h @@ -16,15 +16,15 @@ limitations under the License. #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_ #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_ -#include #include +#include #include #include #include -#include "tensorflow/core/graph/costmodel.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/graph/costmodel.h" namespace tensorflow { diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc index fe2d1a12934..34a4013547b 100644 --- a/tensorflow/core/distributed_runtime/tensor_coding.cc +++ b/tensorflow/core/distributed_runtime/tensor_coding.cc @@ -81,7 +81,7 @@ void TensorResponse::InitPartial(const RecvTensorResponse& response) { Status TensorResponse::ParseFrom(Source* source) { if (!on_host_) { protobuf::io::CodedInputStream input(source->contents()); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); // Unlimited + input.SetTotalBytesLimit(INT_MAX); // Unlimited // Pre-parse into local storage, then delegate to device. 
if (!meta_.ParseFromCodedStream(&input) || !input.ConsumedEntireMessage()) { @@ -217,7 +217,7 @@ bool TensorResponse::ParseTensorSubmessage( bool TensorResponse::ParseFast(Source* source) { protobuf::io::CodedInputStream input(source->contents()); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); // Unlimited + input.SetTotalBytesLimit(INT_MAX); // Unlimited while (true) { auto p = input.ReadTagWithCutoff(127); int tag = GetTagFieldNumber(p.first); diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc index 702af78c880..95ca3c3b4d1 100644 --- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc +++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc @@ -97,9 +97,8 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs, const string& tensor_name, const string& src_device, const string& dst_device, - int64 bytes, - const string& details, - const string& transfer_method_name){ + int64 bytes, const string& details, + const string& transfer_method_name) { NodeExecStats* ns = new NodeExecStats; ns->set_node_name(transfer_method_name); if (details.empty()) { diff --git a/tensorflow/core/framework/bfloat16.cc b/tensorflow/core/framework/bfloat16.cc index 0efe43fde2d..6025be51704 100644 --- a/tensorflow/core/framework/bfloat16.cc +++ b/tensorflow/core/framework/bfloat16.cc @@ -21,13 +21,13 @@ void FloatToBFloat16(const float* src, bfloat16* dst, int64 size) { const uint16_t* p = reinterpret_cast(src); uint16_t* q = reinterpret_cast(dst); #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - for (; size != 0; p += 2, q++, size--) { - *q = p[0]; - } + for (; size != 0; p += 2, q++, size--) { + *q = p[0]; + } #else - for (; size != 0; p += 2, q++, size--) { - *q = p[1]; - } + for (; size != 0; p += 2, q++, size--) { + *q = p[1]; + } #endif } @@ -35,15 +35,15 @@ void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size) { const uint16_t* p = reinterpret_cast(src); uint16_t* q = reinterpret_cast(dst); #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - for (; size != 0; p++, q += 2, size--) { - q[0] = *p; - q[1] = 0; - } + for (; size != 0; p++, q += 2, size--) { + q[0] = *p; + q[1] = 0; + } #else - for (; size != 0; p++, q += 2, size--) { - q[0] = 0; - q[1] = *p; - } + for (; size != 0; p++, q += 2, size--) { + q[0] = 0; + q[1] = *p; + } #endif } diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc index 7ab8e3ec188..8bb87483e1c 100644 --- a/tensorflow/core/framework/common_shape_fns.cc +++ b/tensorflow/core/framework/common_shape_fns.cc @@ -1356,10 +1356,11 @@ Status ScatterNdUpdateShape(InferenceContext* c) { Status s = c->Merge(prefix_indices, prefix_updates, &unused); if (!s.ok()) { return errors::InvalidArgument( - "The outer ", num_outer_dims, " dimensions of indices.shape=", - c->DebugString(indices_shape), " must match the outer ", - num_outer_dims, " dimensions of updates.shape=", - c->DebugString(updates_shape), ": ", s.error_message()); + "The outer ", num_outer_dims, + " dimensions of indices.shape=", c->DebugString(indices_shape), + " must match the outer ", num_outer_dims, + " dimensions of updates.shape=", c->DebugString(updates_shape), + ": ", s.error_message()); } ShapeHandle input_suffix; diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index 2c2c7e7c585..96566c285a2 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -1,4 +1,4 @@ -/* Copyright 
2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,64 +12,603 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ -#ifndef TENSORFLOW_FRAMEWORK_DATASET_H_ -#define TENSORFLOW_FRAMEWORK_DATASET_H_ +#include + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/framework/variant_tensor_data.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/tracing.h" + +// Polymorphic datasets should support all primitive TensorFlow +// types. Use this macro to expand `m(T)` once for each primitive type +// `T`, e.g. to build a `switch` statement. +#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m) namespace tensorflow { -namespace dataset { -// Registry for stateful ops that need to be used in dataset functions. -// See below macro for usage details. -class WhitelistedStatefulOpRegistry { + +// Interface for reading values from a key-value store. +// Used for restoring iterator state. +class IteratorStateReader { public: - Status Add(StringPiece op_name) { - op_names_.insert(op_name); + virtual Status ReadScalar(StringPiece key, int64* val) = 0; + virtual Status ReadScalar(StringPiece key, string* val) = 0; + virtual Status ReadTensor(StringPiece key, Tensor* val) = 0; + virtual bool Contains(StringPiece key) = 0; + + virtual ~IteratorStateReader() {} +}; + +// Interface for writing values to a key-value store. +// Used for saving iterator state. +class IteratorStateWriter { + public: + virtual Status WriteScalar(StringPiece key, const int64 val) = 0; + virtual Status WriteScalar(StringPiece key, const string& val) = 0; + virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0; + + virtual ~IteratorStateWriter() {} +}; + +// Forward declarations to avoid introducing a dependency on headers in +// "tensorflow/core/graph/...". +class GraphDefBuilder; +class GraphDatasetBase; +class Node; + +// Wrapper around GraphDefBuilder. Used to serialize Dataset graph. +class GraphDefBuilderWrapper { + public: + explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {} + + // Adds a Const node with scalar value to the Graph. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. 
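These reader/writer interfaces are the key-value surface an iterator uses to checkpoint and restore itself. A minimal sketch of a SaveInternal/RestoreInternal pair built on them, assuming an illustrative int64 member `next_index_` (real iterators usually namespace their keys via DatasetIterator::full_name, which appears further down in this header):

// Illustrative only: `next_index_` and the bare key name are made up for this sketch.
Status SaveInternal(IteratorStateWriter* writer) override {
  return writer->WriteScalar("next_index", next_index_);
}

Status RestoreInternal(IteratorContext* ctx,
                       IteratorStateReader* reader) override {
  return reader->ReadScalar("next_index", &next_index_);
}

int64 next_index_ = 0;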
+ template + Status AddScalar(const T& val, Node** output) { + Tensor val_t = Tensor(DataTypeToEnum::v(), TensorShape({})); + val_t.scalar()() = val; + AddTensorInternal(val_t, output); + if (*output == nullptr) { + return errors::Internal("AddScalar: Failed to build Const op."); + } return Status::OK(); } - bool Contains(StringPiece op_name) { - return op_names_.find(op_name) != op_names_.end(); + // Adds a Const node with vector value to the Graph. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. + // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice? + template + Status AddVector(const std::vector& val, Node** output) { + Tensor val_t = Tensor(DataTypeToEnum::v(), + TensorShape({static_cast(val.size())})); + for (int i = 0; i < val.size(); i++) { + val_t.flat()(i) = val[i]; + } + AddTensorInternal(val_t, output); + if (*output == nullptr) { + return errors::Internal("AddVector: Failed to build Const op."); + } + return Status::OK(); } - static WhitelistedStatefulOpRegistry* Global() { - static WhitelistedStatefulOpRegistry* reg = - new WhitelistedStatefulOpRegistry; - return reg; + // Adds a Const node with Tensor value to the Graph. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. + Status AddTensor(const Tensor& val, Node** output) { + AddTensorInternal(val, output); + if (*output == nullptr) { + return errors::Internal("AddTensor: Failed to build Const op."); + } + return Status::OK(); + } + + Status AddDataset(const GraphDatasetBase* dataset, + const std::vector& inputs, Node** output) { + return AddDataset(dataset, inputs, {}, output); + } + + // Adds a node corresponding to the `DatasetType` to the Graph. + // Return value of `DatasetType::op_name()` is used as the op type for the + // node. + // Values for the output_types and output_shapes node attributes are also + // written if those attributes are defined in the OpDef. + // `*output` contains a pointer to the output `Node`. It is guaranteed to be + // non-null if the method returns with an OK status. + // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. + Status AddDataset(const GraphDatasetBase* dataset, + const std::vector& inputs, + const std::vector>& attrs, + Node** output) { + std::vector> enumerated_inputs(inputs.size()); + for (int i = 0; i < inputs.size(); i++) { + enumerated_inputs[i] = std::make_pair(i, inputs[i]); + } + return AddDataset(dataset, enumerated_inputs, {}, attrs, output); + } + + Status AddDataset( + const GraphDatasetBase* dataset, + const std::vector>& inputs, + const std::vector>>& list_inputs, + const std::vector>& attrs, + Node** output); + + // Adds a user-defined function with name `function_name` to the graph and + // recursively adds all functions it references. If a function with a matching + // name has already been added, returns with OK status. If a user-defined with + // name `function_name` is not found in the FunctionLibraryDefinition, returns + // an InvalidArgumentError. If the function with name `function_name` or any + // of its dependent functions are stateful, returns an InvalidArgument error. 
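Taken together, these Add* helpers are what a dataset's AsGraphDefInternal implementation uses to re-emit itself as GraphDef nodes. A rough sketch for a hypothetical dataset that wraps one input dataset plus a scalar `count_` member (names are illustrative; DatasetGraphDefBuilder and AddParentDataset are defined further down in this header):

Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
                          Node** output) const override {
  // Serialize the wrapped input dataset first, then the scalar parameter.
  Node* input_node = nullptr;
  TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
  Node* count_node = nullptr;
  TF_RETURN_IF_ERROR(b->AddScalar(count_, &count_node));
  // Emits a node whose op type is this dataset's op_name(), wired to the
  // serialized input dataset and the Const node holding count_.
  return b->AddDataset(this, {input_node, count_node}, output);
}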
+ Status AddFunction(OpKernelContext* ctx, const string& function_name); + + template + void BuildAttrValue(const T& value, AttrValue* attr) { + SetAttrValue(value, attr); } private: - WhitelistedStatefulOpRegistry() {} - WhitelistedStatefulOpRegistry(WhitelistedStatefulOpRegistry const& copy); - WhitelistedStatefulOpRegistry operator=( - WhitelistedStatefulOpRegistry const& copy); - std::set op_names_; + void AddTensorInternal(const Tensor& val, Node** output); + + Status EnsureFunctionIsStateless(OpKernelContext* ctx, + const string& function_name) const { + const FunctionLibraryDefinition* lib_def = + ctx->function_library()->GetFunctionLibraryDefinition(); + const FunctionDef* function_def = lib_def->Find(function_name); + if (!function_def) { + return errors::InvalidArgument("Unable to find FunctionDef for ", + function_name, " in registry."); + } + for (const NodeDef& node_def : function_def->node_def()) { + const OpDef* op_def; + TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def)); + // TODO(b/65524810): Hack to allow functions to capture Dataset op + // nodes needed for FlatMap. Currently, source datasets nodes have been + // marked stateful to avoid constant folding since we do not have a + // good way of serializing them. + if (IsOpWhitelisted(op_def)) { + continue; + } + if (op_def->is_stateful()) { + return errors::InvalidArgument( + "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ", + "in function ", function_name, " is stateful. ", + "Saving stateful functions is not supported yet."); + } + } + return Status::OK(); + } + + // Returns whether an op has been whitelisted for use inside map_fns. + // Uses a heuristic to whitelist source dataset ops which have been + // marked stateful due to b/65524810. + // Also looks up the `op_def->name` in the global + // `WhitelistedStatefulOpRegistry`. + bool IsOpWhitelisted(const OpDef* op_def) const { + return (StringPiece(op_def->name()).ends_with("Dataset") && + op_def->output_arg_size() == 1 && + op_def->output_arg(0).type() == DT_VARIANT) || + dataset::WhitelistedStatefulOpRegistry::Global()->Contains( + op_def->name()); + } + + bool HasAttr(const string& op_type_name, const string& attr_name) const; + + bool HasAttr(const OpDef* op_def, const string& attr_name) const { + for (auto attr : op_def->attr()) { + if (attr.name() == attr_name) { + return true; + } + } + return false; + } + + Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) { + if (attr_value.has_func()) { + TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name())); + } else if (attr_value.has_list()) { + for (const NameAttrList& name_attr_list : attr_value.list().func()) { + TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name())); + } + } + return Status::OK(); + } + + GraphDefBuilder* b_; }; -} // namespace dataset +class StatsAggregator; -// Use this macro to whitelist an op that is marked stateful but needs to be -// used inside a map_fn in an input pipeline. This is only needed if you wish -// to be able to checkpoint the state of the input pipeline. We currently -// do not allow stateful ops to be defined inside of map_fns since it is not -// possible to save their state. -// Note that the state of the whitelisted ops inside functions will not be -// saved during checkpointing, hence this should only be used if the op is -// marked stateful for reasons like to avoid constant folding during graph -// optimiztion but is not stateful. -// If possible, try to remove the stateful flag on the op first. 
-// Example usage: +// A cut-down version of OpKernelContext for running computations in +// iterators. Note that we cannot simply use OpKernelContext here +// because we might run computation in an iterator whose lifetime is +// not nested within the lifetime of a single OpKernelContext +// (e.g. asynchronous prefetching). // -// WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LegacyStatefulReader"); +// TODO(mrry): We will probably need to support more of +// OpKernelContext here. For example, should allocation be handled by +// the IteratorContext? +// TODO(mrry): We're making some daring assumptions about the lifetime +// of the runner passed in here. A runner will be deleted when the original +// step ends, but all existing runners only close over session-lifetime (or +// longer-lived) state, so we can make a copy of the function. There's nothing +// in the definition of the API from which we took the runner to guarantee that +// what we are doing is safe. We should formalize the properties here. +class IteratorContext { + public: + struct Params { + // Interface to operating system functionality. + Env* env; + + // Function call support. + std::function)> runner = nullptr; + + // A function that returns the current `StatsAggregator` instance to be + // used when recording statistics about the iterator. + // + // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator` + // is a property of the `IteratorResource` (which this class does not know + // about), and (ii) it can change after the `IteratorContext` has been + // created. Better suggestions are welcome! + std::function()> stats_aggregator_getter = + nullptr; + + // The FunctionLibraryRuntime object to be used to make function calls. + FunctionLibraryRuntime* lib = nullptr; + std::shared_ptr function_library = nullptr; + + // The Allocator to be used to allocate the output of an iterator. + Allocator* allocator = nullptr; + }; + + explicit IteratorContext(Params params) : params_(std::move(params)) {} + + Env* env() const { return params_.env; } + + std::function)>* runner() { + return ¶ms_.runner; + } + + std::shared_ptr stats_aggregator() { + if (params_.stats_aggregator_getter) { + return params_.stats_aggregator_getter(); + } else { + return nullptr; + } + } + + std::shared_ptr function_library() { + return params_.function_library; + } + + FunctionLibraryRuntime* lib() { return params_.lib; } + + void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; } + + Allocator* allocator(AllocatorAttributes attrs); + + private: + Params params_; +}; + +// Represents the current position in a range of outputs, where the +// range of outputs is typically represented by an `DatasetBase`, +// defined below. +class IteratorBase { + public: + virtual ~IteratorBase() {} + + // Gets the next output from the range that this iterator is traversing. + // + // If at least one output remains in this iterator's range, that + // output will be stored in `*out_tensors` and `false` will be + // stored in `*end_of_sequence`. + // + // If no more outputs remain in this iterator's range, `true` will + // be stored in `*end_of_sequence`, and the content of + // `*out_tensors` will be undefined. + // + // This method is thread-safe. + // + // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and + // potentially remove this method. 
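A minimal sketch of how a caller might assemble an IteratorContext from these Params before pulling an element, assuming `ctx` is the kernel's OpKernelContext and `iterator` an IteratorBase (the helper function name is illustrative):

Status GetOneElement(OpKernelContext* ctx, IteratorBase* iterator,
                     std::vector<Tensor>* out_tensors, bool* end_of_sequence) {
  IteratorContext::Params params;
  params.env = ctx->env();
  params.runner = *(ctx->runner());          // Reuse the kernel's thread-pool runner.
  params.lib = ctx->function_library();      // Needed if the iterator calls functions.
  IteratorContext iter_ctx(std::move(params));
  return iterator->GetNext(&iter_ctx, out_tensors, end_of_sequence);
}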
+ virtual Status GetNext(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) = 0; + + // Returns a vector of DataType values, representing the respective + // element types of each tuple component in the outputs of this + // iterator. + virtual const DataTypeVector& output_dtypes() const = 0; + + // Returns a vector of tensor shapes, representing the respective + // (and possibly partially defined) shapes of each tuple component + // in the outputs of this iterator. + virtual const std::vector& output_shapes() const = 0; + + // Saves the state of this iterator. + virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) { + return SaveInternal(writer); + } + + // Restores the state of this iterator. + virtual Status Restore(IteratorContext* ctx, IteratorStateReader* reader) { + return RestoreInternal(ctx, reader); + } + + protected: + // This is needed so that sub-classes of IteratorBase can call + // `SaveInternal` on their parent iterators, e.g., in + // `RepeatDataasetOp::Dataset`. + Status SaveParent(IteratorStateWriter* writer, + const std::unique_ptr& parent) { + return parent->SaveInternal(writer); + } + + // This is needed so that sub-classes of IteratorBase can call + // `RestoreInternal` on their parent iterators, e.g., in + // `RepeatDataasetOp::Dataset`. + Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader, + const std::unique_ptr& parent) { + return parent->RestoreInternal(ctx, reader); + } + + // Saves the state of this iterator recursively. + virtual Status SaveInternal(IteratorStateWriter* writer) { + return errors::Unimplemented("SaveInternal"); + } + + // Restores the state of this iterator recursively. + virtual Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) { + return errors::Unimplemented("RestoreInternal"); + } +}; + +// Represents a (potentially infinite) range of outputs, where each +// output is a tuple of tensors. +class DatasetBase : public core::RefCounted { + public: + // Returns a new iterator for iterating over the range of elements in + // this dataset. + // + // This method may be called multiple times on the same instance, + // and the resulting iterators will have distinct state. Each + // iterator will traverse all elements in this dataset from the + // start. + // + // Ownership of the created iterator will be transferred to the caller. + // + // The prefix identifies the sequence of iterators leading up to the newly + // created iterator. + virtual std::unique_ptr MakeIterator( + const string& prefix) const = 0; + + // Returns a vector of DataType values, representing the respective + // element types of each tuple component in the outputs of this + // dataset. + virtual const DataTypeVector& output_dtypes() const = 0; + + // Returns a vector of tensor shapes, representing the respective + // (and possibly partially defined) shapes of each tuple component + // in the outputs of this dataset. + virtual const std::vector& output_shapes() const = 0; + + // A human-readable debug string for this dataset. + virtual string DebugString() = 0; + + // Serializes the dataset and writes it to the `writer`. + virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const { + return errors::Unimplemented("DatasetBase::Save"); + } + + protected: + // TODO(srbs): Ideally all graph related logic should reside in + // GraphDatasetBase. However, that would require Datasets defined in all ops + // to derive from GraphDatasetBase. 
Once that is done we can move + // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase. + class DatasetGraphDefBuilder : public GraphDefBuilderWrapper { + public: + DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {} + Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset, + Node** output) { + return dataset->AsGraphDefInternal(ctx, this, output); + } + }; + + virtual Status AsGraphDefInternal(OpKernelContext* ctx, + DatasetGraphDefBuilder* b, + Node** node) const { + return AsGraphDefInternal(b, node); + } + + virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b, + Node** node) const { + return errors::Unimplemented("AsGraphDefInternal"); + } +}; + +// Base-class for datasets that are built by ops. +class GraphDatasetBase : public DatasetBase { + public: + GraphDatasetBase(OpKernelContext* ctx) + : op_name_(ctx->op_kernel().type_string()) {} + + const string op_name() const { return op_name_; } + + Status Save(OpKernelContext* ctx, + IteratorStateWriter* writer) const override { + string serialized_graph_def; + string output_node; + TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node)); + TF_RETURN_IF_ERROR( + writer->WriteScalar(kDatasetGraphKey, serialized_graph_def)); + TF_RETURN_IF_ERROR( + writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node)); + return Status::OK(); + } + + // Key for storing the Dataset graph in the serialized format. + static const char kDatasetGraphKey[]; + + // Key for storing the output node of the Dataset graph in the serialized + // format. + static const char kDatasetGraphOutputNodeKey[]; + + private: + Status Serialize(OpKernelContext* ctx, string* serialized_graph_def, + string* output_node) const; + + const string op_name_; +}; + +// Represents an iterator that is associated with a particular parent dataset. +template +class DatasetIterator : public IteratorBase { + public: + struct Params { + // Owns one reference on the shared dataset resource. + const DatasetType* dataset; + + // Identifies the sequence of iterators leading up to this iterator. + const string prefix; + }; + + explicit DatasetIterator(const Params& params) : params_(params) { + params_.dataset->Ref(); + } + + ~DatasetIterator() override { params_.dataset->Unref(); } + + // The dataset from which this iterator was created. + const DatasetType* dataset() const { return params_.dataset; } + + // The sequence of iterators leading up to this iterator. + const string prefix() const { return params_.prefix; } + + const DataTypeVector& output_dtypes() const override { + return params_.dataset->output_dtypes(); + } + + const std::vector& output_shapes() const override { + return params_.dataset->output_shapes(); + } + + Status GetNext(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) final { + port::Tracing::TraceMe activity(params_.prefix); + Status s = GetNextInternal(ctx, out_tensors, end_of_sequence); + if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) { + s = errors::Internal( + "Iterator \"", params_.prefix, + "\" returned OutOfRange without setting `*end_of_sequence`. This " + "indicates that an error may have occurred. Original message: ", + s.error_message()); + LOG(ERROR) << s; + } + return s; + } + + Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final { + TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer)); + return IteratorBase::Save(ctx, writer); + } + + protected: + // Internal implementation of GetNext that is wrapped in tracing logic. 
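As a concrete illustration of the GetNextInternal contract (either produce one element or set *end_of_sequence), here is a rough sketch of an iterator that yields the integers 0..count_-1; `MyDataset`, `count_` and `i_` are illustrative names, not part of this header, and `count_` is assumed to be visible to its iterator:

class Iterator : public DatasetIterator<MyDataset> {
 public:
  explicit Iterator(const Params& params)
      : DatasetIterator<MyDataset>(params) {}

 protected:
  Status GetNextInternal(IteratorContext* ctx,
                         std::vector<Tensor>* out_tensors,
                         bool* end_of_sequence) override {
    mutex_lock l(mu_);
    if (i_ >= dataset()->count_) {
      *end_of_sequence = true;  // Range exhausted; out_tensors is left empty.
      return Status::OK();
    }
    // Allocate a scalar int64 tensor through the iterator context's allocator.
    Tensor value(ctx->allocator({}), DT_INT64, TensorShape({}));
    value.scalar<int64>()() = i_++;
    out_tensors->emplace_back(std::move(value));
    *end_of_sequence = false;
    return Status::OK();
  }

 private:
  mutex mu_;
  int64 i_ GUARDED_BY(mu_) = 0;
};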
+ virtual Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) = 0; + + string full_name(const string& name) const { + return strings::StrCat(prefix(), ":", name); + } + + private: + Params params_; +}; + +// Encapsulates the work required to plug a DatasetBase into the core TensorFlow +// graph execution engine. +class DatasetOpKernel : public OpKernel { + public: + DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {} + void Compute(OpKernelContext* ctx) final; + + protected: + // Subclasses should implement this method. It will be called during Compute + // execution. + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0; + + template + Status ParseScalarArgument(OpKernelContext* ctx, + const StringPiece& argument_name, T* output) { + const Tensor* argument_t; + TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t)); + if (!TensorShapeUtils::IsScalar(argument_t->shape())) { + return errors::InvalidArgument(argument_name, " must be a scalar"); + } + *output = argument_t->scalar()(); + return Status::OK(); + } +}; + +// Encapsulates the work required to plug unary Datasets into the core +// TensorFlow graph execution engine. +class UnaryDatasetOpKernel : public DatasetOpKernel { + public: + UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) = 0; +}; + +// Encapsulates the work required to plug binary Datasets into the core +// TensorFlow graph execution engine. +class BinaryDatasetOpKernel : public DatasetOpKernel { + public: + BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {} + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; + virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase* another_input, + DatasetBase** output) = 0; +}; + +// Validates and extracts a `DatasetBase` object from `tensor`. // -#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS(name) \ - WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(__COUNTER__, name) -#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(ctr, name) \ - WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) -#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) \ - static ::tensorflow::Status whitelist_op##ctr TF_ATTRIBUTE_UNUSED = \ - ::tensorflow::dataset::WhitelistedStatefulOpRegistry::Global()->Add( \ - name) +// `tensor` must have been written by a call to SetVariantTensorToDataset(). +// +// The retrieved pointer is a borrowed reference to the dataset, which is owned +// by the tensor. The consumer must either acquire its own reference to the +// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not +// destroyed or mutated while the retrieved pointer is in use. +Status GetDatasetFromVariantTensor(const Tensor& tensor, + DatasetBase** out_dataset); + +// Stores a `DatasetBase` object in `tensor`. +// +// The ownership of `dataset` is transferred to `tensor`. 
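The op-kernel shims above reduce a dataset op to a single MakeDataset override. A rough sketch of a unary dataset kernel that reads one scalar argument; `TakeNDatasetOp`, the "count" input, and the nested `Dataset` class are illustrative names, not part of this change:

class TakeNDatasetOp : public UnaryDatasetOpKernel {
 public:
  explicit TakeNDatasetOp(OpKernelConstruction* ctx)
      : UnaryDatasetOpKernel(ctx) {}

 protected:
  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                   DatasetBase** output) override {
    int64 count = 0;
    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
    // `Dataset` would be a GraphDatasetBase subclass wrapping `input` and `count`.
    *output = new Dataset(ctx, input, count);
  }
};

// Hypothetical registration for the sketch above.
REGISTER_KERNEL_BUILDER(Name("TakeNDataset").Device(DEVICE_CPU), TakeNDatasetOp);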
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor); } // namespace tensorflow -#endif // TENSORFLOW_FRAMEWORK_DATASET_H_ +#endif // TENSORFLOW_CORE_FRAMEWORK_DATASET_H_ diff --git a/tensorflow/core/framework/dataset_stateful_op_whitelist.h b/tensorflow/core/framework/dataset_stateful_op_whitelist.h new file mode 100644 index 00000000000..3b48999edb3 --- /dev/null +++ b/tensorflow/core/framework/dataset_stateful_op_whitelist.h @@ -0,0 +1,77 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_ +#define TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace dataset { +// Registry for stateful ops that need to be used in dataset functions. +// See below macro for usage details. +class WhitelistedStatefulOpRegistry { + public: + Status Add(StringPiece op_name) { + op_names_.insert(op_name); + return Status::OK(); + } + + bool Contains(StringPiece op_name) { + return op_names_.find(op_name) != op_names_.end(); + } + + static WhitelistedStatefulOpRegistry* Global() { + static WhitelistedStatefulOpRegistry* reg = + new WhitelistedStatefulOpRegistry; + return reg; + } + + private: + WhitelistedStatefulOpRegistry() {} + WhitelistedStatefulOpRegistry(WhitelistedStatefulOpRegistry const& copy); + WhitelistedStatefulOpRegistry operator=( + WhitelistedStatefulOpRegistry const& copy); + std::set op_names_; +}; + +} // namespace dataset + +// Use this macro to whitelist an op that is marked stateful but needs to be +// used inside a map_fn in an input pipeline. This is only needed if you wish +// to be able to checkpoint the state of the input pipeline. We currently +// do not allow stateful ops to be defined inside of map_fns since it is not +// possible to save their state. +// Note that the state of the whitelisted ops inside functions will not be +// saved during checkpointing, hence this should only be used if the op is +// marked stateful for reasons like to avoid constant folding during graph +// optimiztion but is not stateful. +// If possible, try to remove the stateful flag on the op first. 
+// Example usage: +// +// WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LegacyStatefulReader"); +// +#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS(name) \ + WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(__COUNTER__, name) +#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(ctr, name) \ + WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) +#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name) \ + static ::tensorflow::Status whitelist_op##ctr TF_ATTRIBUTE_UNUSED = \ + ::tensorflow::dataset::WhitelistedStatefulOpRegistry::Global()->Add( \ + name) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_ diff --git a/tensorflow/core/framework/fake_input.cc b/tensorflow/core/framework/fake_input.cc index ad301a8aa4b..70d1e20a17c 100644 --- a/tensorflow/core/framework/fake_input.cc +++ b/tensorflow/core/framework/fake_input.cc @@ -104,8 +104,8 @@ Status FakeInputImpl::AddInputToBuilder() { Status status = GetNodeAttr(*node_def_, arg_->type_list_attr(), &dts); if (!status.ok()) { return errors::InvalidArgument( - "Could not infer list of types for input '", arg_->name(), "': ", - status.error_message()); + "Could not infer list of types for input '", arg_->name(), + "': ", status.error_message()); } SourceList(dts); return Status::OK(); @@ -131,8 +131,8 @@ Status FakeInputImpl::GetN(int* n) const { Status status = GetNodeAttr(*node_def_, arg_->number_attr(), n); if (!status.ok()) { return errors::InvalidArgument("Could not infer length of input '", - arg_->name(), "': ", - status.error_message()); + arg_->name(), + "': ", status.error_message()); } } return Status::OK(); @@ -153,8 +153,8 @@ Status FakeInputImpl::GetDataType(DataType* dt) const { *dt = attr->default_value().type(); } else { return errors::InvalidArgument("Could not infer type for input '", - arg_->name(), "': ", - status.error_message()); + arg_->name(), + "': ", status.error_message()); } } } else { diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index 0224f252270..eae8e6c3c10 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -1064,26 +1064,36 @@ Status FunctionLibraryDefinition::AddLibrary( return Status::OK(); } -void FunctionLibraryDefinition::RemoveFunction(const string& func) { +Status FunctionLibraryDefinition::RemoveFunction(const string& func) { const auto& i = function_defs_.find(func); - DCHECK(i != function_defs_.end()); + if (i == function_defs_.end()) { + return errors::InvalidArgument("Tried to remove non-existent function ", + func); + } function_defs_.erase(i); + return Status::OK(); } -void FunctionLibraryDefinition::RemoveGradient(const string& func) { +Status FunctionLibraryDefinition::RemoveGradient(const string& func) { const auto& i = func_grad_.find(func); - DCHECK(i != func_grad_.end()); + if (i == func_grad_.end()) { + return errors::InvalidArgument("Tried to remove non-existent gradient ", + func); + } func_grad_.erase(i); + return Status::OK(); } void FunctionLibraryDefinition::Remove( const std::vector& funcs, const std::vector& funcs_with_grads) { for (const string& f : funcs) { - RemoveFunction(f); + Status s = RemoveFunction(f); + DCHECK(s.ok()); } for (const string& f : funcs_with_grads) { - RemoveGradient(f); + Status s = RemoveGradient(f); + DCHECK(s.ok()); } } @@ -1264,8 +1274,8 @@ FunctionDef FunctionDefHelper::Define(const string& name, } for (const string& a : src.arg) { const auto iter = ret_index.find(a); - 
CHECK(iter != ret_index.end()) << "Node input '" << a << "' in '" - << src.ret[0] << "' of " << name; + CHECK(iter != ret_index.end()) + << "Node input '" << a << "' in '" << src.ret[0] << "' of " << name; n->add_input(iter->second); } for (const string& d : src.dep) { diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 3bb5638cdf2..7d0e15641d0 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -312,6 +312,14 @@ class FunctionLibraryDefinition : public OpRegistryInterface { // This operation is atomic. Status AddGradientDef(const GradientDef& grad); + // Remove function `func` from the library. Returns non-OK Status unless + // `func` is in the library. + Status RemoveFunction(const string& func); + + // Remove gradient of function `func` from the library. Returns non-OK Status + // unless `func` has a gradient. + Status RemoveGradient(const string& func); + // Adds the functions and gradients in 'other' to this function library. // Duplicate functions and gradients are ignored. // This operation is atomic. @@ -384,13 +392,6 @@ class FunctionLibraryDefinition : public OpRegistryInterface { // attr from. const FunctionDef* GetAttrImpl(const NodeDef& ndef) const; - // Remove function `func` from the library. `func` must be in the library. - void RemoveFunction(const string& func); - - // Remove gradient of function `func` from the library. `func` must have - // a gradient. - void RemoveGradient(const string& func); - // Remove all functions in `funcs` and all gradients of // functions in `funcs_with_grads` from this library. void Remove(const std::vector& funcs, @@ -656,7 +657,7 @@ bool RegisterOp(const string& op, Creator func); // Returns OK the gradient creator for the "op" is found (may be // nullptr if REGISTER_OP_NO_GRADIENT is used. 
Status GetOpGradientCreator(const string& op, Creator* creator); -}; +}; // namespace gradient // Declare explicit instantiations of GetAttr #define GET_ATTR(T) \ diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc index bd018b72438..1f670535d57 100644 --- a/tensorflow/core/framework/graph_def_util.cc +++ b/tensorflow/core/framework/graph_def_util.cc @@ -35,8 +35,8 @@ namespace tensorflow { string SummarizeGraphDef(const GraphDef& graph_def) { string ret; - strings::StrAppend(&ret, "versions = ", - ProtoShortDebugString(graph_def.versions()), ";\n"); + strings::StrAppend( + &ret, "versions = ", ProtoShortDebugString(graph_def.versions()), ";\n"); for (const NodeDef& node : graph_def.node()) { strings::StrAppend(&ret, SummarizeNodeDef(node), ";\n"); } @@ -90,9 +90,9 @@ static Status RemoveNewDefaultAttrsFromNodeDef( FindAttr(attr.first, *producer_op_def); if (producer_attr_def == nullptr) { return errors::InvalidArgument( - "Attr '", attr.first, "' missing in producer's OpDef: ", - SummarizeOpDef(*producer_op_def), " but found in node: ", - SummarizeNodeDef(*node_def)); + "Attr '", attr.first, + "' missing in producer's OpDef: ", SummarizeOpDef(*producer_op_def), + " but found in node: ", SummarizeNodeDef(*node_def)); } // ...and it has the same value as the default in producer, if (producer_attr_def->has_default_value() && diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h index 99a5d0a054e..8249059c294 100644 --- a/tensorflow/core/framework/numeric_types.h +++ b/tensorflow/core/framework/numeric_types.h @@ -44,7 +44,7 @@ typedef Eigen::QUInt16 quint16; } // namespace tensorflow namespace Eigen { -// TOOD(xpan): We probably need to overwrite more methods to have correct eigen +// TODO(xpan): We probably need to overwrite more methods to have correct eigen // behavior. E.g. loest(), is_integer, etc. See NumTraits.h in eigen. template <> struct NumTraits diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc index a4e8add6c49..2d035ab90d0 100644 --- a/tensorflow/core/framework/op_def_util.cc +++ b/tensorflow/core/framework/op_def_util.cc @@ -170,20 +170,20 @@ const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def) { return nullptr; } -#define VALIDATE(EXPR, ...) \ - do { \ - if (!(EXPR)) { \ - return errors::InvalidArgument(__VA_ARGS__, "; in OpDef: ", \ - ProtoShortDebugString(op_def)); \ - } \ +#define VALIDATE(EXPR, ...) \ + do { \ + if (!(EXPR)) { \ + return errors::InvalidArgument( \ + __VA_ARGS__, "; in OpDef: ", ProtoShortDebugString(op_def)); \ + } \ } while (false) static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def, bool output, std::set* names) { const string suffix = strings::StrCat( output ? 
" for output '" : " for input '", arg.name(), "'"); - VALIDATE(gtl::InsertIfNotPresent(names, arg.name()), "Duplicate name: ", - arg.name()); + VALIDATE(gtl::InsertIfNotPresent(names, arg.name()), + "Duplicate name: ", arg.name()); VALIDATE(HasAttrStyleType(arg), "Missing type", suffix); if (!arg.number_attr().empty()) { @@ -250,8 +250,8 @@ Status ValidateOpDef(const OpDef& op_def) { std::set names; // for detecting duplicate names for (const auto& attr : op_def.attr()) { // Validate name - VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()), "Duplicate name: ", - attr.name()); + VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()), + "Duplicate name: ", attr.name()); DataType dt; VALIDATE(!DataTypeFromString(attr.name(), &dt), "Attr can't have name ", attr.name(), " that matches a data type"); @@ -680,8 +680,8 @@ Status OpDefAddedDefaultsUnchanged(const OpDef& old_op, if (!penultimate_attr.has_default_value() || !new_attr->has_default_value()) { return errors::InvalidArgument("Missing default for attr '", - penultimate_attr.name(), "' in op: ", - SummarizeOpDef(new_op)); + penultimate_attr.name(), + "' in op: ", SummarizeOpDef(new_op)); } // Actually test that the attr's default value hasn't changed. diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc index 28809c11c58..2b9812d4fcb 100644 --- a/tensorflow/core/framework/op_def_util_test.cc +++ b/tensorflow/core/framework/op_def_util_test.cc @@ -200,10 +200,11 @@ TEST_F(ValidateOpDefTest, BadAttrDefault) { "default_value { list { s: ['foo'] } } }"), "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op " "'BadAttrDef'"); - ExpectFailure(TestBuilder(OpDefBuilder("GoodAttrDef") - .Attr("a: list(type) >=2 = [DT_STRING]")), - "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op " - "'GoodAttrDef'"); + ExpectFailure( + TestBuilder( + OpDefBuilder("GoodAttrDef").Attr("a: list(type) >=2 = [DT_STRING]")), + "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op " + "'GoodAttrDef'"); } TEST_F(ValidateOpDefTest, NoRefTypes) { @@ -213,9 +214,10 @@ TEST_F(ValidateOpDefTest, NoRefTypes) { ExpectFailure( TestBuilder(OpDefBuilder("BadAttrDef").Attr("T: type = DT_INT32_REF")), "AttrValue must not have reference type value of int32_ref"); - ExpectFailure(TestBuilder(OpDefBuilder("BadAttrDef") - .Attr("T: list(type) = [DT_STRING_REF]")), - "AttrValue must not have reference type value of string_ref"); + ExpectFailure( + TestBuilder( + OpDefBuilder("BadAttrDef").Attr("T: list(type) = [DT_STRING_REF]")), + "AttrValue must not have reference type value of string_ref"); } TEST_F(ValidateOpDefTest, BadAttrMin) { @@ -245,9 +247,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) { TF_EXPECT_OK(TestBuilder( OpDefBuilder("GoodAttrtude").Attr("x: numbertype = DT_INT32"))); // Not in list of allowed types. - ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude") - .Attr("x: numbertype = DT_STRING")), - "attr 'x' of string is not in the list of allowed values"); + ExpectFailure( + TestBuilder( + OpDefBuilder("BadAttrtude").Attr("x: numbertype = DT_STRING")), + "attr 'x' of string is not in the list of allowed values"); ExpectFailure( TestBuilder(OpDefBuilder("BadAttrtude") .Attr("x: list(realnumbertype) = [DT_COMPLEX64]")), @@ -260,9 +263,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) { TF_EXPECT_OK(TestBuilder( OpDefBuilder("GoodAttrtude").Attr("x: {'foo', 'bar'} = 'bar'"))); // Not in list of allowed strings. 
- ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude") - .Attr("x: {'foo', 'bar'} = 'baz'")), - "attr 'x' of \"baz\" is not in the list of allowed values"); + ExpectFailure( + TestBuilder( + OpDefBuilder("BadAttrtude").Attr("x: {'foo', 'bar'} = 'baz'")), + "attr 'x' of \"baz\" is not in the list of allowed values"); ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude") .Attr("x: list({'foo', 'bar'}) = ['baz']")), "attr 'x' of \"baz\" is not in the list of allowed values"); diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc index 870bbb141b8..5f2eb9d99ab 100644 --- a/tensorflow/core/framework/op_gen_lib.cc +++ b/tensorflow/core/framework/op_gen_lib.cc @@ -296,7 +296,6 @@ static void RenameInDocs(const string& from, const string& to, } } - namespace { // Initializes given ApiDef with data in OpDef. diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h index 94fe194a1a5..ff38e4b2214 100644 --- a/tensorflow/core/framework/op_gen_lib.h +++ b/tensorflow/core/framework/op_gen_lib.h @@ -47,7 +47,6 @@ string PBTxtToMultiline(StringPiece pbtxt, const std::vector& multi_line_fields); string PBTxtFromMultiline(StringPiece multiline_pbtxt); - // Takes a list of files with ApiDefs text protos, and allows you to // look up the specific ApiDef for any given op. class ApiDefMap { diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 16bf5c256f9..fd2d06be989 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -101,7 +101,8 @@ OpKernel::OpKernel(OpKernelConstruction* context) // Kernels executing on GPU/SYCL tie very few resources on the CPU where the // scheduler runs: we consider them as inexpensive. 
- expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && context->device_type() != DeviceType(DEVICE_SYCL); + expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && + context->device_type() != DeviceType(DEVICE_SYCL); } OpKernel::~OpKernel() {} diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc index 94a9d1335a7..b53b877f28d 100644 --- a/tensorflow/core/framework/op_kernel_test.cc +++ b/tensorflow/core/framework/op_kernel_test.cc @@ -510,10 +510,9 @@ TEST_F(OpKernelBuilderTest, BuilderBoth) { } REGISTER_OP("BuildTypeAttr").Attr("T: type"); -REGISTER_KERNEL_BUILDER(Name("BuildTypeAttr") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DummyKernel); +REGISTER_KERNEL_BUILDER( + Name("BuildTypeAttr").Device(DEVICE_CPU).TypeConstraint("T"), + DummyKernel); TEST_F(OpKernelBuilderTest, BuilderTypeAttr) { ExpectSuccess("BuildTypeAttr", DEVICE_CPU, {"T|type|DT_FLOAT"}); @@ -525,10 +524,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeAttr) { } REGISTER_OP("BuildTypeListAttr").Attr("T: list(type)"); -REGISTER_KERNEL_BUILDER(Name("BuildTypeListAttr") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DummyKernel); +REGISTER_KERNEL_BUILDER( + Name("BuildTypeListAttr").Device(DEVICE_CPU).TypeConstraint("T"), + DummyKernel); TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) { ExpectSuccess("BuildTypeListAttr", DEVICE_CPU, {"T|list(type)|[]"}); @@ -574,14 +572,12 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) { } REGISTER_OP("DuplicateKernelForT").Attr("T: type"); -REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DummyKernel); -REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT") - .Device(DEVICE_CPU) - .TypeConstraint("T"), - DummyKernel); +REGISTER_KERNEL_BUILDER( + Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint("T"), + DummyKernel); +REGISTER_KERNEL_BUILDER( + Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint("T"), + DummyKernel); TEST_F(OpKernelBuilderTest, DuplicateKernelForT) { const NodeDef ndef = diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc index b8c771a0a19..f84ef0f953c 100644 --- a/tensorflow/core/framework/reader_base.cc +++ b/tensorflow/core/framework/reader_base.cc @@ -178,9 +178,9 @@ void ReaderBase::Read(QueueInterface* queue, string* key, string* value, " must set *at_end=true, *produced=true, or return an error."); } if (!status.ok() && produced) { - status = errors::Internal("ReadLocked() for ", name(), - " set *produced=true *and* returned an error: ", - status.ToString()); + status = errors::Internal( + "ReadLocked() for ", name(), + " set *produced=true *and* returned an error: ", status.ToString()); } if (status.ok() && at_end) { status = OnWorkFinishedLocked(); diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h index e062adffe82..e90596980f8 100644 --- a/tensorflow/core/framework/register_types.h +++ b/tensorflow/core/framework/register_types.h @@ -179,7 +179,7 @@ limitations under the License. // Call "m" on all types. #define TF_CALL_ALL_TYPES(m) \ - TF_CALL_POD_TYPES(m) TF_CALL_string(m) TF_CALL_resource(m) + TF_CALL_POD_TYPES(m) TF_CALL_string(m) TF_CALL_resource(m) TF_CALL_variant(m) // Call "m" on POD and string types. #define TF_CALL_POD_STRING_TYPES(m) TF_CALL_POD_TYPES(m) TF_CALL_string(m) @@ -211,14 +211,12 @@ limitations under the License. 
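The TF_CALL_* macros in this header are X-macros: each one expands a caller-supplied macro m(T) once per type, which is how type-dispatch switches and kernel registrations are generated. A small sketch of the pattern, with `HANDLE_TYPE` and the helper function as illustrative names:

// Dispatch on a DataType by expanding one case per supported numeric type.
#define HANDLE_TYPE(T)           \
  case DataTypeToEnum<T>::value: \
    return sizeof(T);

size_t ElementSizeOrZero(DataType dtype) {
  switch (dtype) {
    TF_CALL_REAL_NUMBER_TYPES(HANDLE_TYPE)
    default:
      return 0;
  }
}
#undef HANDLE_TYPE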
#define TF_CALL_SYCL_double(m) #else // TENSORFLOW_SYCL_NO_DOUBLE #define TF_CALL_SYCL_double(m) TF_CALL_double(m) -#endif // TENSORFLOW_SYCL_NO_DOUBLE +#endif // TENSORFLOW_SYCL_NO_DOUBLE #ifdef __ANDROID_TYPES_SLIM__ -#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) +#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) #else // __ANDROID_TYPES_SLIM__ -#define TF_CALL_SYCL_NUMBER_TYPES(m) \ - TF_CALL_float(m) \ - TF_CALL_SYCL_double(m) -#endif // __ANDROID_TYPES_SLIM__ +#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) TF_CALL_SYCL_double(m) +#endif // __ANDROID_TYPES_SLIM__ #endif // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_ diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h index c1fe5517c69..ab35c2f0951 100644 --- a/tensorflow/core/framework/register_types_traits.h +++ b/tensorflow/core/framework/register_types_traits.h @@ -23,7 +23,7 @@ typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/platform/types.h" @@ -79,7 +79,7 @@ template <> struct proxy_type_pod { typedef float type; }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL /// If POD we use proxy_type_pod, otherwise this maps to identiy. template @@ -99,7 +99,7 @@ struct proxy_type { #ifdef TENSORFLOW_USE_SYCL #define TF_CALL_SYCL_PROXY_TYPES(m) \ TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m) -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow #endif // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_ diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc index 32b8ad784d5..de148f0bd34 100644 --- a/tensorflow/core/framework/rendezvous_test.cc +++ b/tensorflow/core/framework/rendezvous_test.cc @@ -69,9 +69,7 @@ class LocalRendezvousTest : public ::testing::Test { rendez_ = NewLocalRendezvous(); } - ~LocalRendezvousTest() override { - rendez_->Unref(); - } + ~LocalRendezvousTest() override { rendez_->Unref(); } void SchedClosure(std::function fn) { threads_.Schedule(std::move(fn)); @@ -99,8 +97,8 @@ string V(const Tensor& tensor) { Rendezvous::ParsedKey MakeKey(const string& name) { string s = Rendezvous::CreateKey("/job:mnist/replica:1/task:2/CPU:0", 7890, - "/job:mnist/replica:1/task:2/device:GPU:0", name, - FrameAndIter(0, 0)); + "/job:mnist/replica:1/task:2/device:GPU:0", + name, FrameAndIter(0, 0)); Rendezvous::ParsedKey k; TF_EXPECT_OK(Rendezvous::ParseKey(s, &k)); return k; diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h index d552ec1693f..e3cc848a169 100644 --- a/tensorflow/core/framework/shape_inference.h +++ b/tensorflow/core/framework/shape_inference.h @@ -32,7 +32,7 @@ class ShapeRefinerTest; namespace grappler { class GraphProperties; class SymbolicShapeManager; -} +} // namespace grappler namespace shape_inference { diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc index a9b63ca60e4..f48a7b9c47d 100644 --- a/tensorflow/core/framework/shape_inference_test.cc +++ b/tensorflow/core/framework/shape_inference_test.cc @@ -760,7 +760,10 @@ TEST_F(ShapeInferenceTest, MergePrefix) { NodeDef def; InferenceContext c(kVersion, &def, MakeOpDef(4, 2), { - Unknown(), S({-1, 2}), S({1, -1, 3}), S({2, 4}), + Unknown(), + S({-1, 2}), + S({1, -1, 3}), + S({2, 
4}), }, {}, {}, {}); diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc index d8a9c0bac5b..d7517bb311d 100644 --- a/tensorflow/core/framework/tensor_shape_test.cc +++ b/tensorflow/core/framework/tensor_shape_test.cc @@ -582,7 +582,8 @@ TEST(TensorShapeTest, Large) { TEST(TensorShapeTest, Overflow) { int64 one = 1; std::vector> overflows = { - {1 << 30, 1 << 30, 1 << 30}, {1 << 5, (one << 60) + 1}, + {1 << 30, 1 << 30, 1 << 30}, + {1 << 5, (one << 60) + 1}, }; for (const auto& overflow : overflows) { TensorShapeProto proto; diff --git a/tensorflow/core/framework/tensor_testutil.cc b/tensorflow/core/framework/tensor_testutil.cc index a8d14123009..8f480d65f25 100644 --- a/tensorflow/core/framework/tensor_testutil.cc +++ b/tensorflow/core/framework/tensor_testutil.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include "tensorflow/core/framework/tensor_testutil.h" +#include namespace tensorflow { namespace test { diff --git a/tensorflow/core/framework/tensor_types.h b/tensorflow/core/framework/tensor_types.h index 921f88dc0ba..a5c1a56bfc0 100644 --- a/tensorflow/core/framework/tensor_types.h +++ b/tensorflow/core/framework/tensor_types.h @@ -25,7 +25,8 @@ template struct TTypes { // Rank- tensor of scalar type T. typedef Eigen::TensorMap, - Eigen::Aligned> Tensor; + Eigen::Aligned> + Tensor; typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstTensor; @@ -33,35 +34,42 @@ struct TTypes { // Unaligned Rank- tensor of scalar type T. typedef Eigen::TensorMap > UnalignedTensor; - typedef Eigen::TensorMap > UnalignedConstTensor; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstTensor; typedef Eigen::TensorMap, - Eigen::Aligned> Tensor32Bit; + Eigen::Aligned> + Tensor32Bit; // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. typedef Eigen::TensorMap< Eigen::TensorFixedSize, Eigen::RowMajor, IndexType>, - Eigen::Aligned> Scalar; + Eigen::Aligned> + Scalar; typedef Eigen::TensorMap, Eigen::RowMajor, IndexType>, - Eigen::Aligned> ConstScalar; + Eigen::Aligned> + ConstScalar; // Unaligned Scalar tensor of scalar type T. - typedef Eigen::TensorMap, Eigen::RowMajor, IndexType> > UnalignedScalar; + typedef Eigen::TensorMap< + Eigen::TensorFixedSize, Eigen::RowMajor, IndexType> > + UnalignedScalar; typedef Eigen::TensorMap, Eigen::RowMajor, IndexType> > UnalignedConstScalar; // Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap, - Eigen::Aligned> Flat; + Eigen::Aligned> + Flat; typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstFlat; typedef Eigen::TensorMap, - Eigen::Aligned> Vec; + Eigen::Aligned> + Vec; typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstVec; @@ -69,16 +77,19 @@ struct TTypes { // Unaligned Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap > UnalignedFlat; - typedef Eigen::TensorMap > UnalignedConstFlat; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstFlat; typedef Eigen::TensorMap > UnalignedVec; typedef Eigen::TensorMap< - Eigen::Tensor > UnalignedConstVec; + Eigen::Tensor > + UnalignedConstVec; // Rank-2 tensor (matrix) of scalar type T. 
typedef Eigen::TensorMap, - Eigen::Aligned> Matrix; + Eigen::Aligned> + Matrix; typedef Eigen::TensorMap< Eigen::Tensor, Eigen::Aligned> ConstMatrix; @@ -86,8 +97,9 @@ struct TTypes { // Unaligned Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap > UnalignedMatrix; - typedef Eigen::TensorMap > UnalignedConstMatrix; + typedef Eigen::TensorMap< + Eigen::Tensor > + UnalignedConstMatrix; }; typedef typename TTypes::Tensor32Bit::Index Index32; diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc index 5ddc9865633..60f2b4135a6 100644 --- a/tensorflow/core/framework/types_test.cc +++ b/tensorflow/core/framework/types_test.cc @@ -70,8 +70,8 @@ TEST(TypesTest, kDataTypeRefOffset) { << "Extra reference enum " << enum_descriptor->FindValueByNumber(e_ref)->name() << " without corresponding base enum with value " << e; - ASSERT_LT(DataType_MAX, e_ref) << "Gap in reference types, missing value for " - << e_ref; + ASSERT_LT(DataType_MAX, e_ref) + << "Gap in reference types, missing value for " << e_ref; // Make sure there are no enums defined after the last regular type before // the first reference type. diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc index 4118f14f8bf..4f3a6ec38cb 100644 --- a/tensorflow/core/graph/costmodel.cc +++ b/tensorflow/core/graph/costmodel.cc @@ -158,8 +158,8 @@ void CostModel::SetNumOutputs(const Node* node, int num_outputs) { Ensure(id, 0); auto perslot = &slot_bytes_[id]; if (!perslot->empty()) { - CHECK_EQ(num_outputs, perslot->size()) << "Cannot resize slot_bytes, node=" - << node->name(); + CHECK_EQ(num_outputs, perslot->size()) + << "Cannot resize slot_bytes, node=" << node->name(); } Ensure(id, num_outputs); } @@ -252,9 +252,12 @@ void CostModel::RecordMaxMemorySize(const Node* node, int output_slot, const DataType& dtype) { const int id = Id(node); if (id < 0) return; - CHECK_LT(output_slot, node->num_outputs()) - << "Unexpected output slot for node " << node->DebugString() << ". Got " - << output_slot << " but its num_outputs is " << node->num_outputs(); + if (output_slot >= node->num_outputs()) { + LOG(ERROR) << "Unexpected output slot for node " << node->DebugString() + << ". Got " << output_slot << " but its num_outputs is " + << node->num_outputs(); + return; + } Ensure(id, node->num_outputs()); auto& current_max = max_mem_usage_[id].output_port_mem[output_slot]; // If the memory allocator doesn't track memory usage, let's infer a lower diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h index c60a946c2cc..9b703e46938 100644 --- a/tensorflow/core/graph/costmodel.h +++ b/tensorflow/core/graph/costmodel.h @@ -198,7 +198,7 @@ class CostModel { // Cumulative execution time. std::vector time_; // Cumulative Bytes output on each channel. 
- std::vector > slot_bytes_; + std::vector> slot_bytes_; // Maximum execution time std::vector max_exec_time_; @@ -217,7 +217,7 @@ class CostModel { }; std::vector max_mem_usage_; - std::vector > output_port_alloc_ids_; + std::vector> output_port_alloc_ids_; std::set persistent_alloc_ids_; std::map> persistent_alloc_ids_by_devices_; diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index b620127d907..93d8dd6f110 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -62,8 +62,8 @@ class Node; class VersionDef; class WhileContext; -class NeighborIter; // Declared below -class NodeIter; // Declared below +class NeighborIter; // Declared below +class NodeIter; // Declared below class NodeProperties; // Defined in .cc class Node { diff --git a/tensorflow/core/graph/graph_def_builder_test.cc b/tensorflow/core/graph/graph_def_builder_test.cc index e85de71ef79..e928c81b453 100644 --- a/tensorflow/core/graph/graph_def_builder_test.cc +++ b/tensorflow/core/graph/graph_def_builder_test.cc @@ -26,7 +26,6 @@ namespace tensorflow { namespace { TEST(GraphDefBuilderTest, Version) { - // Verify that our assertions will be nontrivial ASSERT_LT(0, TF_GRAPH_DEF_VERSION); diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h index 3df981437af..1b99d54e8e3 100644 --- a/tensorflow/core/graph/mkl_graph_util.h +++ b/tensorflow/core/graph/mkl_graph_util.h @@ -21,102 +21,101 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { - // Since our ops are going to produce and also consume N addition tensors - // (Mkl) for N Tensorflow tensors, we can have following different - // orderings among these 2N tensors. - // - // E.g., for Tensorflow tensors A, B, and C, our ops will produce and - // consume A_m, B_m, and C_m additionally. - // - // INTERLEAVED: in this case 2N tensors are interleaved. So for above - // example, the ordering looks like: A, A_m, B, B_m, C, C_m. - // - // CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed - // by N Mkl tensors. So for above example, the ordering looks - // like: A, B, C, A_m, B_m, C_m - // - // Following APIs map index of original Tensorflow tensors to their - // appropriate position based on selected ordering. For contiguous ordering, - // we need to know the total number of tensors (parameter total). - // - typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering; - // NOTE: Currently, we use contiguous ordering. If you change this, then you - // would need to change Mkl op definitions in nn_ops.cc. - static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; +// Since our ops are going to produce and also consume N addition tensors +// (Mkl) for N Tensorflow tensors, we can have following different +// orderings among these 2N tensors. +// +// E.g., for Tensorflow tensors A, B, and C, our ops will produce and +// consume A_m, B_m, and C_m additionally. +// +// INTERLEAVED: in this case 2N tensors are interleaved. So for above +// example, the ordering looks like: A, A_m, B, B_m, C, C_m. +// +// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed +// by N Mkl tensors. So for above example, the ordering looks +// like: A, B, C, A_m, B_m, C_m +// +// Following APIs map index of original Tensorflow tensors to their +// appropriate position based on selected ordering. For contiguous ordering, +// we need to know the total number of tensors (parameter total). 
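For reference, with the contiguous ordering selected below, three TensorFlow tensors A, B, C plus their Mkl metadata tensors are laid out as A, B, C, A_m, B_m, C_m. The following is a minimal sketch of what the index helpers defined further down are expected to return for that layout; it is illustrative only (not part of this patch) and assumes an INTEL_MKL build with the default TENSORS_CONTIGUOUS ordering.

#include <cassert>
#include "tensorflow/core/graph/mkl_graph_util.h"

int main() {
  const int total_tensors = 6;  // A, B, C plus A_m, B_m, C_m
  for (int n = 0; n < total_tensors / 2; ++n) {
    // Data tensors keep their original index n in the contiguous layout ...
    assert(tensorflow::GetTensorDataIndex(n, total_tensors) == n);
    // ... and the matching Mkl metadata tensor sits total_tensors/2 slots later.
    assert(tensorflow::GetTensorMetaDataIndex(n, total_tensors) ==
           n + total_tensors / 2);
  }
  return 0;
}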
+// +typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering; +// NOTE: Currently, we use contiguous ordering. If you change this, then you +// would need to change Mkl op definitions in nn_ops.cc. +static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS; - // Get index of MetaData tensor from index 'n' of Data tensor. - inline int DataIndexToMetaDataIndex(int n, int total_tensors) { - if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { - // For interleaved ordering, Mkl tensor follows immediately after - // Tensorflow tensor. - return n + 1; - } else { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); - // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away. - return n + total_tensors / 2; - } +// Get index of MetaData tensor from index 'n' of Data tensor. +inline int DataIndexToMetaDataIndex(int n, int total_tensors) { + if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { + // For interleaved ordering, Mkl tensor follows immediately after + // Tensorflow tensor. + return n + 1; + } else { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away. + return n + total_tensors / 2; } +} - int inline GetTensorDataIndex(int n, int total_tensors) { - if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { - return 2 * n; // index corresponding to nth input/output tensor - } else { - CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); - return n; - } - } +int inline GetTensorDataIndex(int n, int total_tensors) { + if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { + return 2 * n; // index corresponding to nth input/output tensor + } else { + CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); + return n; + } +} - int inline GetTensorMetaDataIndex(int n, int total_tensors) { - // Get index for TensorData first and then use mapping function - // to get TensorMetaData index from TensorData index. - int tidx = GetTensorDataIndex(n, total_tensors); - return DataIndexToMetaDataIndex(tidx, total_tensors); - } +int inline GetTensorMetaDataIndex(int n, int total_tensors) { + // Get index for TensorData first and then use mapping function + // to get TensorMetaData index from TensorData index. + int tidx = GetTensorDataIndex(n, total_tensors); + return DataIndexToMetaDataIndex(tidx, total_tensors); +} namespace mkl_op_registry { - static const char* kMklOpLabel = "MklOp"; - static const char* kMklOpLabelPattern = "label='MklOp'"; - // Prefix that we add to Tensorflow op name to construct Mkl op name. - static const char* const kMklOpPrefix = "_Mkl"; +static const char* kMklOpLabel = "MklOp"; +static const char* kMklOpLabelPattern = "label='MklOp'"; +// Prefix that we add to Tensorflow op name to construct Mkl op name. +static const char* const kMklOpPrefix = "_Mkl"; - // Get the name of Mkl op from original TensorFlow op - // We prefix 'Mkl' to the original op to get Mkl op. - inline string GetMklOpName(const string& name) { - return string(kMklOpPrefix) + name; +// Get the name of Mkl op from original TensorFlow op +// We prefix 'Mkl' to the original op to get Mkl op. +inline string GetMklOpName(const string& name) { + return string(kMklOpPrefix) + name; +} + +// Check whether opname with type T is registered as MKL-compliant. 
+// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as Mkl op; false otherwise +static inline bool IsMklOp(const std::string& op_name, DataType T) { + string kernel = KernelsRegisteredForOp(op_name); + bool result = + kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT); + return result; +} + +// Check whether opname with type T is registered as MKL-compliant and +// is element-wise. +// +// @input: name of the op +// @input: T datatype to be used for checking op +// @return: true if opname is registered as element-wise Mkl op; +// false otherwise +static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) { + if (!IsMklOp(op_name, T)) { + return false; } + bool result = (0 == op_name.compare(GetMklOpName("Add")) || + 0 == op_name.compare(GetMklOpName("Sub")) || + 0 == op_name.compare(GetMklOpName("Mul")) || + 0 == op_name.compare(GetMklOpName("Maximum")) || + 0 == op_name.compare(GetMklOpName("SquaredDifference"))); - // Check whether opname with type T is registered as MKL-compliant. - // - // @input: name of the op - // @input: T datatype to be used for checking op - // @return: true if opname is registered as Mkl op; false otherwise - static inline bool IsMklOp(const std::string& op_name, DataType T) { - string kernel = KernelsRegisteredForOp(op_name); - bool result = - kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT); - return result; - } - - // Check whether opname with type T is registered as MKL-compliant and - // is element-wise. - // - // @input: name of the op - // @input: T datatype to be used for checking op - // @return: true if opname is registered as element-wise Mkl op; - // false otherwise - static inline bool IsMklElementWiseOp(const std::string& op_name, - DataType T) { - if (!IsMklOp(op_name, T)) { - return false; - } - bool result = (0 == op_name.compare(GetMklOpName("Add")) || - 0 == op_name.compare(GetMklOpName("Sub")) || - 0 == op_name.compare(GetMklOpName("Mul")) || - 0 == op_name.compare(GetMklOpName("Maximum")) || - 0 == op_name.compare(GetMklOpName("SquaredDifference"))); - - return result; - } + return result; +} } // namespace mkl_op_registry } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 55bc401b9d6..0e8a1cb26ce 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -37,12 +37,12 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/graph/mkl_layout_pass.h" #include "tensorflow/core/graph/mkl_graph_util.h" +#include "tensorflow/core/graph/mkl_layout_pass.h" namespace tensorflow { -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // This pass implements rewriting of graph to support following scenarios: // (A) Merging nodes in the graph @@ -281,7 +281,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter"; csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias"; csinfo_.mkl_conv2d_with_bias_backprop_bias = - "_MklConv2DWithBiasBackpropBias"; + "_MklConv2DWithBiasBackpropBias"; csinfo_.relu = "Relu"; csinfo_.relu_grad = "ReluGrad"; csinfo_.reshape = "Reshape"; @@ -297,10 +297,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // End - element-wise ops. See note above. // NOTE: names are alphabetically sorted. 
- rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN, - AddNRewrite, nullptr}); - rinfo_.push_back({csinfo_.add, - mkl_op_registry::GetMklOpName(csinfo_.add), + rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), + CopyAttrsAddN, AddNRewrite, nullptr}); + rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add), CopyAttrsDataType, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.avg_pool, mkl_op_registry::GetMklOpName(csinfo_.avg_pool), @@ -337,14 +336,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.fused_batch_norm, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm), CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.fused_batch_norm_grad, - mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), - CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); + rinfo_.push_back( + {csinfo_.fused_batch_norm_grad, + mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), + CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.identity, mkl_op_registry::GetMklOpName(csinfo_.identity), CopyAttrsIdentity, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.lrn, - mkl_op_registry::GetMklOpName(csinfo_.lrn), + rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn), CopyAttrsLRN, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.lrn_grad, mkl_op_registry::GetMklOpName(csinfo_.lrn_grad), @@ -358,11 +357,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.maximum, mkl_op_registry::GetMklOpName(csinfo_.maximum), CopyAttrsDataType, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.mul, - mkl_op_registry::GetMklOpName(csinfo_.mul), + rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul), CopyAttrsDataType, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.relu, - mkl_op_registry::GetMklOpName(csinfo_.relu), + rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu), CopyAttrsDataType, AlwaysRewrite, nullptr}); rinfo_.push_back({csinfo_.relu_grad, mkl_op_registry::GetMklOpName(csinfo_.relu_grad), @@ -373,8 +370,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.squared_difference, mkl_op_registry::GetMklOpName(csinfo_.squared_difference), CopyAttrsDataType, AlwaysRewrite, nullptr}); - rinfo_.push_back({csinfo_.sub, - mkl_op_registry::GetMklOpName(csinfo_.sub), + rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub), CopyAttrsDataType, AlwaysRewrite, nullptr}); // Add info about which ops to add workspace edge to and the slots. 
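Most rinfo_ entries above pair a TensorFlow op name with its Mkl counterpart, and the counterpart is obtained from mkl_op_registry::GetMklOpName(), which simply prepends the "_Mkl" prefix declared in mkl_graph_util.h. A small sketch of that naming convention, illustrative only (not part of this patch) and assuming an INTEL_MKL build:

#include <cassert>
#include "tensorflow/core/graph/mkl_graph_util.h"

int main() {
  using tensorflow::mkl_op_registry::GetMklOpName;
  // "Conv2D" is rewritten to "_MklConv2D", "Relu" to "_MklRelu", and so on.
  assert(GetMklOpName("Conv2D") == "_MklConv2D");
  assert(GetMklOpName("Relu") == "_MklRelu");
  return 0;
}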
@@ -388,9 +384,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul, IsBiasAddGradInMatMulContext}; - biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad, - csinfo_.mkl_conv2d_with_bias, - IsBiasAddGradInConv2DWithBiasContext}; + biasaddgrad_conv2dwithbias_context_ = { + csinfo_.bias_add_grad, csinfo_.mkl_conv2d_with_bias, + IsBiasAddGradInConv2DWithBiasContext}; cinfo_.push_back(&biasaddgrad_matmul_context_); cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_); @@ -410,9 +406,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { /// Structure to specify the context information used in a node rewrite rule typedef struct { - string node; // Name of the node to be rewritten - string fwd; // Name of the node in the forward pass that this node - // corresponds to + string node; // Name of the node to be rewritten + string fwd; // Name of the node in the forward pass that this node + // corresponds to std::function context_match_fn; } ContextInfo; @@ -615,14 +611,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector ksize, strides; CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true); CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true); - CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), - true); + CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true); CHECK_EQ(FormatFromString(data_format_str, &data_format), true); // Condition that specifies non-batch-wise and non-depth-wise pooling. - if (GetTensorDim(ksize, data_format, 'N') == 1 && + if (GetTensorDim(ksize, data_format, 'N') == 1 && GetTensorDim(strides, data_format, 'N') == 1 && - GetTensorDim(ksize, data_format, 'C') == 1 && + GetTensorDim(ksize, data_format, 'C') == 1 && GetTensorDim(strides, data_format, 'C') == 1) { return true; } @@ -785,8 +780,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { for (const Edge* fe : first_inp_of_filter->out_edges()) { if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias && fe->dst_input() == 0) { - VLOG(1) << "MklLayoutRewritePass: found " - << fe->dst()->DebugString() + VLOG(1) << "MklLayoutRewritePass: found " << fe->dst()->DebugString() << " as the forward node for matching context, backward" << " node is: " << n->DebugString(); *fwd_node = fe->dst(); @@ -803,13 +797,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // // @return - true (if BiasAddGrad is associated with MatMul); // false otherwise. - static bool IsBiasAddGradInMatMulContext(const Node* n, - const Node** fwd_node, + static bool IsBiasAddGradInMatMulContext(const Node* n, const Node** fwd_node, void* ci) { return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci)); } - // Rewrite rule that uses context-information for matching, // used in scenario 2. // @@ -880,10 +872,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // @output output_nodes - the list of new nodes creating Mkl tensors // // @return None - void GetNodesProducingMklTensorList(std::unique_ptr* g, - Node* orig_node, const gtl::InlinedVector, 4>& inputs, - int* input_idx, int list_length, - std::vector* output_nodes); + void GetNodesProducingMklTensorList( + std::unique_ptr* g, Node* orig_node, + const gtl::InlinedVector, 4>& inputs, + int* input_idx, int list_length, + std::vector* output_nodes); // Get a node that will feed an Mkl tensor to the new // node that we are constructing. 
The output node could be (1) 'n' @@ -900,7 +893,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // will feed the tensor // @return None void GetNodeProducingMklTensor(std::unique_ptr* g, Node* orig_node, - Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot); + Node* n, int n_output_slot, Node** mkl_node, + int* mkl_node_output_slot); // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb' // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are @@ -970,9 +964,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_; MklLayoutRewritePass::ContextInfo - MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_; + MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_; MklLayoutRewritePass::ContextInfo - MklLayoutRewritePass::biasaddgrad_matmul_context_; + MklLayoutRewritePass::biasaddgrad_matmul_context_; std::vector MklLayoutRewritePass::cinfo_; // We register Mkl rewrite pass for phase 1 in post partitioning group. @@ -1041,13 +1035,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, TensorShape dummy_shape({8}); dummy_shape.AsProto(proto.mutable_tensor_shape()); TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") - .Attr("value", proto) - .Attr("dtype", dt) - .Device(orig_node->def().device()) // We place this node on - // the same device as the - // device of the original - // node. - .Finalize(&**g, out)); + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orig_node->def().device()) // We place this node on + // the same device as the + // device of the original + // node. + .Finalize(&**g, out)); // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -1060,8 +1054,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, // the same frame. if (orig_node->num_inputs() > 0) { Node* orig_input0 = nullptr; - TF_CHECK_OK(orig_node->input_node(0, - const_cast(&orig_input0))); + TF_CHECK_OK( + orig_node->input_node(0, const_cast(&orig_input0))); CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out)); } @@ -1069,11 +1063,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, } void MklLayoutRewritePass::GetNodesProducingMklTensorList( - std::unique_ptr* g, - Node* orig_node, - const gtl::InlinedVector, 4>& inputs, - int* input_idx, int list_length, - std::vector* output_nodes) { + std::unique_ptr* g, Node* orig_node, + const gtl::InlinedVector, 4>& inputs, int* input_idx, + int list_length, std::vector* output_nodes) { CHECK_LT(*input_idx, inputs.size()); CHECK_GT(list_length, 0); CHECK_NOTNULL(output_nodes); @@ -1090,8 +1082,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( int mkl_node_output_slot = 0; GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node, &mkl_node_output_slot); - output_nodes->push_back(NodeBuilder::NodeOut(mkl_node, - mkl_node_output_slot)); + output_nodes->push_back( + NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot)); (*input_idx)++; list_length--; } @@ -1101,9 +1093,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( // node that we are constructing. An input node could be (1) 'n' // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor // if 'n' is not an Mkl layer. 
-void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr* g, - Node* orig_node, Node* n, - int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) { +void MklLayoutRewritePass::GetNodeProducingMklTensor( + std::unique_ptr* g, Node* orig_node, Node* n, int n_output_slot, + Node** mkl_node, int* mkl_node_output_slot) { CHECK_NOTNULL(n); CHECK_NOTNULL(mkl_node); CHECK_NOTNULL(mkl_node_output_slot); @@ -1234,8 +1226,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs( if (ArgIsList(arg)) { std::vector new_node_inputs; int N = GetTensorListLength(arg, old_node); - GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, - N, &new_node_inputs); + GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N, + &new_node_inputs); nb->Input(new_node_inputs); nn_slot_idx++; } else { @@ -1336,13 +1328,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( TensorShape dummy_shape({1}); dummy_shape.AsProto(proto.mutable_tensor_shape()); TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") - .Attr("value", proto) - .Attr("dtype", dt) - .Device(orig_node->def().device()) // We place this node on - // same the device as the - // device of the original - // node. - .Finalize(&**g, out)); + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orig_node->def().device()) // We place this node on + // same the device as the + // device of the original + // node. + .Finalize(&**g, out)); // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -1355,8 +1347,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( // the same frame. if (orig_node->num_inputs() > 0) { Node* orig_input0 = nullptr; - TF_CHECK_OK(orig_node->input_node(0, - const_cast(&orig_input0))); + TF_CHECK_OK( + orig_node->input_node(0, const_cast(&orig_input0))); CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out)); } @@ -1374,7 +1366,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); for (auto ws : wsinfo_) { if (orig_node->type_string() == ws.fwd_op && - mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { + mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { // If this op is a fwd op, then we need to check if there is an // edge from this node's fwd_slot to bwdop's bwd_slot. If there is // an edge, then we just add an attribute on this node for setting @@ -1400,8 +1393,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( nb->Attr("workspace_enabled", false); } } else if (orig_node->type_string() == ws.bwd_op && - mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), - T)) { + mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(orig_node->type_string()), + T)) { // If this op is a bwd op, then we need to add workspace edge and // it's Mkl tensor edge between its corresponding fwd op and this // op. Corresponding fwd op is specified in 'fwd_op' field of @@ -1416,7 +1410,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( if (e->src_output() == ws.fwd_slot && // We would have rewritten the forward op, so we need to use // GetMklOpName call to get its Mkl name. 
- e->src()->type_string() == mkl_op_registry::GetMklOpName(ws.fwd_op) && + e->src()->type_string() == + mkl_op_registry::GetMklOpName(ws.fwd_op) && e->dst_input() == ws.bwd_slot) { nb->Attr("workspace_enabled", true); CHECK_NOTNULL(ws_tensors); @@ -1593,7 +1588,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node, } void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node, - NodeBuilder* nb) { + NodeBuilder* nb) { DataType T; DataType Tshape; @@ -1869,8 +1864,8 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr* g, Node* succ, if (e->IsControlEdge()) { CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst())); } else { - CHECK_NOTNULL((*g)->AddEdge(new_node, e->src_output(), e->dst(), - e->dst_input())); + CHECK_NOTNULL( + (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input())); } } @@ -1941,9 +1936,9 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, // and leave BiasAddGrad as it is. But we check for this condition // when we check for node rewrite rule. So we should not even come // here for MatMul. So we will fail now. - return Status( - error::Code::INVALID_ARGUMENT, - "No rewrite is required for BiasAddGrad for MatMul context."); + return Status( + error::Code::INVALID_ARGUMENT, + "No rewrite is required for BiasAddGrad for MatMul context."); } } @@ -2012,9 +2007,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, if (e->IsControlEdge()) { CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst())); } else { - CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(), - e->src()->num_outputs()), - e->dst(), e->dst_input())); + CHECK_NOTNULL((*g)->AddEdge( + new_node, + GetTensorDataIndex(e->src_output(), e->src()->num_outputs()), + e->dst(), e->dst_input())); } } @@ -2070,7 +2066,8 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // BiasAddGrad is not an Mkl layer, so we make an exception for it. 
if (n->type_string() != csinfo_.bias_add_grad) { - if (!mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), T)) { + if (!mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(n->type_string()), T)) { return nullptr; } } @@ -2186,8 +2183,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr* g) { return MklLayoutRewritePass().RunPass(g); } -Status MklLayoutRewritePass::Run( - const GraphOptimizationPassOptions& options) { +Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) { if (options.graph == nullptr && options.partition_graphs == nullptr) { return Status::OK(); } @@ -2215,7 +2211,7 @@ Status MklLayoutRewritePass::Run( return Status::OK(); } -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML // This pass implements rewriting of graph to support following scenarios: // (A) Merging nodes in the graph @@ -2421,7 +2417,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.conv2d_grad_input = "Conv2DBackpropInput"; csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter"; csinfo_.conv2d_grad_filter_with_bias = - "__MklDummyConv2DBackpropFilterWithBias"; + "__MklDummyConv2DBackpropFilterWithBias"; csinfo_.fused_batch_norm = "FusedBatchNorm"; csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad"; csinfo_.identity = "Identity"; @@ -2435,11 +2431,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter"; csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias"; csinfo_.mkl_conv2d_grad_filter_with_bias = - "_MklConv2DBackpropFilterWithBias"; + "_MklConv2DBackpropFilterWithBias"; csinfo_.relu = "Relu"; csinfo_.relu_grad = "ReluGrad"; - csinfo_.tanh = "Tanh"; - csinfo_.tanh_grad = "TanhGrad"; + csinfo_.tanh = "Tanh"; + csinfo_.tanh_grad = "TanhGrad"; csinfo_.reshape = "Reshape"; csinfo_.softmax = "Softmax"; csinfo_.split = "Split"; @@ -2456,9 +2452,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // NOTE: names are alphabetically sorted. 
rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN, AddNRewrite}); - /* rinfo_.push_back({csinfo_.add, + rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add), - CopyAttrsDataType, AlwaysRewrite}); */ + CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.avg_pool, mkl_op_registry::GetMklOpName(csinfo_.avg_pool), CopyAttrsPooling, AlwaysRewrite}); @@ -2474,29 +2470,28 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.conv2d, mkl_op_registry::GetMklOpName(csinfo_.conv2d), CopyAttrsConv2D, AlwaysRewrite}); - rinfo_.push_back({csinfo_.conv2d_with_bias, - csinfo_.mkl_conv2d_with_bias, + rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias, CopyAttrsConv2D, AlwaysRewrite}); rinfo_.push_back({csinfo_.conv2d_grad_filter, mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter), CopyAttrsConv2D, AlwaysRewrite}); rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias, - csinfo_.mkl_conv2d_grad_filter_with_bias, - CopyAttrsConv2D, AlwaysRewrite}); + csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv2D, + AlwaysRewrite}); rinfo_.push_back({csinfo_.conv2d_grad_input, mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input), CopyAttrsConv2D, AlwaysRewrite}); rinfo_.push_back({csinfo_.fused_batch_norm, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm), CopyAttrsFusedBatchNorm, AlwaysRewrite}); - rinfo_.push_back({csinfo_.fused_batch_norm_grad, - mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), - CopyAttrsFusedBatchNorm, AlwaysRewrite}); + rinfo_.push_back( + {csinfo_.fused_batch_norm_grad, + mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), + CopyAttrsFusedBatchNorm, AlwaysRewrite}); rinfo_.push_back({csinfo_.identity, mkl_op_registry::GetMklOpName(csinfo_.identity), CopyAttrsDataType, AlwaysRewrite}); - rinfo_.push_back({csinfo_.lrn, - mkl_op_registry::GetMklOpName(csinfo_.lrn), + rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn), CopyAttrsLRN, AlwaysRewrite}); rinfo_.push_back({csinfo_.lrn_grad, mkl_op_registry::GetMklOpName(csinfo_.lrn_grad), @@ -2507,14 +2502,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.max_pool_grad, mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad), CopyAttrsPooling, AlwaysRewrite}); - /* + rinfo_.push_back({csinfo_.maximum, mkl_op_registry::GetMklOpName(csinfo_.maximum), CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul), CopyAttrsDataType, AlwaysRewrite}); - */ rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu), CopyAttrsDataType, AlwaysRewrite}); @@ -2535,14 +2529,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back({csinfo_.softmax, mkl_op_registry::GetMklOpName(csinfo_.softmax), CopyAttrsDataType, AlwaysRewrite}); - /* + rinfo_.push_back({csinfo_.squared_difference, mkl_op_registry::GetMklOpName(csinfo_.squared_difference), CopyAttrsDataType, AlwaysRewrite}); rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub), CopyAttrsDataType, AlwaysRewrite}); - */ + // Add info about which ops to add workspace edge to and the slots. 
wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3}); @@ -2550,8 +2544,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // Add a rule for merging nodes minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add, - csinfo_.conv2d_with_bias, - GetConv2DOrBiasAdd}); + csinfo_.conv2d_with_bias, GetConv2DOrBiasAdd}); minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad, csinfo_.conv2d_grad_filter_with_bias, @@ -2846,9 +2839,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // Default rewrite rule to be used in scenario 1 for rewrite. // @return - true (since we want to always rewrite) - static bool AlwaysRewrite(const Node* n) { - return true; - } + static bool AlwaysRewrite(const Node* n) { return true; } // Check if we are performing pooling on depth or batch. If it is, then we // do not rewrite MaxPool node to Mkl version. @@ -2862,14 +2853,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass { std::vector ksize, strides; CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true); CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true); - CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), - true); + CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true); CHECK_EQ(FormatFromString(data_format_str, &data_format), true); // Condition that specifies non-batch-wise and non-depth-wise pooling. - if (GetTensorDim(ksize, data_format, 'N') == 1 && + if (GetTensorDim(ksize, data_format, 'N') == 1 && GetTensorDim(strides, data_format, 'N') == 1 && - GetTensorDim(ksize, data_format, 'C') == 1 && + GetTensorDim(ksize, data_format, 'C') == 1 && GetTensorDim(strides, data_format, 'C') == 1) { return true; } @@ -2941,10 +2931,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // @output output_nodes - the list of new nodes creating Mkl tensors // // @return None - void GetNodesProducingMklTensorList(std::unique_ptr* g, - Node* orig_node, const gtl::InlinedVector, 4>& inputs, - int* input_idx, int list_length, - std::vector* output_nodes); + void GetNodesProducingMklTensorList( + std::unique_ptr* g, Node* orig_node, + const gtl::InlinedVector, 4>& inputs, + int* input_idx, int list_length, + std::vector* output_nodes); // Get a node that will feed an Mkl tensor to the new // node that we are constructing. The output node could be (1) 'n' @@ -2961,7 +2952,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // will feed the tensor // @return None void GetNodeProducingMklTensor(std::unique_ptr* g, Node* orig_node, - Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot); + Node* n, int n_output_slot, Node** mkl_node, + int* mkl_node_output_slot); // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb' // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are @@ -3096,13 +3088,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, TensorShape dummy_shape({8}); dummy_shape.AsProto(proto.mutable_tensor_shape()); TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") - .Attr("value", proto) - .Attr("dtype", dt) - .Device(orig_node->def().device()) // We place this node on - // the same device as the - // device of the original - // node. - .Finalize(&**g, out)); + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orig_node->def().device()) // We place this node on + // the same device as the + // device of the original + // node. 
+ .Finalize(&**g, out)); // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -3115,8 +3107,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, // the same frame. if (orig_node->num_inputs() > 0) { Node* orig_input0 = nullptr; - TF_CHECK_OK(orig_node->input_node(0, - const_cast(&orig_input0))); + TF_CHECK_OK( + orig_node->input_node(0, const_cast(&orig_input0))); // Allow duplicate while adding control edge as it would fail (return // NULL) if we try to add duplicate edge. CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true)); @@ -3126,11 +3118,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, } void MklLayoutRewritePass::GetNodesProducingMklTensorList( - std::unique_ptr* g, - Node* orig_node, - const gtl::InlinedVector, 4>& inputs, - int* input_idx, int list_length, - std::vector* output_nodes) { + std::unique_ptr* g, Node* orig_node, + const gtl::InlinedVector, 4>& inputs, int* input_idx, + int list_length, std::vector* output_nodes) { CHECK_LT(*input_idx, inputs.size()); CHECK_GT(list_length, 0); CHECK_NOTNULL(output_nodes); @@ -3147,8 +3137,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( int mkl_node_output_slot = 0; GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node, &mkl_node_output_slot); - output_nodes->push_back(NodeBuilder::NodeOut(mkl_node, - mkl_node_output_slot)); + output_nodes->push_back( + NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot)); (*input_idx)++; list_length--; } @@ -3158,9 +3148,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( // node that we are constructing. An input node could be (1) 'n' // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor // if 'n' is not an Mkl layer. -void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr* g, - Node* orig_node, Node* n, - int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) { +void MklLayoutRewritePass::GetNodeProducingMklTensor( + std::unique_ptr* g, Node* orig_node, Node* n, int n_output_slot, + Node** mkl_node, int* mkl_node_output_slot) { CHECK_NOTNULL(n); CHECK_NOTNULL(mkl_node); CHECK_NOTNULL(mkl_node_output_slot); @@ -3292,8 +3282,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs( if (ArgIsList(arg)) { std::vector new_node_inputs; int N = GetTensorListLength(arg, old_node); - GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, - N, &new_node_inputs); + GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N, + &new_node_inputs); nb->Input(new_node_inputs); nn_slot_idx++; } else { @@ -3394,13 +3384,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( TensorShape dummy_shape({1}); dummy_shape.AsProto(proto.mutable_tensor_shape()); TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") - .Attr("value", proto) - .Attr("dtype", dt) - .Device(orig_node->def().device()) // We place this node on - // same the device as the - // device of the original - // node. - .Finalize(&**g, out)); + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orig_node->def().device()) // We place this node on + // same the device as the + // device of the original + // node. + .Finalize(&**g, out)); // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -3413,8 +3403,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode( // the same frame. 
if (orig_node->num_inputs() > 0) { Node* orig_input0 = nullptr; - TF_CHECK_OK(orig_node->input_node(0, - const_cast(&orig_input0))); + TF_CHECK_OK( + orig_node->input_node(0, const_cast(&orig_input0))); // Allow duplicate while adding control edge as it would fail (return // NULL) if we try to add duplicate edge. CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true)); @@ -3434,8 +3424,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); for (auto ws : wsinfo_) { if (orig_node->type_string() == ws.fwd_op && - mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName( - orig_node->type_string()), T)) { + mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { // If this op is a fwd op, then we need to check if there is an // edge from this node's fwd_slot to bwdop's bwd_slot. If there is // an edge, then we just add an attribute on this node for setting @@ -3461,8 +3451,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( nb->Attr("workspace_enabled", false); } } else if (orig_node->type_string() == ws.bwd_op && - mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName( - orig_node->type_string()), T)) { + mkl_op_registry::IsMklOp( + mkl_op_registry::GetMklOpName(orig_node->type_string()), + T)) { // If this op is a bwd op, then we need to add workspace edge and // it's Mkl tensor edge between its corresponding fwd op and this // op. Corresponding fwd op is specified in 'fwd_op' field of @@ -3477,8 +3468,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( if (e->src_output() == ws.fwd_slot && // We would have rewritten the forward op, so we need to use // GetMklOpName call to get its Mkl name. - e->src()->type_string() == mkl_op_registry::GetMklOpName( - ws.fwd_op) && + e->src()->type_string() == + mkl_op_registry::GetMklOpName(ws.fwd_op) && e->dst_input() == ws.bwd_slot) { nb->Attr("workspace_enabled", true); CHECK_NOTNULL(ws_tensors); @@ -3645,7 +3636,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node, } void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node, - NodeBuilder* nb) { + NodeBuilder* nb) { DataType T; DataType Tshape; @@ -3776,8 +3767,9 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, Node* m, Node* n) { CHECK_EQ(((m->type_string() == csinfo_.bias_add && n->type_string() == csinfo_.conv2d)) || - ((n->type_string() == csinfo_.bias_add && - m->type_string() == csinfo_.conv2d)), true); + ((n->type_string() == csinfo_.bias_add && + m->type_string() == csinfo_.conv2d)), + true); // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd, // BiasAdd is successor node, and Conv2D predecessor node. @@ -3796,8 +3788,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides)); TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred)); TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ)); - TF_CHECK_OK( - GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu)); + TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu)); // We check to ensure that data formats of both succ and pred are same. // We expect them to be same, so we can enforce this as assert. // But assert can be too strict, so we enforce this as a check. 
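The comment above expresses the same preference that motivates the costmodel.cc change earlier in this patch: conditions that depend on incoming graph data are validated and reported rather than enforced with CHECK/assert, so a malformed graph fails gracefully instead of crashing the process. A generic sketch of that pattern follows; the helper name and message are hypothetical placeholders, not code from this patch.

#include <string>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"

namespace tensorflow {

// Hypothetical helper: return an error Status instead of CHECK-failing when
// the checked condition depends on user-provided graph data.
Status CheckDataFormatsMatch(const std::string& pred_format,
                             const std::string& succ_format) {
  if (pred_format != succ_format) {
    return errors::InvalidArgument(
        "data_format of Conv2D (", pred_format,
        ") does not match data_format of BiasAdd (", succ_format, ")");
  }
  return Status::OK();
}

}  // namespace tensorflow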
@@ -3900,8 +3891,8 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, // BiasAdd has only 1 output (at slot 0) and merged node also has only 1 // output (at slot 0). const int kConv2DWithBiasOutputSlot = 0; - CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, - e->dst(), e->dst_input())); + CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, e->dst(), + e->dst_input())); } } @@ -3924,8 +3915,9 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad( std::unique_ptr* g, Node* m, Node* n) { CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad && n->type_string() == csinfo_.conv2d_grad_filter)) || - ((n->type_string() == csinfo_.bias_add_grad && - m->type_string() == csinfo_.conv2d_grad_filter)), true); + ((n->type_string() == csinfo_.bias_add_grad && + m->type_string() == csinfo_.conv2d_grad_filter)), + true); // If 'm' is BiasAddGrad, then 'n' is BackpropFilter. Node* badd = m->type_string() == csinfo_.bias_add_grad ? m : n; @@ -4132,9 +4124,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, // NULL) if we try to add duplicate edge. CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true)); } else { - CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(), - e->src()->num_outputs()), - e->dst(), e->dst_input())); + CHECK_NOTNULL((*g)->AddEdge( + new_node, + GetTensorDataIndex(e->src_output(), e->src()->num_outputs()), + e->dst(), e->dst_input())); } } @@ -4166,9 +4159,9 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // names. if (n->type_string() != csinfo_.conv2d_with_bias && n->type_string() != csinfo_.conv2d_grad_filter_with_bias && - !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName( - n->type_string()), T)) { - return nullptr; + !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), + T)) { + return nullptr; } // For elementwise node, we reuse the Eigen implementation and pass the MKL @@ -4184,29 +4177,30 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // eigen code to reduce cross-library dependency. 
VLOG(1) << "ELEMENTWISE: checking op: " << n->type_string(); if (mkl_op_registry::IsMklElementWiseOp( - mkl_op_registry::GetMklOpName(n->type_string()), T) || + mkl_op_registry::GetMklOpName(n->type_string()), T) || n->type_string().find("Identity") != string::npos) { VLOG(1) << "ELEMENTWISE: op is elementwise: " << n->type_string(); bool incoming_mkl_edge = false; int num_parent = 0; for (auto parent : n->in_edges()) { if (mkl_op_registry::IsMklOp(parent->src()->type_string(), T)) { - VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is MKL op: " - << parent->src()->type_string(); + VLOG(1) << "ELEMENTWISE: parent " << num_parent++ + << " is MKL op: " << parent->src()->type_string(); incoming_mkl_edge = true; break; } else { - VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is NON-MKL op: " - << parent->src()->type_string(); + VLOG(1) << "ELEMENTWISE: parent " << num_parent++ + << " is NON-MKL op: " << parent->src()->type_string(); } } if (incoming_mkl_edge == false) { - VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which has no MKL " + VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which " + "has no MKL " "parents."; return nullptr; } else { - VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() << - " which has MKL parents"; + VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() + << " which has MKL parents"; } } @@ -4214,8 +4208,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { // for this op, then we rewrite it to Mkl op. // Find matching RewriteInfo and then check that rewrite rule applies. for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) { - if (n->type_string().compare(ri->name) == 0 && - ri->rewrite_rule(n)) { + if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) { return &*ri; } } @@ -4297,8 +4290,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr* g) { return MklLayoutRewritePass().RunPass(g); } -Status MklLayoutRewritePass::Run( - const GraphOptimizationPassOptions& options) { +Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) { if (options.graph == nullptr && options.partition_graphs == nullptr) { return Status::OK(); } @@ -4325,7 +4317,7 @@ Status MklLayoutRewritePass::Run( return Status::OK(); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow #endif diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 75f7ca2d4d7..5e2a465e22c 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -38,7 +38,7 @@ limitations under the License. 
namespace tensorflow { -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML namespace { @@ -125,8 +125,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful(); REGISTER_OP("HalfInput").Output("o: half").SetIsStateful(); REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful(); REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful(); -REGISTER_OP("_MklInput2").Output("o: uint8") - .Output("o1: uint8").SetIsStateful(); +REGISTER_OP("_MklInput2") + .Output("o: uint8") + .Output("o1: uint8") + .SetIsStateful(); ///////////////////////////////////////////////////////////////////// // Unit tests related to node merge optiimization @@ -498,7 +500,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) { "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5"); } - // BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) { InitGraph( @@ -874,11 +875,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) { " input: ['A', 'B:0', 'B:1']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" - "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" - "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); + EXPECT_EQ( + DoMklLayoutOptimizationPass(), + "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" + "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" + "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" + "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); } // Concat with 2 Mkl layers feeding it @@ -1273,7 +1275,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) { "node { name: 'H' op: 'Input'}" "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['H', 'G'] }"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), + EXPECT_EQ( + DoMklLayoutOptimizationPass(), "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);" "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;" @@ -1640,7 +1643,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) { " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['B', 'C'] }", kGPUDevice); + " input: ['B', 'C'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1"); } @@ -1666,7 +1670,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) { "node { name: 'F' op: 'BiasAddGrad'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" - " input: ['E'] }", kGPUDevice); + " input: ['E'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);" @@ -1687,7 +1692,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) { " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'D'] }", kGPUDevice); + " input: ['A', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), 
"A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|" "A->D;A->E;B->D:1;C->D:2;D->E:1"); @@ -1700,7 +1706,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1"); } @@ -1713,7 +1720,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }" "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'C'] }", kGPUDevice); + " input: ['A', 'C'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1"); } @@ -1729,7 +1737,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } @@ -1745,7 +1754,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1"); } @@ -1766,7 +1776,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) { " attr { key: 'N' value { i: 2 } }" " input: ['A', 'B:0', 'B:1']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'D'] }", kGPUDevice); + " input: ['C', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;" "B->D:1;B:1->D:2;C->E;D->E:1"); @@ -1788,7 +1799,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) { " attr { key: 'N' value { i: 2 } }" " input: ['B:0', 'B:1', 'A']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'D'] }", kGPUDevice); + " input: ['C', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|" "A->D:2;B->D;B:1->D:1;C->E;D->E:1"); @@ -1808,7 +1820,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) { " attr { key: 'is_training' value { b: true } }" " input: ['A', 'B', 'C', 'D', 'E'] }" "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'F'] }", kGPUDevice); + " input: ['A', 'F'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);E(Input);" "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;" @@ -1837,7 +1850,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "node { name: 'Y' op: 'Input'}" "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" - " input: ['E', 'Y']}", kGPUDevice); + " input: ['E', 'Y']}", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);" "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;" @@ -1885,7 +1899,7 @@ 
BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000); } // namespace -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML namespace { @@ -1972,8 +1986,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful(); REGISTER_OP("HalfInput").Output("o: half").SetIsStateful(); REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful(); REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful(); -REGISTER_OP("_MklInput2").Output("o: uint8") - .Output("o1: uint8").SetIsStateful(); +REGISTER_OP("_MklInput2") + .Output("o: uint8") + .Output("o1: uint8") + .SetIsStateful(); ///////////////////////////////////////////////////////////////////// // Unit tests related to node merge optiimization @@ -2492,11 +2508,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) { " input: ['A', 'B:0', 'B:1']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'D'] }"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), - "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" - "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" - "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" - "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); + EXPECT_EQ( + DoMklLayoutOptimizationPass(), + "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);" + "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;" + "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;" + "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5"); } // Concat with 2 Mkl layers feeding it @@ -2891,7 +2908,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) { "node { name: 'H' op: 'Input'}" "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" " input: ['H', 'G'] }"); - EXPECT_EQ(DoMklLayoutOptimizationPass(), + EXPECT_EQ( + DoMklLayoutOptimizationPass(), "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);" "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);" "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;" @@ -3258,7 +3276,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) { " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B']}" "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['B', 'C'] }", kGPUDevice); + " input: ['B', 'C'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1"); } @@ -3284,7 +3303,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) { "node { name: 'F' op: 'BiasAddGrad'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" - " input: ['E'] }", kGPUDevice); + " input: ['E'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);" "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);" @@ -3305,7 +3325,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) { " attr { key: 'padding' value { s: 'SAME' } }" " input: ['A', 'B', 'C']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'D'] }", kGPUDevice); + " input: ['A', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|" "A->D;A->E;B->D:1;C->D:2;D->E:1"); @@ -3318,7 +3339,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A'] }" "node { name: 'C' op: 
'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1"); } @@ -3331,7 +3353,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['A', 'B'] }" "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'C'] }", kGPUDevice); + " input: ['A', 'C'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1"); } @@ -3347,7 +3370,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1"); } @@ -3363,7 +3387,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) { " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " input: ['A'] }" "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'B'] }", kGPUDevice); + " input: ['A', 'B'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1"); } @@ -3384,7 +3409,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) { " attr { key: 'N' value { i: 2 } }" " input: ['A', 'B:0', 'B:1']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'D'] }", kGPUDevice); + " input: ['C', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;" "B->D:1;B:1->D:2;C->E;D->E:1"); @@ -3406,7 +3432,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) { " attr { key: 'N' value { i: 2 } }" " input: ['B:0', 'B:1', 'A']}" "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['C', 'D'] }", kGPUDevice); + " input: ['C', 'D'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|" "A->D:2;B->D;B:1->D:1;C->E;D->E:1"); @@ -3426,7 +3453,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) { " attr { key: 'is_training' value { b: true } }" " input: ['A', 'B', 'C', 'D', 'E'] }" "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }" - " input: ['A', 'F'] }", kGPUDevice); + " input: ['A', 'F'] }", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);E(Input);" "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;" @@ -3455,7 +3483,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) { "node { name: 'Y' op: 'Input'}" "node { name: 'Z' op: 'Zeta'" " attr {key: 'T' value { type: DT_FLOAT } }" - " input: ['E', 'Y']}", kGPUDevice); + " input: ['E', 'Y']}", + kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);" "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;" @@ -3503,7 +3532,7 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000); } // namespace -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc index 599bb88f015..5343e6802d1 100644 --- 
a/tensorflow/core/graph/mkl_tfconversion_pass.cc +++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc @@ -33,8 +33,8 @@ limitations under the License. #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/graph/mkl_tfconversion_pass.h" #include "tensorflow/core/graph/mkl_graph_util.h" +#include "tensorflow/core/graph/mkl_tfconversion_pass.h" namespace tensorflow { @@ -152,12 +152,12 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge( string data_format; TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype)); - bool dst_dtype_found = GetNodeAttr(dst->def(), "T", &dst_datatype) == - Status::OK(); + bool dst_dtype_found = + GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK(); // We compare source and destination datatypes only when both are found. if (dst_dtype_found && (src_datatype != dst_datatype)) { - string err_msg = "T attribute of " + src->name() + " and " + - dst->name() + " do not match. Will not insert" + + string err_msg = "T attribute of " + src->name() + " and " + dst->name() + + " do not match. Will not insert" + " MklToTf node in such case."; return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str()); } @@ -325,12 +325,12 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr* g) { // may not be Mkl node. DataType src_datatype; DataType dst_datatype; - bool src_is_mkl_op = (GetNodeAttr(src->def(), "T", &src_datatype) == - Status::OK() && - IsMklSupportedOp(src->type_string(), src_datatype)); - bool dst_is_mkl_op = (GetNodeAttr(dst->def(), "T", &dst_datatype) == - Status::OK() && - IsMklSupportedOp(dst->type_string(), dst_datatype)); + bool src_is_mkl_op = + (GetNodeAttr(src->def(), "T", &src_datatype) == Status::OK() && + IsMklSupportedOp(src->type_string(), src_datatype)); + bool dst_is_mkl_op = + (GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK() && + IsMklSupportedOp(dst->type_string(), dst_datatype)); // Check if src with is Mkl-compliant, while dst is not Mkl-compliant. if (src_is_mkl_op && !dst_is_mkl_op) { diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc index 172471e34bc..0d88d1ff723 100644 --- a/tensorflow/core/graph/testlib.cc +++ b/tensorflow/core/graph/testlib.cc @@ -40,7 +40,7 @@ REGISTER_KERNEL_BUILDER( #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER( Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register the HostConst Op // Returns a constant tensor on the host. Useful for writing C++ tests @@ -273,6 +273,16 @@ Node* Reverse(Graph* g, Node* tensor, Node* axis) { return Binary(g, "ReverseV2", tensor, axis); } +Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Roll", g->op_registry()) + .Input(input) + .Input(shift) + .Input(axis) + .Finalize(g, &ret)); + return ret; +} + Node* Error(Graph* g, Node* input, const string& errmsg) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error") diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h index 06597778bb2..eb9038d619e 100644 --- a/tensorflow/core/graph/testlib.h +++ b/tensorflow/core/graph/testlib.h @@ -117,6 +117,10 @@ Node* RandomGamma(Graph* g, Node* shape, Node* alpha); // Output dtype determined by lam. Node* RandomPoisson(Graph* g, Node* shape, Node* lam); +// Rolls tensor by an offset of along the corresponding +// dimensions. 
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis); + // Generates random parameters from the truncated standard normal distribution // of the nput shape Node* TruncatedNormal(Graph* g, Node* input, DataType dtype); diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc index 01a618ed777..39bfca244ed 100644 --- a/tensorflow/core/grappler/clusters/cluster.cc +++ b/tensorflow/core/grappler/clusters/cluster.cc @@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) { DisableDetailedStats(false); } -Cluster::~Cluster() { -} +Cluster::~Cluster() {} void Cluster::AllowSoftPlacement(bool soft_placement_state) { options_.config.set_allow_soft_placement(soft_placement_state); diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc index 2712c5b6791..862ce4ae888 100644 --- a/tensorflow/core/grappler/clusters/single_machine.cc +++ b/tensorflow/core/grappler/clusters/single_machine.cc @@ -36,10 +36,7 @@ namespace grappler { static std::atomic already_provisioned(false); SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus) - : Cluster(timeout_s), - num_gpus_(num_gpus), - expected_init_time_s_(0), - closing_(false) { + : Cluster(timeout_s), expected_init_time_s_(0), closing_(false) { VLOG(1) << "Number of CPU cores: " << num_cpu_cores << " Number of GPUs: " << num_gpus; thread_pool_.reset(new thread::ThreadPool( diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h index a254f72f0c7..90d6a04cab6 100644 --- a/tensorflow/core/grappler/clusters/single_machine.h +++ b/tensorflow/core/grappler/clusters/single_machine.h @@ -64,7 +64,6 @@ class SingleMachine : public Cluster { Status ClearAllocatorStats() const; - const int num_gpus_; std::unique_ptr session_; std::vector queue_runner_defs_; string last_graph_id_; diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc index 1c2c1713834..f2419224711 100644 --- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc +++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc @@ -102,7 +102,7 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) { Costs summary; TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary)); - EXPECT_EQ(Costs::NanoSeconds(9150), summary.execution_time); + EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time); // Make this estimate accurate: // TODO(http://b/70031255): Accurate estimator for RandomUniform op needed diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h index b7eaf8dc637..9e01ec5ff5b 100644 --- a/tensorflow/core/grappler/costs/cost_estimator.h +++ b/tensorflow/core/grappler/costs/cost_estimator.h @@ -78,6 +78,9 @@ struct Costs { MilliSeconds asMilliSeconds() const { return std::chrono::duration_cast(*this); } + static NanoSeconds infinity() { + return NanoSeconds(std::chrono::nanoseconds::max()); + } }; // We store all our times in nanoseconds. If needs be, we can always switch to @@ -97,6 +100,8 @@ struct Costs { // requirements of a graph. For example, it might assume that all activations // are live for all of a graph's execution. int64 max_memory; // Maximum main memory requirement in bytes over all ops. + int64 persistent_memory; + int64 temporary_memory; // These fields are used for TPU-related estimations. 
They are per-op // maximums, so each op is evaluated independently, but we want the maximum of @@ -129,6 +134,8 @@ Costs::Costs() { compute_time = Duration::zero(); memory_time = Duration::zero(); max_memory = kMemoryUnknown; + persistent_memory = kMemoryUnknown; + temporary_memory = kMemoryUnknown; max_per_op_buffers = kMemoryUnknown; max_per_op_streaming = kMemoryUnknown; } @@ -139,6 +146,8 @@ Costs Costs::ZeroCosts() { costs.compute_time = Duration::zero(); costs.memory_time = Duration::zero(); costs.max_memory = kZeroMemory; + costs.persistent_memory = kZeroMemory; + costs.temporary_memory = kZeroMemory; costs.max_per_op_buffers = kZeroMemory; costs.max_per_op_streaming = kZeroMemory; return costs; diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc index 6bc136a3f89..5600267f6a2 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc @@ -47,6 +47,8 @@ constexpr char kSize[] = "Size"; constexpr char kStopGradient[] = "StopGradient"; constexpr char kPreventGradient[] = "PreventGradient"; +static const Costs::Duration kMinComputeTime(1); + namespace { string GetDataFormat(const OpInfo& op_features) { @@ -163,18 +165,20 @@ OpLevelCostEstimator::OpLevelCostEstimator() { {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)}, {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)}, - {kPlaceholder, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kRefIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kStopGradient, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kPreventGradient, wrap(&OpLevelCostEstimator::PredictNoOp)}, {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kRecv, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kSend, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kConst, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kVariable, wrap(&OpLevelCostEstimator::PredictNoOp)}, - {kVariableV2, wrap(&OpLevelCostEstimator::PredictNoOp)}, + + {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kRefIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kStopGradient, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kPreventGradient, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kReshape, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)}, + {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)}, + + {kConst, wrap(&OpLevelCostEstimator::PredictVariable)}, + {kVariable, wrap(&OpLevelCostEstimator::PredictVariable)}, + {kVariableV2, wrap(&OpLevelCostEstimator::PredictVariable)}, {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)}, {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)}, @@ -349,6 +353,9 @@ OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo( VLOG(1) << "Device: " << device.type() << " gflops: " << gflops << " gb_per_sec: " << gb_per_sec; + DCHECK_LT(0, gflops) << device.DebugString(); + DCHECK_LT(0, gb_per_sec) << device.DebugString(); + return {gflops, gb_per_sec}; } @@ -404,6 +411,7 @@ Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp( Costs OpLevelCostEstimator::PredictOpCountBasedCost( double operations, const OpInfo& op_features) const { DeviceInfo device_perf = GetDeviceInfo(op_features.device()); + Costs::NanoSeconds 
compute_cost(std::ceil(operations / device_perf.gigaops)); VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9 << " Execution Time (ns):" << compute_cost.count(); @@ -429,6 +437,7 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost( costs.execution_time = compute_cost + memory_cost; } costs.inaccurate = found_unknown_shapes; + costs.max_memory = total_output_size; return costs; } @@ -885,6 +894,30 @@ Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const { return Costs::ZeroCosts(); } +Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const { + const auto& op_features = op_context.op_info; + VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)"; + Costs result = Costs::ZeroCosts(); + result.max_memory = CalculateOutputSize(op_features, &result.inaccurate); + // Assign the minimum amount of time we can represent to the identity op since + // it tends to be really cheap. + result.compute_time = kMinComputeTime; + result.execution_time = result.compute_time; + return result; +} + +Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const { + const auto& op_features = op_context.op_info; + VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)"; + Costs result = Costs::ZeroCosts(); + result.persistent_memory = + CalculateOutputSize(op_features, &result.inaccurate); + + result.compute_time = kMinComputeTime; + result.execution_time = result.execution_time; + return result; +} + Costs OpLevelCostEstimator::PredictBatchMatMul( const OpContext& op_context) const { const auto& op_features = op_context.op_info; @@ -898,13 +931,12 @@ Costs OpLevelCostEstimator::PredictBatchMatMul( Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const { const auto& op_features = op_context.op_info; - Costs costs; + Costs costs = Costs::ZeroCosts(); costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate); // Metadata operations are so cheap we assume they take the minimum amount of // time we can represent (1 ns). - costs.execution_time = 1; - costs.compute_time = 1; - costs.memory_time = 0; + costs.compute_time = kMinComputeTime; + costs.execution_time = costs.compute_time; return costs; } diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h index 5f541ccf04d..a292e5e97fe 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h @@ -132,6 +132,8 @@ class OpLevelCostEstimator { Costs PredictConv2DBackpropFilter(const OpContext& op_context) const; Costs PredictMatMul(const OpContext& op_context) const; Costs PredictNoOp(const OpContext& op_context) const; + Costs PredictIdentity(const OpContext& op_context) const; + Costs PredictVariable(const OpContext& op_context) const; Costs PredictBatchMatMul(const OpContext& op_context) const; Costs PredictMetadata(const OpContext& op_context) const; diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index d7d07ee7a55..020492a3e9e 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -323,8 +323,13 @@ Status VirtualScheduler::Init() { } // Get the nodes that would run to output fetch_nodes. 
+ bool ill_formed = false; std::vector nodes = - ComputeTransitiveFanin(graph, fetch_nodes); + ComputeTransitiveFanin(graph, fetch_nodes, &ill_formed); + if (ill_formed) { + return errors::InvalidArgument( + "Ill formed graph or invalid set of fetch nodes specified"); + } // TODO(dyoon): this is a bit inefficient as name_to_node is already built in // ComputeTransitiveFanin(). diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h index 9db6d462667..5116c8183cb 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.h +++ b/tensorflow/core/grappler/costs/virtual_scheduler.h @@ -325,7 +325,7 @@ class VirtualScheduler { // Boolean field for whether the cost is accurate. std::map> op_costs_; - Costs graph_costs_; // Graph cost. + Costs graph_costs_; // Graph cost. std::map op_to_cost_; // Per-op cost. // Auxilliary data structures for constructing NodeState and DeviceState. diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h index f4e2de75a60..173ce9c09c2 100644 --- a/tensorflow/core/grappler/graph_view.h +++ b/tensorflow/core/grappler/graph_view.h @@ -46,6 +46,7 @@ class GraphView { }; explicit GraphView(GraphDef* graph); + GraphDef* GetGraph() const { return graph_; } NodeDef* GetNode(const string& node_name) const; // Get the specified input port. Note that the special '-1' port_id can be // used to access the controlling nodes (i.e. the nodes connected to node_name diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.h b/tensorflow/core/grappler/inputs/file_input_yielder.h index a17e1c9ff2a..b5973192610 100644 --- a/tensorflow/core/grappler/inputs/file_input_yielder.h +++ b/tensorflow/core/grappler/inputs/file_input_yielder.h @@ -18,8 +18,8 @@ limitations under the License. // that may be stored in the checkpoint are not restored in order to speedup the // initialization. -#ifndef LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ -#define LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ +#ifndef TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ +#define TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ #include #include @@ -53,4 +53,4 @@ class FileInputYielder : public InputYielder { } // end namespace grappler } // end namespace tensorflow -#endif // LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ +#endif // TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_ diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 791ad34bbed..8b9885e4c16 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -285,9 +285,11 @@ cc_library( "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", + "//tensorflow/core/grappler/clusters:virtual_cluster", "//tensorflow/core/grappler/costs:graph_memory", "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/core/grappler/utils:topological_sort", + "//tensorflow/core/grappler/utils:traversal", ], ) diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h index c5d2d47782f..8d1098d8775 100644 --- a/tensorflow/core/grappler/optimizers/auto_parallel.h +++ b/tensorflow/core/grappler/optimizers/auto_parallel.h @@ -16,8 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_ -#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/framework/variable.pb.h" +#include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc index d2da125236a..7b4ca1496cc 100644 --- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc @@ -36,20 +36,20 @@ namespace grappler { namespace { -int RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) { - int num_removed = 0; +bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) { + bool removed_input = false; int pos = 0; while (pos < node->input_size()) { if (node->input(pos) == input) { node->mutable_input()->SwapElements(pos, node->input_size() - 1); node->mutable_input()->RemoveLast(); node_map->RemoveOutput(NodeName(input), node->name()); + removed_input = true; } else { ++pos; } - ++num_removed; } - return num_removed; + return removed_input; } // Remove duplicate control inputs. @@ -71,6 +71,48 @@ void PruneControlInputs(NodeDef* node) { } // namespace +bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) { + if (!IsIdentity(node)) { + return true; + } + if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) { + return false; + } + if (!fetch_nodes_known_) { + // The output values of this node may be needed. + return false; + } + const NodeDef* input = node_map_->GetNode(NodeName(node.input(0))); + CHECK(input != nullptr) << "node = " << node.name() + << " input = " << node.input(0); + // Don't remove Identity nodes corresponding to Variable reads or following + // Recv. + if (IsVariable(*input) || IsRecv(*input)) { + return false; + } else if (IsSwitch(*input)) { + // Don't turn Identity nodes following Switch into NoOp or remove them + // if it requires anchoring a control dependencies the Switch node, which + // is not valid. + if (StringPiece(node.name()).starts_with(kConstantFoldingCtrl)) { + // TODO(rmlarsen): Try to remove this artificial contraint. + return false; + } + } + for (auto consumer : node_map_->GetOutputs(node.name())) { + if (node.input_size() > 1 && IsMerge(*consumer)) { + return false; + } + if (IsSwitch(*input)) { + for (const string& consumer_input : consumer->input()) { + if (consumer_input == AsControlDependency(node.name())) { + return false; + } + } + } + } + return true; +} + bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) { if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) { return false; @@ -100,18 +142,8 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) { return false; } - // Don't turn Identity nodes inserted by Grappler after Switch into NoOp, - // since we cannot anchor control dependencies on Switch nodes. - // Don't remove Identity nodes corresponding to Variable reads. 
- if (IsIdentity(node)) { - const NodeDef* input = node_map_->GetNode(NodeName(node.input(0))); - if (input != nullptr) { - if (IsVariable(*input) || - (StringPiece(node.name()).starts_with(kConstantFoldingCtrl) && - IsSwitch(*input))) { - return false; - } - } + if (!SafeToRemoveIdentity(node)) { + return false; } const std::unordered_set do_not_rewrite_ops{ @@ -124,19 +156,22 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) { void DependencyOptimizer::OptimizeNode(int node_idx, SetVector* nodes_to_simplify, std::set* nodes_to_delete) { + const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE; NodeDef* node = optimized_graph_->mutable_node(node_idx); - + const bool is_noop = IsNoOp(*node); + const bool is_identity = IsIdentity(*node); + const string node_name = node->name(); // Constant nodes with no input control dependency are always executed early, // so we can prune all their output control dependencies. if (IsConstant(*node) && node->input_size() == 0) { - const std::set output_nodes = node_map_->GetOutputs(node->name()); + const std::set output_nodes = node_map_->GetOutputs(node_name); for (NodeDef* fanout : output_nodes) { bool optimize_fanout = false; bool data_connection = false; for (int i = fanout->input_size() - 1; i >= 0; --i) { int pos; string input_name = ParseNodeName(fanout->input(i), &pos); - if (input_name == node->name()) { + if (input_name == node_name) { if (pos < 0) { fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1); fanout->mutable_input()->RemoveLast(); @@ -149,22 +184,21 @@ void DependencyOptimizer::OptimizeNode(int node_idx, if (optimize_fanout) { nodes_to_simplify->PushBack(node_to_idx_[fanout]); if (!data_connection) { - node_map_->RemoveOutput(node->name(), fanout->name()); + node_map_->RemoveOutput(node_name, fanout->name()); } } } - if (node_map_->GetOutputs(node->name()).empty() && fetch_nodes_known_ && - nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) { + if (node_map_->GetOutputs(node_name).empty() && fetch_nodes_known_ && + nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) { // Mark the node for deletion. nodes_to_delete->insert(node_to_idx_[node]); } - return; } // Change ops that only have control dependencies as outputs to NoOps. - if (node->op() != "NoOp" && SafeToConvertToNoOp(*node)) { - VLOG(1) << "***** Replacing " << node->name() << " (" << node->op() + if (!is_noop && SafeToConvertToNoOp(*node)) { + VLOG(1) << "***** Replacing " << node_name << " (" << node->op() << ") with NoOp."; // The outputs of this node are not consumed. Replace its inputs with // control dependencies and replace the op itself with the NoOp op. @@ -186,7 +220,7 @@ void DependencyOptimizer::OptimizeNode(int node_idx, old_input, optimized_graph_, node_map_.get()); if (ctrl_inputs.insert(ctrl_input).second) { node->set_input(pos, ctrl_input); - node_map_->UpdateInput(node->name(), old_input, ctrl_input); + node_map_->UpdateInput(node_name, old_input, ctrl_input); const NodeDef* old_input_node = node_map_->GetNode(old_input); nodes_to_simplify->PushBack(node_to_idx_[old_input_node]); } @@ -194,6 +228,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx, } node->set_op("NoOp"); node->clear_attr(); + nodes_to_simplify->PushBack(node_to_idx_[node]); + return; } // Remove NoOp nodes if the product of their fan-in and fan-out is less than @@ -222,9 +258,30 @@ void DependencyOptimizer::OptimizeNode(int node_idx, // a and x, respectively, are on the same device. 
Control edges across device // boundaries require inter-device communication (Send/Recv pairs to be // inserted in the graph), which is very costly. + // + // We also remove identity nodes, subject to the same constraints on number of + // resulting control edges and device boundary crossings: + // + // Case a) + // +----------+ ---> a +---+ ---> a + // x --> | Identity | --^> b ==> | x | --^> b + // | | ... | | ... + // +----------+ --^> c +---+ --^> c + // + // Case b) + // x ---> +----------+ ---> a x ---> +---+ + // y --^> | Identity | ==> y --^> | a | + // ... | | ... | | + // z --^> +----------+ z --^> +---+ + // + // Case c) + // +----------+ x ---> +---+ + // x ---> | Identity | ---> a ==> \--^> | a | + // y --^> | | --^> b /\ +---+ + // +----------+ y --^> b - if (node->op() == "NoOp") { - const auto& output_node_set = node_map_->GetOutputs(node->name()); + if (is_noop || (is_identity && is_aggressive)) { + const auto& output_node_set = node_map_->GetOutputs(node_name); const std::vector output_nodes(output_node_set.begin(), output_node_set.end()); const int num_outputs = output_nodes.size(); @@ -233,15 +290,14 @@ void DependencyOptimizer::OptimizeNode(int node_idx, if (num_inputs * num_outputs > num_inputs + num_outputs) { return; } - VLOG(1) << "***** Rerouting input around " << node->name(); std::vector input_nodes; for (int i = 0; i < num_inputs; ++i) { - NodeDef* tmp = node_map_->GetNode(node->input(i)); - CHECK_NE(tmp, nullptr); - input_nodes.push_back(tmp); + NodeDef* input_node = node_map_->GetNode(node->input(i)); + CHECK_NE(input_node, nullptr); + input_nodes.push_back(input_node); } - // Make sure that we don't increase the number of control edges that cross + // Make sure that we don't increase the number of edges that cross // device boundaries. if ((num_inputs == 1 && num_outputs > 1 && input_nodes[0]->device() != node->device()) || @@ -266,40 +322,75 @@ void DependencyOptimizer::OptimizeNode(int node_idx, if (num_cross_after > num_cross_before) { return; } + // To avoid potentially removing Identity nodes following _Recv nodes, + // we require that no device crossings occur in that case. + // TODO(rmlarsen): See if we can relax this condition. + if (is_identity && (num_cross_after > 0 || num_cross_before > 0)) { + return; + } } + if (is_identity && !SafeToRemoveIdentity(*node)) { + return; + } + + VLOG(1) << "***** Rerouting input around\n" << node->DebugString(); + // Now remove the node and re-wire its inputs to its outputs. for (auto consumer : output_nodes) { bool updated_consumer = false; - VLOG(1) << "***** Considering consumer " << consumer->name() << "\n" - << consumer->DebugString(); + VLOG(1) << "consumer before:\n" << consumer->DebugString(); for (int i = 0; i < num_inputs; ++i) { const NodeDef* input = input_nodes[i]; // Forward dependency from input to consumer if it doesn't already // depend on it. - if (node_map_->GetOutputs(input->name()).count(consumer) == 0) { - consumer->add_input(AsControlDependency(input->name())); + if (is_identity && i == 0) { + // Replace regular input from Identity node. 
+ bool found_input = false; + string new_input; + const string& input_to_forward = node->input(0); + CHECK(!IsControlInput(input_to_forward)); + for (int j = 0; j < consumer->input_size(); ++j) { + const string& old_input = consumer->input(j); + if (old_input == node_name) { + new_input = input_to_forward; + node_map_->UpdateInput(consumer->name(), old_input, new_input); + consumer->set_input(j, new_input); + found_input = true; + } else if (old_input == AsControlDependency(NodeName(node_name))) { + new_input = AsControlDependency(NodeName(input_to_forward)); + node_map_->UpdateInput(consumer->name(), old_input, new_input); + consumer->set_input(j, new_input); + found_input = true; + } + } + CHECK(found_input); updated_consumer = true; - node_map_->AddOutput(input->name(), consumer->name()); - nodes_to_simplify->PushBack(node_to_idx_[input]); + } else { + // Forward dependency from input to consumer if it doesn't already + // depend on it. + if (node_map_->GetOutputs(input->name()).count(consumer) == 0) { + consumer->add_input(AsControlDependency(input->name())); + node_map_->AddOutput(input->name(), consumer->name()); + nodes_to_simplify->PushBack(node_to_idx_[input]); + updated_consumer = true; + } } } // Remove dependency on node from consumer. - updated_consumer |= RemoveInput( - consumer, AsControlDependency(node->name()), node_map_.get()); + updated_consumer |= RemoveInput(consumer, AsControlDependency(node_name), + node_map_.get()); if (updated_consumer) { - VLOG(1) << "***** Updated consumer " << consumer->name() << " (" - << consumer->op() << ")"; nodes_to_simplify->PushBack(node_to_idx_[consumer]); } + VLOG(1) << "consumer after:\n" << consumer->DebugString(); } - - node_map_->RemoveOutputs(node->name()); + node_map_->RemoveOutputs(node_name); if (fetch_nodes_known_ && - nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) { + nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) { // Mark the node for deletion. nodes_to_delete->insert(node_idx); - // Unconnect the node from its inputs to enable further optimizations. - node_map_->RemoveInputs(node->name()); + // Disconnect the node from its inputs to enable further optimizations. + node_map_->RemoveInputs(node_name); node->clear_input(); } } @@ -330,13 +421,18 @@ Status DependencyOptimizer::OptimizeDependencies() { std::set nodes_to_delete; for (int i = 0; i < optimized_graph_->node_size(); ++i) { const NodeDef& node = optimized_graph_->node(i); - if (node.op() == "NoOp" || IsConstant(node) || SafeToConvertToNoOp(node)) { + if (IsNoOp(node) || IsIdentity(node) || IsConstant(node) || + SafeToConvertToNoOp(node)) { nodes_to_simplify.PushBack(i); } } while (!nodes_to_simplify.Empty()) { - OptimizeNode(nodes_to_simplify.PopBack(), &nodes_to_simplify, - &nodes_to_delete); + int node_to_simplify = nodes_to_simplify.PopBack(); + // Discard nodes that were marked for deletion already. 
+ while (nodes_to_delete.find(node_to_simplify) != nodes_to_delete.end()) { + node_to_simplify = nodes_to_simplify.PopBack(); + } + OptimizeNode(node_to_simplify, &nodes_to_simplify, &nodes_to_delete); } if (fetch_nodes_known_) { @@ -431,9 +527,10 @@ Status DependencyOptimizer::TransitiveReduction() { if (longest_distance[target] > 1) { const int input_slot = control_output.second; control_edges_to_remove[target].emplace(input_slot, source); - VLOG(1) << "Removing edge from:\n" - << optimized_graph_->node(source).DebugString() << "\n\nto:\n\n" - << optimized_graph_->node(target).DebugString(); + // VLOG(1) << "Removing edge from:\n" + // << optimized_graph_->node(source).DebugString() << + // "\n\nto:\n\n" + // << optimized_graph_->node(target).DebugString(); } } } @@ -473,8 +570,8 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, *optimized_graph_ = item.graph; nodes_to_preserve_ = item.NodesToPreserve(); fetch_nodes_known_ = !item.fetch.empty(); - CleanControlInputs(); + const int num_iterations = 2; for (int iteration = 0; iteration < num_iterations; ++iteration) { Status topo_sort_status; @@ -491,9 +588,12 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, } else { LOG(ERROR) << topo_sort_status.error_message(); } - - // Turn nodes with only control outputs into NoOps, prune NoOps. + // Turn nodes with only control outputs into NoOps, prune NoOp and Identity + // nodes. TF_RETURN_IF_ERROR(OptimizeDependencies()); + + // Dedup control inputs. + CleanControlInputs(); } return Status::OK(); diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h index 02d8a0f32a9..0f47528a043 100644 --- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h +++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h @@ -43,6 +43,9 @@ class DependencyOptimizer : public GraphOptimizer { const GraphDef& optimized_graph, double result) override; private: + // Returns true if node is not an Identity node or if it is an Identity + // that is safe to remove. + bool SafeToRemoveIdentity(const NodeDef& node); // Returns true if it is safe to convert node to NoOp. bool SafeToConvertToNoOp(const NodeDef& node); // Removes all duplicate control dependencies. 
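As a minimal illustration of the identity-removal rewrite sketched in the comments above (cases a through c), the following standalone C++ sketch shows the core input-forwarding step: a consumer's data input that named the Identity node is replaced by the Identity's own data input, and a control input on the Identity becomes a control input on the forwarded node. The helper names here are hypothetical and independent of the NodeDef/NodeMap API the patch actually uses; the real OptimizeNode additionally guards the rewrite with SafeToRemoveIdentity and the device-crossing checks shown in the diff.

#include <iostream>
#include <string>
#include <vector>

// Mirror of AsControlDependency: control inputs are spelled "^name".
std::string AsControlDependency(const std::string& name) {
  return "^" + name;
}

// Rewrites one consumer's input list after bypassing an Identity node:
// a plain input "id" becomes "x" and a control input "^id" becomes "^x",
// where x is the Identity node's forwarded data input.
void ForwardIdentityInput(const std::string& identity_name,
                          const std::string& forwarded_input,
                          std::vector<std::string>* consumer_inputs) {
  for (std::string& input : *consumer_inputs) {
    if (input == identity_name) {
      input = forwarded_input;                       // data edge (case a)
    } else if (input == AsControlDependency(identity_name)) {
      input = AsControlDependency(forwarded_input);  // control edge
    }
  }
}

int main() {
  // Consumer a_c from the RemoveIdentity test below: data input from z,
  // control input on id_a, which itself reads x.
  std::vector<std::string> inputs = {"z", "^id_a"};
  ForwardIdentityInput("id_a", "x", &inputs);
  for (const std::string& input : inputs) std::cout << input << " ";
  std::cout << "\n";  // prints: z ^x
  return 0;
}

This mirrors the expectation in the new RemoveIdentity test, where a_c ends up with inputs "z" and "^x" after id_a is removed; the sketch omits the bookkeeping (node_map_ updates, nodes_to_simplify) that the optimizer performs alongside the rewrite.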
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc index f5027a4a99e..b8facb9deae 100644 --- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc @@ -167,14 +167,16 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_SwitchIdentity) { ops::Const(scope.WithOpName("c2").WithControlDependencies(ctrl_dep_id), {1.0f, 2.0f}, {1, 2}); Output neg1 = ops::Neg(scope.WithOpName("neg1"), s.output_false); + Output neg2 = ops::Neg(scope.WithOpName("neg2"), ctrl_dep_id); GrapplerItem item; TF_CHECK_OK(scope.ToGraphDef(&item.graph)); item.fetch.push_back("c1"); item.fetch.push_back("c2"); item.fetch.push_back("neg1"); + item.fetch.push_back("neg2"); - DependencyOptimizer optimizer; + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); TF_EXPECT_OK(status); @@ -323,25 +325,148 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) { } } +TEST_F(DependencyOptimizerTest, RemoveIdentity) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT); + Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT); + Output z = ops::RandomUniform(s.WithOpName("z"), {1, 2}, DT_FLOAT); + + // Identity nodes to be removed. + // Case a) with a single input- and multiple outputs. + auto id_a = ops::Identity(s.WithOpName("id_a"), x); + // Case b) with multiple inputs and a single output. + auto id_b = ops::Identity( + s.WithOpName("id_b").WithControlDependencies(y).WithControlDependencies( + z), + x); + // Case c) with two inputs and two outputs. + auto id_c = ops::Identity(s.WithOpName("id_c").WithControlDependencies(y), x); + + // Output for Case a. + Output a_a = ops::Identity(s.WithOpName("a_a"), id_a); + Output a_b = ops::Identity(s.WithOpName("a_b"), id_a); + Output a_c = + ops::Identity(s.WithOpName("a_c").WithControlDependencies(id_a), z); + Output a_d = + ops::Identity(s.WithOpName("a_d").WithControlDependencies(id_a), z); + // Output for Case b. + Output b_a = ops::Identity(s.WithOpName("b_a"), id_b); + // Output for Case c. 
+ Output c_a = ops::Identity(s.WithOpName("c_a"), id_c); + Output c_b = + ops::Identity(s.WithOpName("c_b").WithControlDependencies(id_c), z); + + GrapplerItem item; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + item.fetch = {"a_a", "a_b", "a_c", "a_d", "b_a", "c_a", "c_b"}; + + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); + GraphDef output; + Status status = optimizer.Optimize(nullptr, item, &output); + TF_EXPECT_OK(status); + + EXPECT_EQ(item.graph.node_size() - 3, output.node_size()); + for (const NodeDef& node : output.node()) { + EXPECT_NE("id_a", node.name()); + EXPECT_NE("id_b", node.name()); + EXPECT_NE("id_c", node.name()); + if (node.name() == "a_a" || node.name() == "a_b") { + EXPECT_EQ(1, node.input_size()); + EXPECT_EQ("x", node.input(0)); + } + if (node.name() == "a_c" || node.name() == "a_d") { + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("z", node.input(0)); + EXPECT_EQ("^x", node.input(1)); + } + if (node.name() == "b_a") { + EXPECT_EQ(3, node.input_size()); + EXPECT_EQ("x", node.input(0)); + EXPECT_EQ("^y", node.input(1)); + EXPECT_EQ("^z", node.input(2)); + } + if (node.name() == "c_a") { + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("x", node.input(0)); + EXPECT_EQ("^y", node.input(1)); + } + if (node.name() == "c_b") { + EXPECT_EQ(3, node.input_size()); + EXPECT_EQ("z", node.input(0)); + EXPECT_EQ("^x", node.input(1)); + EXPECT_EQ("^y", node.input(2)); + } + } +} + +TEST_F(DependencyOptimizerTest, RemoveIdentity_RepeatedInputs) { + // Corner cases with repeated inputs. + tensorflow::Scope scope = tensorflow::Scope::NewRootScope(); + ops::Variable x(scope.WithOpName("x"), {}, DT_BOOL); + ops::Variable y(scope.WithOpName("y"), {}, DT_BOOL); + ops::Switch sw(scope.WithOpName("switch"), x, x); + // id0 should be removed. + Output id0 = ops::Identity(scope.WithOpName("id0"), sw.output_true); + // id1 should not be removed, since it would anchor a control dependency + // on the switch. + Output id1 = ops::Identity(scope.WithOpName("id1"), sw.output_false); + Output or0 = ops::LogicalOr(scope.WithOpName("or0"), id0, id0); + Output or1 = ops::LogicalOr(scope.WithOpName("or1"), id0, y); + Output or2 = ops::LogicalOr( + scope.WithOpName("or2").WithControlDependencies(id1), y, y); + + GrapplerItem item; + TF_CHECK_OK(scope.ToGraphDef(&item.graph)); + item.fetch.push_back("or0"); + item.fetch.push_back("or1"); + item.fetch.push_back("or2"); + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); + GraphDef output; + Status status = optimizer.Optimize(nullptr, item, &output); + TF_EXPECT_OK(status); + + EXPECT_EQ(item.graph.node_size() - 1, output.node_size()); + for (const NodeDef& node : output.node()) { + EXPECT_NE("id0", node.name()); + if (node.name() == "or0") { + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("switch:1", node.input(0)); + EXPECT_EQ("switch:1", node.input(1)); + } + if (node.name() == "or1") { + EXPECT_EQ(2, node.input_size()); + EXPECT_EQ("switch:1", node.input(0)); + EXPECT_EQ("y", node.input(1)); + } + if (node.name() == "or2") { + // or1 should be unchanged. 
+ EXPECT_EQ(3, node.input_size()); + EXPECT_EQ("y", node.input(0)); + EXPECT_EQ("y", node.input(1)); + EXPECT_EQ("^id1", node.input(2)); + } + } +} + TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2}); Output x = ops::Square(s.WithOpName("x"), c); - Output id1 = ops::Identity(s.WithOpName("id1"), x); - Output id2 = - ops::Identity(s.WithOpName("id2").WithControlDependencies({x}), id1); + Output neg1 = ops::Neg(s.WithOpName("neg1"), x); + Output neg2 = + ops::Neg(s.WithOpName("neg2").WithControlDependencies({x}), neg1); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); - item.fetch.push_back("id2"); - DependencyOptimizer optimizer; + item.fetch.push_back("neg2"); + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); TF_EXPECT_OK(status); EXPECT_EQ(4, output.node_size()); - EXPECT_EQ("id2", output.node(3).name()); + EXPECT_EQ("neg2", output.node(3).name()); EXPECT_EQ(1, output.node(3).input_size()); - EXPECT_EQ("id1", output.node(3).input(0)); + EXPECT_EQ("neg1", output.node(3).input(0)); } TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) { @@ -356,20 +481,21 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) { Output grappler_added_id = ops::Identity( scope.WithOpName("ConstantFoldingCtrl/switch_1"), s.output_true); Output c1 = ops::Const(scope.WithOpName("c1") - .WithControlDependencies(id0) .WithControlDependencies(id_after_var) .WithControlDependencies(grappler_added_id), {1.0f, 2.0f}, {1, 2}); Output id1 = ops::Identity(scope.WithOpName("id1"), c1); + Output id2 = ops::Identity(scope.WithOpName("id2"), id0); Output fetch = ops::Identity(scope.WithOpName("fetch").WithControlDependencies(id1), c1); GrapplerItem item; TF_CHECK_OK(scope.ToGraphDef(&item.graph)); item.fetch.push_back("c1"); + item.fetch.push_back("id2"); item.fetch.push_back("fetch"); - DependencyOptimizer optimizer; + DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE); GraphDef output; Status status = optimizer.Optimize(nullptr, item, &output); TF_EXPECT_OK(status); @@ -377,8 +503,8 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) { EXPECT_EQ(item.graph.node_size() - 2, output.node_size()); for (int i = 0; i < output.node_size(); ++i) { const NodeDef& node = output.node(i); - // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1" nor - // "id_after_var" should be eliminated. + // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1", + // "id_after_var, nor "id2"" should be eliminated. 
EXPECT_NE("id0", node.name()); EXPECT_NE("id1", node.name()); if (node.name() == "c1") { diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc index 735d78e7eee..433b3564fe5 100644 --- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc @@ -2041,17 +2041,6 @@ class DataLayoutOptimizer : GraphProcessor { const LayoutOptimizer::TuningConfig& config_; }; -int GetNumTranspose(const GraphDef& graph) { - int number = 0; - for (const auto& node : graph.node()) { - if (IsTranspose(node)) { - number++; - } - } - VLOG(1) << "Number of Transpose nodes: " << number; - return number; -} - int GetNumGPUs(const Cluster& cluster) { auto devices = cluster.GetDevices(); int num_gpus = 0; diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc index f537ecc41b9..ffa03db2623 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/graph_memory.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/graph_view.h" @@ -34,6 +35,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/static_schedule.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/grappler/utils/topological_sort.h" +#include "tensorflow/core/grappler/utils/traversal.h" #include "tensorflow/core/protobuf/rewriter_config.pb.h" namespace tensorflow { @@ -496,7 +498,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { if (!IsAddN(node)) { continue; } - // There is nothing to gain by optimizing nodes with 2 inputs of fewer. + // There is nothing to gain by optimizing nodes with 2 or fewer inputs. if (view.NumFanins(node, false) <= 2) { continue; } @@ -558,6 +560,54 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { VLOG(1) << "Missing properties for " << node->name(); continue; } + + // Compute a topological ordering for the node fanin. + std::unordered_map topo_order; + ReverseDfs(view, {node}, nullptr, + [&topo_order](NodeDef* n) { + int topo_index = topo_order.size(); + topo_order[n] = topo_index; + }, + nullptr); + + std::vector input_topo_index; + + for (int i = 0; i < node->input_size(); ++i) { + const string& input = node->input(i); + const string node_name = NodeName(input); + NodeDef* node = view.GetNode(node_name); + input_topo_index.push_back(topo_order.at(node)); + } + int min_input_topo_index = INT_MAX; + int min_input_id = -1; + for (int i = 0; i < node->input_size(); ++i) { + if (IsControlInput(node->input(i))) { + // control inputs are always last. + break; + } + const int current = input_topo_index[i]; + if (current < min_input_topo_index) { + min_input_topo_index = current; + min_input_id = i; + } + } + CHECK_LE(0, min_input_id); + std::vector pre_ctrl_deps; + std::vector post_ctrl_deps; + for (int i = node->input_size() - 1; i >= 0; --i) { + if (!IsControlInput(node->input(i))) { + // control inputs are always last. 
+ break; + } + if (input_topo_index[i] < min_input_topo_index) { + // These control dependencies can be executed before the node. + pre_ctrl_deps.push_back(node->input(i)); + } else { + // These control dependencies should be executed after the node. + post_ctrl_deps.push_back(node->input(i)); + } + } + const TensorShapeProto& shape = properties.GetOutputProperties(node->name())[0].shape(); DataType dtype = node->attr().at("T").type(); @@ -572,13 +622,19 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { *(*tmp_var->mutable_attr())["shape"].mutable_shape() = shape; (*tmp_var->mutable_attr())["var_name"].set_s(tmp_var->name()); + for (const string& ctrl_dep : pre_ctrl_deps) { + *tmp_var->add_input() = ctrl_dep; + } + *tmp_var->add_input() = + AsControlDependency(NodeName(node->input(min_input_id))); + // Initialize it to zero NodeDef* zeros = item->graph.add_node(); zeros->set_name(strings::StrCat(node->name(), "/tmp_var_zeros")); zeros->set_op("ZerosLike"); zeros->set_device(device); (*zeros->mutable_attr())["T"].set_type(dtype); - *zeros->add_input() = node->input(0); + *zeros->add_input() = node->input(min_input_id); NodeDef* initialize = item->graph.add_node(); initialize->set_name(strings::StrCat(node->name(), "/tmp_var_initializer")); @@ -592,9 +648,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { std::vector accumulates; for (int i = 0; i < node->input_size(); ++i) { const string& input = node->input(i); - if (IsControlInput(input)) { - *zeros->add_input() = input; - } else { + if (!IsControlInput(input)) { NodeDef* accumulate = item->graph.add_node(); accumulate->set_name( strings::StrCat(node->name(), "/tmp_var_accum_", i)); @@ -617,6 +671,10 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) { for (const NodeDef* accum : accumulates) { *node->add_input() = AsControlDependency(accum->name()); } + for (const string& ctrl_dep : post_ctrl_deps) { + *node->add_input() = ctrl_dep; + } + updated_graph = true; } @@ -828,8 +886,7 @@ static NodeDef* FindSwapOutTrigger( const std::unordered_set& fanout = view.GetFanout(generator); NodeDef* trigger = nullptr; - Costs::NanoSeconds earliest_fanout( - static_cast(std::numeric_limits::max() >> 2)); + Costs::NanoSeconds earliest_fanout(Costs::NanoSeconds::infinity()); for (const auto& port : fanout) { if (port.node == node) { @@ -861,6 +918,15 @@ static bool IsSwappable(GraphView::InputPort input) { return !IsRefType(dtype); } +struct MemInfo { + GraphView::OutputPort port; + int64 memory_used; + std::vector uses_left; + double fitness; + + bool operator<(const MemInfo& other) const { return fitness < other.fitness; } +}; + static bool IdentifySwappingCandidates( Cluster* cluster, GrapplerItem* item, std::unordered_set* skip_list, std::unordered_map* nodes_to_swap) { @@ -890,31 +956,56 @@ static bool IdentifySwappingCandidates( continue; } int64 required_savings = mem_usage.used_memory - prop.memory_size(); - // TODO(bsteiner): sort the tensors by how long they're live. 
- std::unordered_map execution_times; + std::unordered_map op_completion_times; { - std::unordered_map - tmp_execution_times; - if (!EstimateEarliestExecutionTimes(*item, cluster, &tmp_execution_times) - .ok()) { + VirtualCluster vcluster(cluster->GetDevices()); + if (!vcluster.Provision().ok()) { return false; } - for (const auto& exec_time : tmp_execution_times) { - execution_times.emplace(exec_time.first->name(), exec_time.second); + if (!vcluster.Initialize(*item).ok()) { + return false; + } + RunMetadata metadata; + Status s = vcluster.Run(item->graph, item->feed, item->fetch, &metadata); + if (!s.ok() && s.code() != error::RESOURCE_EXHAUSTED) { + return false; + } + + for (const auto& dev_stats : metadata.step_stats().dev_stats()) { + for (const auto& node_stats : dev_stats.node_stats()) { + Costs::NanoSeconds exec_time = + Costs::NanoSeconds(1) + + Costs::MicroSeconds(node_stats.all_start_micros() + + node_stats.op_end_rel_micros()); + op_completion_times.emplace(node_stats.node_name(), exec_time); + } } } + Costs::Duration peak_time = -1; + for (const auto& live_tensor : mem_usage.live_tensors) { + if (live_tensor.allocation_time > peak_time) { + peak_time = live_tensor.allocation_time; + } + } + + std::vector mem_state; + GraphView graph(&item->graph); for (const auto& live_tensor : mem_usage.live_tensors) { + if (live_tensor.memory_used <= 1024) { + // Don't bother with small tensors. + continue; + } if (live_tensor.deallocation_time - live_tensor.allocation_time <= Costs::Duration(1e6)) { // Not enough time to swap. VLOG(1) << "Not enough time to swap: skipping " << live_tensor.node; continue; } - if (live_tensor.memory_used <= 1024) { - // Don't bother with small tensors. + + if (skip_list->find(live_tensor.node) != skip_list->end()) { continue; } GraphView::OutputPort port = @@ -922,56 +1013,77 @@ static bool IdentifySwappingCandidates( if (!IsSwappable(graph, port)) { continue; } - Costs::NanoSeconds execution_time(-1); - GraphView::InputPort fanout_to_swap; + MemInfo mem_info; + mem_info.port = port; + mem_info.memory_used = live_tensor.memory_used; + Costs::Duration allocation_time = live_tensor.allocation_time; + Costs::Duration earliest_use(Costs::Duration::infinity()); + bool valid = true; for (GraphView::InputPort input : graph.GetFanout(port)) { - if (skip_list->find(input.node->name()) != skip_list->end()) { + // Get execution time. + auto it = op_completion_times.find(input.node->name()); + if (it == op_completion_times.end()) { + valid = false; + break; + } + if (it->second <= peak_time) { continue; } + + if (skip_list->find(input.node->name()) != skip_list->end()) { + valid = false; + break; + } string input_name = strings::StrCat(input.node->name(), ":", input.port_id); if (skip_list->find(input_name) != skip_list->end()) { - continue; - } - if (!IsSwappable(input)) { - continue; - } - auto it = execution_times.find(input.node->name()); - if (it != execution_times.end()) { - if (it->second > execution_time) { - fanout_to_swap = input; - execution_time = it->second; - } - } - } - // Annotate the fanout to request the tensor to be swapped if it's not - // already been done. 
- bool found = false; - if (!fanout_to_swap.node) { - continue; - } - auto it = fanout_to_swap.node->attr().find("_swap_to_host"); - if (it != fanout_to_swap.node->attr().end()) { - const AttrValue& val = it->second; - for (int port_id : val.list().i()) { - if (port_id == fanout_to_swap.port_id) { - found = true; - break; - } - } - } - if (!found) { - (*nodes_to_swap)[fanout_to_swap.node].inputs_to_swap.push_back( - fanout_to_swap.port_id); - required_savings -= live_tensor.memory_used; - updated_graph = true; - if (required_savings < 0) { + valid = false; break; } + if (!IsSwappable(input)) { + valid = false; + break; + } + + // Set earliest use time that's after peak. + mem_info.uses_left.emplace_back(input); + earliest_use = std::min(earliest_use, it->second); + } + if (valid && !mem_info.uses_left.empty()) { + // Compute the fitness: we need the tensor to be generated way away of + // the time of peak memory usage (to ensure there is enough time to swap + // it out). We also need to ensure it's used way after the peak time, to + // ensure that swapping the tensor back in won't recreate the memory + // bottleneck. Last but not least, we want the tensor to have as few + // remaining uses as possible. + mem_info.fitness = std::pow((earliest_use - peak_time).count(), 2); + mem_info.fitness /= std::pow(mem_info.uses_left.size(), 2); + mem_info.fitness += std::pow((allocation_time - peak_time).count(), 2); + mem_info.fitness = -mem_info.fitness; + mem_state.push_back(mem_info); + } + } + + // Sort by fitness + std::sort(mem_state.begin(), mem_state.end()); + + for (const MemInfo& mem_info : mem_state) { + for (const GraphView::InputPort fanout_to_swap : mem_info.uses_left) { + VLOG(1) << "Will swap fanout " << fanout_to_swap.node->name() << ":" + << fanout_to_swap.port_id << " of tensor " + << mem_info.port.node->name() << ":" << mem_info.port.port_id + << " of size " << mem_info.memory_used; + + (*nodes_to_swap)[fanout_to_swap.node].inputs_to_swap.push_back( + fanout_to_swap.port_id); + } + required_savings -= mem_info.memory_used; + updated_graph = true; + if (required_savings < 0) { + break; } } } - return updated_graph; } @@ -1011,7 +1123,7 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level, } for (auto& swap : nodes_to_swap) { const NodeDef* node = swap.first; - std::vector props = + const std::vector& props = properties.GetInputProperties(node->name()); SwapInfo& swap_info = swap.second; int64 bytes_to_swap = 0; diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc index dd2d20d8d68..f5d9c879926 100644 --- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc @@ -337,8 +337,9 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) { for (const auto& node : output.node()) { if (node.name() == "e") { // The d node isn't swappable. 
- EXPECT_EQ(4, node.input_size()); + EXPECT_EQ(5, node.input_size()); EXPECT_EQ("d", node.input(2)); + EXPECT_EQ("^swap_out_d_2", node.input(4)); } } } diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD index 534f7a063fe..137d51790d3 100644 --- a/tensorflow/core/grappler/utils/BUILD +++ b/tensorflow/core/grappler/utils/BUILD @@ -99,3 +99,29 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +cc_library( + name = "traversal", + srcs = ["traversal.cc"], + hdrs = ["traversal.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:graph_view", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", + ], +) + +tf_cc_test( + name = "traversal_test", + srcs = ["traversal_test.cc"], + deps = [ + ":traversal", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc new file mode 100644 index 00000000000..f44f53c4e63 --- /dev/null +++ b/tensorflow/core/grappler/utils/traversal.cc @@ -0,0 +1,80 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/grappler/utils/traversal.h" +#include "tensorflow/core/framework/node_def.pb.h" + +namespace tensorflow { +namespace grappler { + +void ReverseDfs(const GraphView& graph_view, const std::vector& from, + const std::function& pre_order, + const std::function& post_order, + const std::function& on_back_edge) { + // Stack of work to do. + struct StackElem { + NodeDef* node; + bool children_visited; + NodeDef* src; + }; + std::vector stack; + + stack.reserve(from.size()); + for (NodeDef* node : from) { + stack.push_back(StackElem{node, false}); + } + + enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 }; + std::unordered_map node_state; + while (!stack.empty()) { + StackElem w = stack.back(); + stack.pop_back(); + + if (w.children_visited) { + // We've processed all the children of this node + node_state[w.node] = DONE; + if (post_order) { + post_order(w.node); + } + continue; + } + + auto& rslt = node_state[w.node]; + if (rslt == DONE) { + continue; + } else if (rslt == VISITING) { + // Loop detected + if (on_back_edge) { + on_back_edge(w.src, w.node); + } + continue; + } + rslt = VISITING; + if (pre_order) { + pre_order(w.node); + } + + // Enqueue the node again with the children_visited flag set to true. + stack.push_back(StackElem{w.node, true, w.src}); + + // Now enqueu the node children. 
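// Re-pushing the node with children_visited set to true before pushing its
// fanins is the standard trick for an iterative DFS: the node sits beneath its
// fanins on the stack, so it is popped a second time (triggering post_order)
// only after every fanin has been fully processed.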
+ for (const auto fanin : graph_view.GetFanins(*w.node, true)) { + stack.push_back(StackElem{fanin.node, false, w.node}); + } + } +} + +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h new file mode 100644 index 00000000000..bb3fa090e8f --- /dev/null +++ b/tensorflow/core/grappler/utils/traversal.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ +#define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ + +#include +#include "tensorflow/core/grappler/graph_view.h" + +namespace tensorflow { +namespace grappler { + +// Traverse the graph in reverse dfs order, starting from the list of nodes +// specified in the 'from' argument. The pre_order and post_order functors will +// be called on each reachable node (including the 'from' nodes) in pre and post +// order. If loops are found, the on_back_edge functor will be called on the +// corresponding back edges. Moreover, the pre and post order will assume that +// these back edges will be cut. +void ReverseDfs(const GraphView& graph_view, const std::vector& from, + const std::function& pre_order, + const std::function& post_order, + const std::function& on_back_edge); + +} // namespace grappler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_ diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc new file mode 100644 index 00000000000..cc68bd1a963 --- /dev/null +++ b/tensorflow/core/grappler/utils/traversal_test.cc @@ -0,0 +1,101 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/grappler/utils/traversal.h" +//#include "tensorflow/core/framework/node_def.pb.h" +//#include "tensorflow/core/lib/core/status_test_util.h" +//#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace grappler { +namespace { + +class TraversalTest : public ::testing::Test { + protected: + static NodeDef CreateNode(const string& name, + const std::vector& inputs) { + return CreateNode(name, "", inputs); + } + static NodeDef CreateNode(const string& name, const string& op, + const std::vector& inputs) { + NodeDef node; + node.set_name(name); + if (!op.empty()) { + node.set_op(op); + } + for (const string& input : inputs) { + node.add_input(input); + } + return node; + } +}; + +TEST_F(TraversalTest, ReverseDfsNoLoop) { + GraphDef graph; + *graph.add_node() = CreateNode("2", {"5"}); + *graph.add_node() = CreateNode("0", {"5", "4"}); + *graph.add_node() = CreateNode("1", {"4", "3"}); + *graph.add_node() = CreateNode("3", {"2"}); + *graph.add_node() = CreateNode("5", {}); + *graph.add_node() = CreateNode("4", {}); + + std::vector start_nodes = {graph.mutable_node(1), + graph.mutable_node(2)}; + std::vector pre_order; + std::vector post_order; + bool found_back_edge = false; + ReverseDfs( + GraphView(&graph), start_nodes, + [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); }, + [&post_order](NodeDef* n) { post_order.push_back(n->name()); }, + [&found_back_edge](NodeDef*, NodeDef*) { found_back_edge = true; }); + + EXPECT_EQ(std::vector({"1", "4", "3", "2", "5", "0"}), pre_order); + EXPECT_EQ(std::vector({"4", "5", "2", "3", "1", "0"}), post_order); + EXPECT_FALSE(found_back_edge); +} + +TEST_F(TraversalTest, ReverseDfsWithLoop) { + GraphDef graph; + // Create a loop + *graph.add_node() = CreateNode("2", "Merge", {"1", "5"}); + *graph.add_node() = CreateNode("3", "Switch", {"2"}); + *graph.add_node() = CreateNode("4", "Identity", {"3"}); + *graph.add_node() = CreateNode("5", "NextIteration", {"4"}); + *graph.add_node() = CreateNode("1", "Enter", {}); + *graph.add_node() = CreateNode("6", "Exit", {"3"}); + + std::vector start_nodes = {graph.mutable_node(5)}; + std::vector pre_order; + std::vector post_order; + std::vector back_edges; + ReverseDfs( + GraphView(&graph), start_nodes, + [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); }, + [&post_order](NodeDef* n) { post_order.push_back(n->name()); }, + [&back_edges](NodeDef* src, NodeDef* dst) { + back_edges.push_back(strings::StrCat(src->name(), "->", dst->name())); + }); + + EXPECT_EQ(std::vector({"6", "3", "2", "1", "5", "4"}), pre_order); + EXPECT_EQ(std::vector({"1", "4", "5", "2", "3", "6"}), post_order); + EXPECT_EQ(std::vector({"4->3"}), back_edges); +} + +} // namespace +} // namespace grappler +} // namespace tensorflow diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fd99409c9b3..e7192ec42fe 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -629,6 +629,7 @@ cc_library( ":transpose_op", ":unique_op", ":unpack_op", + ":unravel_index_op", ":where_op", ], ) @@ -883,6 +884,12 @@ tf_kernel_library( deps = ARRAY_DEPS + [":split_lib"], ) +tf_kernel_library( + name = "unravel_index_op", + prefix = "unravel_index_op", + deps = ARRAY_DEPS, +) + tf_kernel_library( name = "where_op", srcs = ["where_op.cc"], @@ -2582,6 +2589,45 @@ tf_cc_tests( ], 
) +cc_library( + name = "manip", + deps = [ + ":roll_op", + ], +) + +MANIP_DEPS = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:manip_ops_op_lib", + "//third_party/eigen3", +] + +tf_kernel_library( + name = "roll_op", + prefix = "roll_op", + deps = MANIP_DEPS, +) + +tf_cc_test( + name = "roll_op_test", + size = "small", + srcs = ["roll_op_test.cc"], + deps = [ + ":ops_testutil", + ":ops_util", + ":roll_op", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + MATH_DEPS = [ ":bounds_check", ":fill_functor", diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc index 37976f71837..72155fd0373 100644 --- a/tensorflow/core/kernels/adjust_contrast_op.cc +++ b/tensorflow/core/kernels/adjust_contrast_op.cc @@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice; template class AdjustContrastOp : public OpKernel { public: - explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) { - } + explicit AdjustContrastOp(OpKernelConstruction* context) + : OpKernel(context) {} void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc index 0fc03b5a236..7522b320400 100644 --- a/tensorflow/core/kernels/adjust_contrast_op_test.cc +++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc @@ -29,8 +29,7 @@ limitations under the License. namespace tensorflow { -class AdjustContrastOpTest : public OpsTestBase { -}; +class AdjustContrastOpTest : public OpsTestBase {}; TEST_F(AdjustContrastOpTest, Simple_1113) { TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2") diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc index 4643d4e6efd..f0c6ae499d4 100644 --- a/tensorflow/core/kernels/adjust_saturation_op.cc +++ b/tensorflow/core/kernels/adjust_saturation_op.cc @@ -192,8 +192,9 @@ class AdjustSaturationOp : public AdjustSaturationOpBase { const DeviceBase::CpuWorkerThreads& worker_threads = *context->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads.num_threads, worker_threads.workers, channel_count, - kCostPerChannel, [channel_count, &input_data, &output_data, scale_h]( - int64 start_channel, int64 end_channel) { + kCostPerChannel, + [channel_count, &input_data, &output_data, scale_h]( + int64 start_channel, int64 end_channel) { const float* p = input_data.data() + start_channel * kChannelSize; float* q = output_data.data() + start_channel * kChannelSize; for (int i = start_channel; i < end_channel; i++) { diff --git a/tensorflow/core/kernels/aggregate_ops_cpu.h b/tensorflow/core/kernels/aggregate_ops_cpu.h index dfa3fe585e3..aa1cead928a 100644 --- a/tensorflow/core/kernels/aggregate_ops_cpu.h +++ b/tensorflow/core/kernels/aggregate_ops_cpu.h @@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace tensorflow { @@ -201,7 +201,7 @@ struct Add7Functor { typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7) { Add7EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7); + in7); } }; @@ -214,7 +214,7 @@ struct 
Add8Functor { typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { Add8EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8); + in7, in8); } }; @@ -227,7 +227,7 @@ struct Add8pFunctor { typename TTypes::ConstFlat in5, typename TTypes::ConstFlat in6, typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8) { Add8pEigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8); + in7, in8); } }; @@ -241,10 +241,10 @@ struct Add9Functor { typename TTypes::ConstFlat in7, typename TTypes::ConstFlat in8, typename TTypes::ConstFlat in9) { Add9EigenImpl::Compute(d, out, in1, in2, in3, in4, in5, in6, - in7, in8, in9); + in7, in8, in9); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc index cc8f122cab3..ce2fce92e4e 100644 --- a/tensorflow/core/kernels/attention_ops.cc +++ b/tensorflow/core/kernels/attention_ops.cc @@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel { const int64 batch_size = input_shape.dim_size(0); const Tensor& window_size = context->input(1); - OP_REQUIRES(context, (window_size.shape().dims() == 1) && - window_size.shape().dim_size(0) == 2, + OP_REQUIRES(context, + (window_size.shape().dims() == 1) && + window_size.shape().dim_size(0) == 2, errors::InvalidArgument( "input must be a vector of size 2 (height, width)", window_size.shape().DebugString())); diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h index dea2683184a..f5e81dbc093 100644 --- a/tensorflow/core/kernels/avgpooling_op.h +++ b/tensorflow/core/kernels/avgpooling_op.h @@ -48,9 +48,8 @@ struct SpatialAvgPooling { typedef Eigen::GpuDevice GPUDevice; -// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation -// that works NHWC data formats. -// Arguments: +// Launch a custom GPU kernels from Yanqing for the avgpooling backward +// operation that works NHWC data formats. 
Arguments: // top_diff: backprop to the output of the pooling layer // num: number of input batches // height: input height diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc index 2be330d1427..6537b42f1ed 100644 --- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc @@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads, hstart = max(hstart, 0); wstart = max(wstart, 0); int pool_size = (hend - hstart) * (wend - wstart); - gradient += - top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size); + gradient += top_diff_slice[(ph * pooled_width + pw) * channels] / + dtype(pool_size); } } bottom_diff[index] = gradient; @@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num, const GPUDevice& d) { int x_size = num * height * width * channels; CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d); - AvePoolBackwardNHWC< - T><<>>( - config.virtual_thread_count, top_diff, num, height, width, channels, - pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w, - pad_t, pad_t, bottom_diff); + AvePoolBackwardNHWC + <<>>( + config.virtual_thread_count, top_diff, num, height, width, channels, + pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w, + pad_t, pad_t, bottom_diff); return d.ok(); } diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc index d0bbea9fe27..944564dfba6 100644 --- a/tensorflow/core/kernels/barrier_ops.cc +++ b/tensorflow/core/kernels/barrier_ops.cc @@ -111,13 +111,14 @@ class Barrier : public ResourceBase { mutex_lock lock(mu_); if (closed_) { OP_REQUIRES_ASYNC( - ctx, !cancel_pending_enqueues_ && - (num_inserted == 0 || !incomplete_.empty()), + ctx, + !cancel_pending_enqueues_ && + (num_inserted == 0 || !incomplete_.empty()), errors::Cancelled( "Barrier ", name_, " is closed. Pending enqueues cancelled: ", - cancel_pending_enqueues_, ". Number of new insertions: ", - num_inserted, ". Number of incomplete keys: ", - incomplete_.size(), "."), + cancel_pending_enqueues_, + ". Number of new insertions: ", num_inserted, + ". Number of incomplete keys: ", incomplete_.size(), "."), callback); } @@ -128,9 +129,10 @@ class Barrier : public ResourceBase { for (int i = 0; i < num_inserted; ++i) { OP_REQUIRES_OK_ASYNC( - ctx, InsertOneLocked(ctx, keys, values, element_shape, - component_index, i, &ready_tuples, - &new_elements), + ctx, + InsertOneLocked(ctx, keys, values, element_shape, + component_index, i, &ready_tuples, + &new_elements), callback); } @@ -317,8 +319,9 @@ class Barrier : public ResourceBase { return errors::Cancelled( "Barrier ", name_, " is closed, but attempted to insert a brand new key: ", - keys_vec(i), ". Pending enqueues cancelled: ", - cancel_pending_enqueues_, ". Insertion index: ", i, + keys_vec(i), + ". Pending enqueues cancelled: ", cancel_pending_enqueues_, + ". Insertion index: ", i, ". 
Number of incomplete keys: ", incomplete_.size(), "."); } } else { @@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel { OP_REQUIRES_ASYNC( ctx, component_index_ < barrier->num_components(), errors::InvalidArgument("The component ID is out of range ", - component_index_, " > num_components", " (= ", - barrier->num_components(), ")"), + component_index_, " > num_components", + " (= ", barrier->num_components(), ")"), callback); OP_REQUIRES_OK_ASYNC( - ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING, - barrier->component_type(component_index_)}, - {}), + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_STRING, + barrier->component_type(component_index_)}, + {}), callback); const Tensor* keys; diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc index 5b4e1a809fa..c447db842d3 100644 --- a/tensorflow/core/kernels/batch_kernels.cc +++ b/tensorflow/core/kernels/batch_kernels.cc @@ -13,22 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ - #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" #include "tensorflow/core/kernels/batching_util/periodic_function.h" +#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h" #include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/kernels/split_lib.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/macros.h" - namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h index 93c39183198..43e716c542a 100644 --- a/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace { @@ -429,14 +429,13 @@ template struct LaunchBatchMatMul { static void Launch(OpKernelContext* context, const Tensor& in_x, const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) { - - // Number of matrix multiplies i.e. size of the batch. - const int64 batch_size = in_x.dim_size(0); - ParallelMatMulKernelSYCL::Run(context, in_x, in_y, adj_x, adj_y, out, - 0, batch_size); + // Number of matrix multiplies i.e. size of the batch. 
+ const int64 batch_size = in_x.dim_size(0); + ParallelMatMulKernelSYCL::Run(context, in_x, in_y, adj_x, adj_y, + out, 0, batch_size); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class BatchMatMul : public OpKernel { @@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel { TensorShape out_shape; for (int i = 0; i < ndims - 2; ++i) { OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i), - errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(", - i, ") must be the same: ", - in0.shape().DebugString(), " vs ", - in1.shape().DebugString())); + errors::InvalidArgument( + "In[0].dim(", i, ") and In[1].dim(", i, + ") must be the same: ", in0.shape().DebugString(), " vs ", + in1.shape().DebugString())); out_shape.AddDim(in0.dim_size(i)); } auto n = (ndims == 2) ? 1 : out_shape.num_elements(); @@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel { bool adj_y_; }; -#define REGISTER_BATCH_MATMUL_CPU(TYPE) \ +#define REGISTER_BATCH_MATMUL_CPU(TYPE) \ REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint("T"), \ BatchMatMul) -#define REGISTER_BATCH_MATMUL_GPU(TYPE) \ +#define REGISTER_BATCH_MATMUL_GPU(TYPE) \ REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint("T"), \ BatchMatMul) @@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint("T"), \ BatchMatMul) -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc index 8d155ca62b2..7e1e2aa4ec1 100644 --- a/tensorflow/core/kernels/batch_matmul_op_real.cc +++ b/tensorflow/core/kernels/batch_matmul_op_real.cc @@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU); #ifdef TENSORFLOW_USE_SYCL TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL); TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_matmul_op_test.cc b/tensorflow/core/kernels/batch_matmul_op_test.cc index 7923f34155b..c3932cd7b90 100644 --- a/tensorflow/core/kernels/batch_matmul_op_test.cc +++ b/tensorflow/core/kernels/batch_matmul_op_test.cc @@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a, /* Uncomment to enable benchmarks for double & complex types: */ // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex, DT_COMPLEX64, // gpu); -// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \ -// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, cpu); \ -// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \ +// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \ +// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, cpu); +// \ +// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \ // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex, DT_COMPLEX128, gpu); // Typical fully connected layers diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc index d3ed617f713..c34ea14bf60 100644 --- a/tensorflow/core/kernels/batch_norm_op.cc +++ b/tensorflow/core/kernels/batch_norm_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class BatchNormOp 
: public OpKernel { diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc index 5e3fcd2114a..45ddc853295 100644 --- a/tensorflow/core/kernels/batch_norm_op_test.cc +++ b/tensorflow/core/kernels/batch_norm_op_test.cc @@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) { Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2})); test::FillValues( &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f, - -33.31f, -23.85f, -34.72f, -25.85f, -36.13f }); + -33.31f, -23.85f, -34.72f, -25.85f, -36.13f}); test::ExpectTensorNear(expected, *GetOutput(0), 0.01); } diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc index 7f2df95e2d5..1a45212ad29 100644 --- a/tensorflow/core/kernels/batch_util.cc +++ b/tensorflow/core/kernels/batch_util.cc @@ -19,6 +19,8 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" +#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m) + namespace tensorflow { namespace batch_util { @@ -61,6 +63,21 @@ Status HandleElementToSlice(Tensor element, Tensor* parent, int64 index, return Status::OK(); } +template <> +Status HandleElementToSlice(Tensor element, Tensor* parent, + int64 index, bool can_move) { + auto parent_as_matrix = parent->flat_outer_dims(); + auto element_flat = element.flat(); + if (can_move) { + for (int64 i = 0; i < element.NumElements(); ++i) { + parent_as_matrix(index, i) = std::move(element_flat(i)); + } + } else { + parent_as_matrix.chip(index, 0) = element_flat; + } + return Status::OK(); +} + // TODO(jsimsa): Add HandleElementToSlice specialization that moves // the data when possible. @@ -87,7 +104,6 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) { switch (element.dtype()) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_variant(HANDLE_TYPE); #undef HANDLE_TYPE default: return errors::Unimplemented("CopyElementToSlice Unhandled data type: ", @@ -107,7 +123,6 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) { switch (parent.dtype()) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_variant(HANDLE_TYPE); #undef HANDLE_TYPE default: return errors::Unimplemented("CopySliceToElement Unhandled data type: ", @@ -115,5 +130,101 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) { } } +// The following five functions are copied from padding_fifo_queue.cc. +// TODO(mrry): Reconcile these functions with the similar methods in the +// queue implementation. +Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) { + DCHECK_NE(parent->dim_size(0), 0); + if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) { + TensorShape chip_shape = parent->shape(); + chip_shape.RemoveDim(0); + return errors::Internal( + "HandleElementToLargerSlice Cannot copy slice: number of entries in " + "element is greater than number of elements in parent slice. 
", + "Shapes are: [element]: ", element.shape().DebugString(), + ", [parent slice]: ", chip_shape.DebugString()); + } + return Status::OK(); +} + +template +Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent, + int index) { + TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent)); + if (element.NumElements() == 0) { + return Status::OK(); + } + auto element_t = element.tensor(); + auto parent_t = parent->tensor(); + Eigen::DSizes slice_indices; + slice_indices[0] = index; + Eigen::DSizes slice_size; + slice_size[0] = 1; + for (size_t i = 1; i < slice_size.size(); ++i) { + slice_size[i] = element_t.dimension(i - 1); + } + parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size); + return Status::OK(); +} + +template +Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent, + int index) { +#define HANDLE_TYPE(T) \ + case DataTypeToEnum::value: { \ + return HandleElementToLargerSlice(element, parent, index); \ + } + + switch (element.dtype()) { + TF_CALL_DATASET_TYPES(HANDLE_TYPE); +#undef HANDLE_TYPE + default: + return errors::Unimplemented( + "HandleElementToLargerSliceWithRank Unhandled data type: ", + element.dtype()); + } +} + +Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent, + int index) { + if (parent->dims() != element.dims() + 1) { + return errors::Internal( + "Mismatched ranks. Element's rank is: ", element.dims(), + " but element is meant to be a slice in output Tensor having rank: ", + parent->dims(), " (should be: ", element.dims() + 1, ")"); + } + +#define HANDLE_DIMS(NDIMS) \ + case NDIMS: { \ + TF_RETURN_IF_ERROR( \ + HandleElementToLargerSliceWithRank(element, parent, index)); \ + return Status::OK(); \ + } + + switch (element.dims()) { + HANDLE_DIMS(0); + HANDLE_DIMS(1); + HANDLE_DIMS(2); + HANDLE_DIMS(3); + HANDLE_DIMS(4); +#undef HANDLE_DIMS + default: + return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ", + element.dims()); + } +} + +Status SetElementZero(Tensor* element, const Tensor& padding) { +#define HANDLE_TYPE(T) \ + if (element->dtype() == DataTypeToEnum::value) { \ + element->flat().setConstant(padding.scalar()()); \ + return Status::OK(); \ + } + TF_CALL_DATASET_TYPES(HANDLE_TYPE); +#undef HANDLE_TYPE + return errors::Unimplemented("SetElementZero Unhandled data type: ", + element->dtype()); +} + } // namespace batch_util } // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h index 0d634ae7b07..a47bf1935db 100644 --- a/tensorflow/core/kernels/batch_util.h +++ b/tensorflow/core/kernels/batch_util.h @@ -32,6 +32,16 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index); // Copies the index^th slice of parent (in the 0th dimension) into element. Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index); +// Zero-initializes the tensor `element` using the scalar stored in `padding`. +// Both `element` and `padding` must have matching `dtype`. +Status SetElementZero(Tensor* element, const Tensor& padding); + +// Copies `element` into a (0th dimension) slice of `parent`, assuming +// the shape of `element` is strictly not larger along any axis than a +// slice. 
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent, + int index); + } // namespace batch_util } // namespace tensorflow diff --git a/tensorflow/core/kernels/batching_util/periodic_function.h b/tensorflow/core/kernels/batching_util/periodic_function.h index dbf1733dcc3..36a4019002a 100644 --- a/tensorflow/core/kernels/batching_util/periodic_function.h +++ b/tensorflow/core/kernels/batching_util/periodic_function.h @@ -114,7 +114,7 @@ class PeriodicFunction { void RunLoop(int64 start) LOCKS_EXCLUDED(mutex_); const std::function function_; // Actual client function - const int64 interval_micros_; // Interval between calls. + const int64 interval_micros_; // Interval between calls. const Options options_; // Protects state below. diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc index d73dcf0fa0e..d5ea2b648f3 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc @@ -55,15 +55,14 @@ Status ScheduleTask(size_t task_size, BatchScheduler* scheduler) { // use the clock to be destroyed. std::unique_ptr CreateFakeClockAdvancerThread( test_util::FakeClockEnv* env, Notification* start, Notification* stop) { - return std::unique_ptr( - Env::Default()->StartThread({}, "FakeClockAdvancerThread", - [env, start, stop] { - start->WaitForNotification(); - while (!stop->HasBeenNotified()) { - env->AdvanceByMicroseconds(10); - Env::Default()->SleepForMicroseconds(10); - } - })); + return std::unique_ptr(Env::Default()->StartThread( + {}, "FakeClockAdvancerThread", [env, start, stop] { + start->WaitForNotification(); + while (!stop->HasBeenNotified()) { + env->AdvanceByMicroseconds(10); + Env::Default()->SleepForMicroseconds(10); + } + })); } TEST(SharedBatchSchedulerTest, Basic) { @@ -258,7 +257,7 @@ TEST(SharedBatchSchedulerTest, ObeysTimeout) { TEST(SharedBatchSchedulerTest, ObeysTimeoutWithRealClock) { Notification first_batch_processed, second_batch_processed; auto callback = [&first_batch_processed, &second_batch_processed]( - std::unique_ptr> batch) { + std::unique_ptr> batch) { ASSERT_TRUE(batch->IsClosed()); if (batch->size() == 1) { first_batch_processed.Notify(); @@ -301,7 +300,7 @@ TEST(SharedBatchSchedulerTest, { Notification first_batch_processed, second_batch_processed; auto callback = [&first_batch_processed, &second_batch_processed]( - std::unique_ptr> batch) { + std::unique_ptr> batch) { ASSERT_TRUE(batch->IsClosed()); if (batch->size() == 1) { first_batch_processed.Notify(); @@ -349,7 +348,7 @@ TEST(SharedBatchSchedulerTest, Fairness) { auto queue_0_callback = [&queue_0_first_batch_scheduled, &queue_0_first_batch_proceed, &queue_0_second_batch_scheduled]( - std::unique_ptr> batch) { + std::unique_ptr> batch) { if (!queue_0_first_batch_scheduled.HasBeenNotified()) { queue_0_first_batch_scheduled.Notify(); queue_0_first_batch_proceed.WaitForNotification(); @@ -467,7 +466,7 @@ TEST(SharedBatchSchedulerTest, ConstMethods) { TEST(SharedBatchSchedulerTest, OneFullQueueDoesntBlockOtherQueues) { Notification queue_0_processing, queue_0_proceed; auto queue_0_callback = [&queue_0_processing, &queue_0_proceed]( - std::unique_ptr> batch) { + std::unique_ptr> batch) { if (!queue_0_processing.HasBeenNotified()) { queue_0_processing.Notify(); queue_0_proceed.WaitForNotification(); diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc index 
c1c0d6d3292..b07c5fd718d 100644 --- a/tensorflow/core/kernels/batchtospace_op.cc +++ b/tensorflow/core/kernels/batchtospace_op.cc @@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context, errors::InvalidArgument("input rank should be >= ", 1 + block_dims, " instead of ", orig_input_tensor.dims())); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) && - block_dims == orig_crops.dim_size(0) && - 2 == orig_crops.dim_size(1), + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(orig_crops.shape()) && + block_dims == orig_crops.dim_size(0) && + 2 == orig_crops.dim_size(1), errors::InvalidArgument("crops should have shape [", block_dims, ", 2] instead of ", orig_crops.shape().DebugString())); diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc index 7fc4b1762d0..8e4f08e4730 100644 --- a/tensorflow/core/kernels/bcast_ops.cc +++ b/tensorflow/core/kernels/bcast_ops.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/util/bcast.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/bcast.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc index 2ca194a77f2..754b93b073a 100644 --- a/tensorflow/core/kernels/bias_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc @@ -77,14 +77,14 @@ void BiasGPU::compute(const GPUDevice& d, const T* input, const T* bias, } CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); if (data_format == FORMAT_NHWC) { - BiasNHWCKernel< - T><<>>( - config.virtual_thread_count, input, bias, output, bias_size); + BiasNHWCKernel + <<>>( + config.virtual_thread_count, input, bias, output, bias_size); } else { - BiasNCHWKernel< - T><<>>( - config.virtual_thread_count, input, bias, output, bias_size, - image_size); + BiasNCHWKernel + <<>>( + config.virtual_thread_count, input, bias, output, bias_size, + image_size); } } @@ -206,10 +206,10 @@ void BiasGradGPU::compute(const GPUDevice& d, const T* output_backprop, // Check if we have enough shared memory. if (shared_memory_size <= max_shared_memory_size) { if (data_format == FORMAT_NHWC) { - BiasGradNHWC_SharedAtomics< - T><<>>(total_count, output_backprop, bias_backprop, - bias_size); + BiasGradNHWC_SharedAtomics + <<>>(total_count, output_backprop, bias_backprop, + bias_size); } else { // Round up the block count to multiple of bias_size. int group_size = (config.block_count + bias_size - 1) / bias_size; @@ -217,23 +217,24 @@ void BiasGradGPU::compute(const GPUDevice& d, const T* output_backprop, if (config.thread_per_block < kWarpSize) { config.thread_per_block = kWarpSize; } - BiasGradNCHW_SharedAtomics< - T><<>>( - output_backprop, bias_backprop, batch, bias_size, image_size, - group_size); + BiasGradNCHW_SharedAtomics + <<>>( + output_backprop, bias_backprop, batch, bias_size, image_size, + group_size); } } else { // Note that even if we don't have enough shared memory to fit the entire // output block, it is possible to process one group of elements at a time. // But for now, we simply fall back to the naive implementation. 
if (data_format == FORMAT_NHWC) { - BiasGradNHWC_Naive< - T><<>>( - total_count, output_backprop, bias_backprop, bias_size); + BiasGradNHWC_Naive + <<>>( + total_count, output_backprop, bias_backprop, bias_size); } else { - BiasGradNCHW_Naive< - T><<>>( - total_count, output_backprop, bias_backprop, bias_size, image_size); + BiasGradNCHW_Naive + <<>>( + total_count, output_backprop, bias_backprop, bias_size, + image_size); } } } diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h index e35f42ad417..c8c60c55241 100644 --- a/tensorflow/core/kernels/bounds_check.h +++ b/tensorflow/core/kernels/bounds_check.h @@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) { auto *to_x = reinterpret_cast(&x); return *to_x; } -} // namespace tensorflow::internal +} // namespace internal } // namespace tensorflow #endif // TENSORFLOW_UTIL_BOUNDS_CHECK_H_ diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc index e937c4f11ba..654d99301af 100644 --- a/tensorflow/core/kernels/candidate_sampler_ops.cc +++ b/tensorflow/core/kernels/candidate_sampler_ops.cc @@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU), REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU), SimpleCandidateSamplerOp); -REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler") - .Device(DEVICE_CPU), - SimpleCandidateSamplerOp); +REGISTER_KERNEL_BUILDER( + Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU), + SimpleCandidateSamplerOp); -REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler") - .Device(DEVICE_CPU), - SimpleCandidateSamplerOp); +REGISTER_KERNEL_BUILDER( + Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU), + SimpleCandidateSamplerOp); class AllCandidateSamplerOp : public BaseCandidateSamplerOp { public: @@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& in_true_candidates = context->input(0); const TensorShape& in_true_candidates_shape = in_true_candidates.shape(); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) && - in_true_candidates_shape.dim_size(1) == num_true_, + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(in_true_candidates_shape) && + in_true_candidates_shape.dim_size(1) == num_true_, errors::InvalidArgument( "true_candidates must be a batch_size * num_true matrix")); diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc index f16abb2b79f..626db9131ae 100644 --- a/tensorflow/core/kernels/cast_op.cc +++ b/tensorflow/core/kernels/cast_op.cc @@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define CURRY_TYPES2(FN, arg0) \ FN(arg0, bool); \ @@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase { } }; -#define REGISTER_CAST_SYCL(srctype, dsttype) \ - REGISTER_KERNEL_BUILDER(Name("Cast") \ - .TypeConstraint("SrcT") \ - .TypeConstraint("DstT") \ - .Device(DEVICE_SYCL), \ +#define REGISTER_CAST_SYCL(srctype, dsttype) \ + REGISTER_KERNEL_BUILDER(Name("Cast") \ + .TypeConstraint("SrcT") \ + .TypeConstraint("DstT") \ + .Device(DEVICE_SYCL), \ SyclCastOp) CURRY_TYPES2(REGISTER_CAST_SYCL, bool); CURRY_TYPES2(REGISTER_CAST_SYCL, int32); @@ -237,7 +237,7 @@ 
CURRY_TYPES2(REGISTER_CAST_SYCL, double); #undef REGISTER_CAST_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef CURRY_TYPES2 @@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"), CpuCastOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h index 8fedf2c271c..fd4e75d26f0 100644 --- a/tensorflow/core/kernels/cast_op.h +++ b/tensorflow/core/kernels/cast_op.h @@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> { p[0] = a.value; p[1] = 0; #else - static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!"); + static_assert(::tensorflow::port::kLittleEndian, + "Not a little endian system!"); p[0] = 0; p[1] = a.value; #endif diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h index 470e9e08041..3ae9f2ab4d9 100644 --- a/tensorflow/core/kernels/cast_op_impl.h +++ b/tensorflow/core/kernels/cast_op_impl.h @@ -41,25 +41,25 @@ struct CastFunctor { o.device(d) = i.template cast(); } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor -#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ - FN(arg0, arg1, bool); \ - FN(arg0, arg1, uint8); \ - FN(arg0, arg1, int8); \ - FN(arg0, arg1, uint16); \ - FN(arg0, arg1, int16); \ - FN(arg0, arg1, int32); \ - FN(arg0, arg1, int64); \ - FN(arg0, arg1, float); \ - FN(arg0, arg1, double); \ - FN(arg0, arg1, std::complex); \ +#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ + FN(arg0, arg1, bool); \ + FN(arg0, arg1, uint8); \ + FN(arg0, arg1, int8); \ + FN(arg0, arg1, uint16); \ + FN(arg0, arg1, int16); \ + FN(arg0, arg1, int32); \ + FN(arg0, arg1, int64); \ + FN(arg0, arg1, float); \ + FN(arg0, arg1, double); \ + FN(arg0, arg1, std::complex); \ FN(arg0, arg1, std::complex) -#define CURRY_TYPES3(FN, arg0, arg1) \ - CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ +#define CURRY_TYPES3(FN, arg0, arg1) \ + CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \ FN(arg0, arg1, Eigen::half); #define CAST_CASE(DEVICE, IN, OUT) \ diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc index a106f287c18..057e209a719 100644 --- a/tensorflow/core/kernels/cast_op_test.cc +++ b/tensorflow/core/kernels/cast_op_test.cc @@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL test::Benchmark("sycl", Cast(num)).Run(iters); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); @@ -130,10 +130,10 @@ static void BM_gpu_bool_float(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL test::Benchmark("sycl", Cast(num)).Run(iters); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); @@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA } BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20); @@ -191,7 +191,7 @@ static void BM_gpu_half_float(int 
iters, int num) { testing::UseRealTime(); #if GOOGLE_CUDA test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA } BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20); diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc index ba100b32e7d..9cc2e67bbe1 100644 --- a/tensorflow/core/kernels/colorspace_op.cc +++ b/tensorflow/core/kernels/colorspace_op.cc @@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel { } }; -#define REGISTER_CPU(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - template class RGBToHSVOp; \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - HSVToRGBOp); \ +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint("T"), \ + RGBToHSVOp); \ + template class RGBToHSVOp; \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint("T"), \ + HSVToRGBOp); \ template class HSVToRGBOp; TF_CALL_float(REGISTER_CPU); TF_CALL_double(REGISTER_CPU); @@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU); // Forward declarations of the function specializations for GPU (to prevent // building the GPU versions here, they will be built compiling _gpu.cu.cc). namespace functor { -#define DECLARE_GPU(T) \ - template <> \ - void RGBToHSV::operator()(const GPUDevice& d, \ - TTypes::ConstTensor input_data, \ - TTypes::Tensor range, \ - TTypes::Tensor output_data); \ - extern template struct RGBToHSV; \ - template <> \ - void HSVToRGB::operator()(const GPUDevice& d, \ - TTypes::ConstTensor input_data, \ - TTypes::Tensor output_data); \ +#define DECLARE_GPU(T) \ + template <> \ + void RGBToHSV::operator()( \ + const GPUDevice& d, TTypes::ConstTensor input_data, \ + TTypes::Tensor range, TTypes::Tensor output_data); \ + extern template struct RGBToHSV; \ + template <> \ + void HSVToRGB::operator()( \ + const GPUDevice& d, TTypes::ConstTensor input_data, \ + TTypes::Tensor output_data); \ extern template struct HSVToRGB; TF_CALL_float(DECLARE_GPU); TF_CALL_double(DECLARE_GPU); } // namespace functor -#define REGISTER_GPU(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \ - .TypeConstraint("T"), \ - HSVToRGBOp); +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint("T"), \ + RGBToHSVOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint("T"), \ + HSVToRGBOp); TF_CALL_float(REGISTER_GPU); TF_CALL_double(REGISTER_GPU); #endif #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL(T) \ - REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - RGBToHSVOp); \ - REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - HSVToRGBOp); +#define REGISTER_SYCL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint("T"), \ + RGBToHSVOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint("T"), \ + HSVToRGBOp); TF_CALL_float(REGISTER_SYCL); TF_CALL_double(REGISTER_SYCL); #endif diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h index c5721ef6dd0..90bfce14194 100644 --- a/tensorflow/core/kernels/colorspace_op.h +++ b/tensorflow/core/kernels/colorspace_op.h @@ -54,10 +54,9 
@@ struct RGBToHSV { // TODO(wicke): all these assignments are only necessary because a combined // expression is larger than kernel parameter space. A custom kernel is // probably in order. - H.device(d) = (R == V).select(norm * (G - B), - (G == V).select( - norm * (B - R) + T(2) / T(6), - norm * (R - G) + T(4) / T(6))); + H.device(d) = (R == V).select( + norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6), + norm * (R - G) + T(4) / T(6))); H.device(d) = (range > T(0)).select(H, H.constant(T(0))); H.device(d) = (H < T(0)).select(H + T(1), H); } diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc index e19d0b14d5d..61f9ba44c46 100644 --- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc +++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/colorspace_op.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/colorspace_op.h" namespace tensorflow { @@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice; template class functor::HSVToRGB; TF_CALL_float(INSTANTIATE_GPU); TF_CALL_double(INSTANTIATE_GPU); -} +} // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc index 8c6fb732abf..bd82826770f 100644 --- a/tensorflow/core/kernels/colorspace_op_test.cc +++ b/tensorflow/core/kernels/colorspace_op_test.cc @@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase { } }; -#define TEST_COLORSPACE(test, dt) \ - TEST_F(test, CheckBlack) { \ - MakeOp(dt); \ - CheckBlack(dt); \ - } \ - TEST_F(test, CheckGray) { \ - MakeOp(dt); \ - CheckGray(dt); \ - } \ - TEST_F(test, CheckWhite) { \ - MakeOp(dt); \ - CheckWhite(dt); \ - } \ - TEST_F(test, CheckRedMax) { \ - MakeOp(dt); \ - CheckRedMax(dt); \ - } \ - TEST_F(test, CheckGreenMax) { \ - MakeOp(dt); \ - CheckGreenMax(dt); \ - } \ - TEST_F(test, CheckBlueMax) { \ - MakeOp(dt); \ - CheckBlueMax(dt); \ - } \ - TEST_F(test, CheckNegativeDifference) { \ - MakeOp(dt); \ - CheckNegativeDifference(dt); \ +#define TEST_COLORSPACE(test, dt) \ + TEST_F(test, CheckBlack) { \ + MakeOp(dt); \ + CheckBlack(dt); \ + } \ + TEST_F(test, CheckGray) { \ + MakeOp(dt); \ + CheckGray(dt); \ + } \ + TEST_F(test, CheckWhite) { \ + MakeOp(dt); \ + CheckWhite(dt); \ + } \ + TEST_F(test, CheckRedMax) { \ + MakeOp(dt); \ + CheckRedMax(dt); \ + } \ + TEST_F(test, CheckGreenMax) { \ + MakeOp(dt); \ + CheckGreenMax(dt); \ + } \ + TEST_F(test, CheckBlueMax) { \ + MakeOp(dt); \ + CheckBlueMax(dt); \ + } \ + TEST_F(test, CheckNegativeDifference) { \ + MakeOp(dt); \ + CheckNegativeDifference(dt); \ } typedef RGBToHSVOpTest rgb_to_hsv_float; diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc index 9f626a274a4..39e4f24ed51 100644 --- a/tensorflow/core/kernels/compare_and_bitpack_op.cc +++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc @@ -110,7 +110,20 @@ struct ComputeShard::ConstMatrix input, typename TTypes::Matrix output, bool /*thresh*/, int64 start, int64 limit) { - // NOTE(ebrevdo): This assumes memory is little-endian. 
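// Each of the eight input bytes holds a 0/1 comparison result, and the output
// byte packs them MSB first (element 0 lands in bit 7). The big-endian branch
// added below differs from the little-endian one only in which byte of the
// 64-bit `block` each element occupies, so every mask/shift pair moves that
// byte's low bit to the same output bit position.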
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (int64 i = start; i < limit; ++i) { + uint8* out = output.data() + i; + const int64 block = *reinterpret_cast(input.data() + 8 * i); + *out = + ((((block & (1LL << (7 * 8))) >> (7 * 8 - 7))) | + (((block & (1LL << (6 * 8))) >> (6 * 8 - 6))) | + (((block & (1LL << (5 * 8))) >> (5 * 8 - 5))) | + (((block & (1LL << (4 * 8))) >> (4 * 8 - 4))) | + (((block & (1LL << (3 * 8))) >> (3 * 8 - 3))) | + (((block & (1LL << (2 * 8))) >> (2 * 8 - 2))) | + (((block & (1LL << 8)) >> (1 * 8 - 1))) | (((block & (1LL))))); + } +#else for (int64 i = start; i < limit; ++i) { uint8* out = output.data() + i; const int64 block = *reinterpret_cast(input.data() + 8 * i); @@ -123,6 +136,7 @@ struct ComputeShard> (2 * 8 - 5))) | (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7))); } +#endif } }; diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h index 526f9420d72..16784c4770e 100644 --- a/tensorflow/core/kernels/concat_lib.h +++ b/tensorflow/core/kernels/concat_lib.h @@ -41,10 +41,11 @@ namespace tensorflow { // Assumes all inputs are nonempty template -void ConcatCPU(DeviceBase* d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output); +void ConcatCPU( + DeviceBase* d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output); #if GOOGLE_CUDA template void ConcatGPU( @@ -57,11 +58,12 @@ void ConcatGPU( #ifdef TENSORFLOW_USE_SYCL template -void ConcatSYCL(const Eigen::SyclDevice& d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output); -#endif // TENSORFLOW_USE_SYCL +void ConcatSYCL( + const Eigen::SyclDevice& d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output); +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONCAT_LIB_H_ diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc index 43731114c0b..547a7b40b92 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.cc +++ b/tensorflow/core/kernels/concat_lib_cpu.cc @@ -48,10 +48,11 @@ struct MemCpyCopier { } // namespace template -void ConcatCPU(DeviceBase* d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output) { +void ConcatCPU( + DeviceBase* d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output) { if (std::is_same::value) { // use a large cost here to force strings to be handled by separate threads ConcatCPUImpl(d, inputs, 100000, MemCpyCopier(), output); @@ -72,7 +73,6 @@ REGISTER(qint8) REGISTER(quint16) REGISTER(qint16) REGISTER(qint32) -TF_CALL_variant(REGISTER) #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \ !defined(__ANDROID_TYPES_FULL__) @@ -86,21 +86,22 @@ TF_CALL_variant(REGISTER) #ifdef TENSORFLOW_USE_SYCL template -void ConcatSYCL(const Eigen::SyclDevice& d, - const std::vector< - std::unique_ptr::ConstMatrix>>& inputs, - typename TTypes::Matrix* output) { +void ConcatSYCL( + const Eigen::SyclDevice& d, + const std::vector::ConstMatrix>>& + inputs, + typename TTypes::Matrix* output) { ConcatSYCLImpl(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier(), - output); + output); } -#define REGISTER_SYCL(T) \ - template void ConcatSYCL( \ - const Eigen::SyclDevice&, \ - const std::vector::ConstMatrix>>&, \ - typename TTypes::Matrix* output); +#define REGISTER_SYCL(T) \ + template void ConcatSYCL( \ + const Eigen::SyclDevice&, \ + 
const std::vector::ConstMatrix>>&, \ + typename TTypes::Matrix* output); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL) #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h index 6a933efde4b..720b5065377 100644 --- a/tensorflow/core/kernels/concat_lib_cpu.h +++ b/tensorflow/core/kernels/concat_lib_cpu.h @@ -15,9 +15,9 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/concat_lib.h" #include #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/concat_lib.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -73,7 +73,7 @@ void ConcatCPUImpl( // Sharded mode. auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs]( - int64 start, int64 end) { + int64 start, int64 end) { int64 skipped_rows = start / row_size; T* out = output->data() + skipped_rows * row_size; T* out_start = output->data() + start; @@ -160,5 +160,5 @@ void ConcatSYCLImpl( } } } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index ae1b5da32ea..7011550f7e1 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice; #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; @@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel { const TensorShape& input_shape = values[0].shape(); int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim; - OP_REQUIRES(c, (0 <= axis && axis < input_dims) || - (allow_legacy_scalars() && concat_dim == 0), + OP_REQUIRES(c, + (0 <= axis && axis < input_dims) || + (allow_legacy_scalars() && concat_dim == 0), errors::InvalidArgument( "ConcatOp : Expected concatenating dimensions in the range " "[", @@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel { c, in.dims() == input_dims || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. shape[", i, + "] = ", in.shape().DebugString())); for (int j = 0; j < input_dims; ++j) { if (j == axis) { continue; @@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel { c, in.dim_size(j) == input_shape.dim_size(j), errors::InvalidArgument( "ConcatOp : Dimensions of inputs should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. 
shape[", i, + "] = ", in.shape().DebugString())); } if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; @@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel { ConcatSYCL(c->eigen_sycl_device(), inputs_flat, &output_flat); return; } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL ConcatCPU(c->device(), inputs_flat, &output_flat); } } @@ -252,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2") ConcatV2Op); #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class ConcatOffsetOp : public OpKernel { public: @@ -347,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset") .HostMemory("shape") .HostMemory("offset"), ConcatOffsetOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc index c5bded9dafc..e3ba8ae9f69 100644 --- a/tensorflow/core/kernels/concat_op_test.cc +++ b/tensorflow/core/kernels/concat_op_test.cc @@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000); BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); typedef Eigen::TensorMap, - Eigen::Unaligned> EigenMap; + Eigen::Unaligned> + EigenMap; static void MemcpyManyAlternative1(int iters, int dim2) { testing::StopTiming(); diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h index 794ac6fa6de..c7c7c983691 100644 --- a/tensorflow/core/kernels/conditional_accumulator_base.h +++ b/tensorflow/core/kernels/conditional_accumulator_base.h @@ -160,7 +160,7 @@ class ConditionalAccumulatorBase : public ResourceBase { * Modifications to convenience macros defined in core/framework/op_kernel.h. * The below macros return a boolean if the test fails, so that the calling * function can get an indication that a failure has occurred. 
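 * For example (an illustrative helper, not one defined in this file), a
 * validation routine that participates in this scheme would itself return a
 * bool and use the macro like so:
 *   bool CheckIsVector(OpKernelContext* ctx, const Tensor& t) {
 *     OP_REQUIRES_BOOLEAN(ctx, TensorShapeUtils::IsVector(t.shape()),
 *                         errors::InvalidArgument("expected a vector"));
 *     return true;  // only reached when the check above passed
 *   }
 * On failure the macro records the error status on the context and returns
 * false from the enclosing function, so the caller can stop processing
 * immediately.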
-*/ + */ #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \ do { \ if (!TF_PREDICT_TRUE(EXP)) { \ diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc index fa37916eaba..e13bf8a4c63 100644 --- a/tensorflow/core/kernels/conditional_accumulator_op.cc +++ b/tensorflow/core/kernels/conditional_accumulator_op.cc @@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp ConditionalAccumulatorBase* accumulator, DoneCallback callback) override { // Check signature - OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, - {accumulator->dtype()}), - callback); + OP_REQUIRES_OK_ASYNC( + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}), + callback); } private: @@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU), AccumulatorTakeGradientOp); - } // namespace tensorflow diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 59f9f69315e..920cd87858a 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -146,7 +146,6 @@ typedef Eigen::GpuDevice GPUDevice; typedef Eigen::SyclDevice SYCLDevice; #endif // TENSORFLOW_USE_SYCL - template class FillOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 8fe82d118a7..7d5d54e5bec 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string); #undef REGISTER_GPU_HOST_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_SWITCH(type) \ - REGISTER_KERNEL_BUILDER(Name("Switch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("pred") \ - .TypeConstraint("T"),\ +#define REGISTER_SYCL_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("pred") \ + .TypeConstraint("T"), \ SwitchOp) TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH); -#define REGISTER_SYCL_REF_SWITCH(type) \ - REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("pred") \ - .TypeConstraint("T"), \ +#define REGISTER_SYCL_REF_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("pred") \ + .TypeConstraint("T"), \ SwitchOp) TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH); #undef REGISTER_SYCL_SWITCH #undef REGISTER_SYCL_REF_SWITCH -#define REGISTER_SYCL_HOST_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("Switch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("pred") \ - .HostMemory("output_false")\ - .HostMemory("output_true") \ - .TypeConstraint("T"),\ +#define REGISTER_SYCL_HOST_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + .HostMemory("pred") \ + .HostMemory("output_false") \ + .HostMemory("output_true") \ + .TypeConstraint("T"), \ SwitchOp) REGISTER_SYCL_HOST_KERNEL(bool); REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(int32); -#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ - .Device(DEVICE_SYCL) \ - .HostMemory("data") \ - .HostMemory("pred") \ - .HostMemory("output_false") \ - .HostMemory("output_true") \ - .TypeConstraint("T"), \ +#define REGISTER_SYCL_HOST_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_SYCL) \ + .HostMemory("data") \ + 
.HostMemory("pred") \ + .HostMemory("output_false") \ + .HostMemory("output_true") \ + .TypeConstraint("T"), \ SwitchOp) REGISTER_SYCL_HOST_REF_KERNEL(int32); @@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class RefSelectOp : public OpKernel { public: @@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel @@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string); REGISTER_SYCL_HOST_KERNEL(ResourceHandle); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL void EnterOp::Compute(OpKernelContext* context) { if (IsRefType(context->input_dtype(0))) { @@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ Name("Enter").Device(DEVICE_SYCL).TypeConstraint("T"), EnterOp) REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); -#define REGISTER_SYCL_REF_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint("T"), EnterOp) REGISTER_SYCL_REF_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL); @@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle); #undef REGISTER_SYCL_HOST_KERNEL #undef REGISTER_SYCL_HOST_REF_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. This kernel @@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool); #undef REGISTER_GPU_REF_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("Exit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); \ - REGISTER_KERNEL_BUILDER( \ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Exit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); \ + REGISTER_KERNEL_BUILDER( \ Name("RefExit").Device(DEVICE_SYCL).TypeConstraint("T"), ExitOp); REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); @@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Special GPU kernels for int32 and string. // TODO(b/25387198): Also enable int32 in device memory. 
This kernel @@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(type) \ - REGISTER_KERNEL_BUILDER( \ - Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ - NextIterationOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint("T"),\ +#define REGISTER_SYCL_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ + NextIterationOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint("T"), \ NextIterationOp) REGISTER_SYCL_KERNEL(bool); TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); @@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL); REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(string); #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // A LoopCond op has one input and one output. The input is a boolean // scalar representing the taken branches of the "pivot" Switch that @@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond") .HostMemory("input") .HostMemory("output"), LoopCondOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // ControlTrigger kernels REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU), @@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU), #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL), ControlTriggerOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // When called, abort op will abort the current process. This can be used to // abort remote PSs when needed. diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc index affa0e8ca6b..a2f7bd40692 100644 --- a/tensorflow/core/kernels/control_flow_ops_test.cc +++ b/tensorflow/core/kernels/control_flow_ops_test.cc @@ -91,6 +91,7 @@ class KilledBySignal { public: explicit KilledBySignal(int signum) : signum_(signum) {} bool operator()(int exit_status) const { return exit_status == signum_; } + private: const int signum_; }; diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 985586d6262..dbddaf3dc64 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -688,7 +688,7 @@ void LaunchConv2DOp::operator()( static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit( // default value is in bytes despite the name of the environment variable "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB - ); + ); int device_id = stream->parent()->device_ordinal(); DataType dtype = input.dtype(); diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc index 291ebf22987..1b40ad81f41 100644 --- a/tensorflow/core/kernels/conv_ops_fused.cc +++ b/tensorflow/core/kernels/conv_ops_fused.cc @@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { const int dims = resized_shape.dims(); OP_REQUIRES( - context, TensorShapeUtils::IsMatrix(paddings.shape()) && - paddings.dim_size(1) == 2, + context, + TensorShapeUtils::IsMatrix(paddings.shape()) && + paddings.dim_size(1) == 2, errors::InvalidArgument("paddings must be a matrix with 2 columns: ", paddings.shape().DebugString())); const int fixed_dims = @@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { const int32 after = paddings_matrix(d, 1); 
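      // paddings was checked above to be a matrix with 2 columns: row d holds
      // the (pad_before, pad_after) amounts for dimension d of the resized
      // input. As a purely illustrative example, paddings =
      // {{0, 0}, {1, 1}, {2, 2}, {0, 0}} on an NHWC tensor would add one row
      // of padding above and below and two columns of padding on the left and
      // right, leaving the batch and depth dimensions alone.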
// Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("paddings must be non-negative: ", - before, " ", after)); + errors::InvalidArgument( + "paddings must be non-negative: ", before, " ", after)); if (offset_ == 0) { // SYMMETRIC mode. OP_REQUIRES( - context, before <= resized_shape.dim_size(d) && - after <= resized_shape.dim_size(d), + context, + before <= resized_shape.dim_size(d) && + after <= resized_shape.dim_size(d), errors::InvalidArgument("paddings must be no greater " "than the dimension size: ", before, ", ", after, " greater than ", resized_shape.dim_size(d))); } else if (offset_ == 1) { // REFLECT mode. OP_REQUIRES( - context, before < resized_shape.dim_size(d) && - after < resized_shape.dim_size(d), + context, + before < resized_shape.dim_size(d) && + after < resized_shape.dim_size(d), errors::InvalidArgument("paddings must be less than" " the dimension size: ", before, ", ", after, " not less than ", @@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // We only check the first three dims, since the depth is accessed as an // int64 below. for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } // The last dimension for input is in_depth. It must be the same as the // filter's in_depth. const int64 in_depth = padded_shape.dim_size(3); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // The second dimension for input is rows/height. // The first dimension for filter is rows/height. const int64 padded_rows_raw = padded_shape.dim_size(1); - OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(padded_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int padded_rows = static_cast(padded_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); const int resized_rows = static_cast(resized_shape.dim_size(1)); @@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { // The third dimension for input is columns/width. // The second dimension for filter is columns/width. 
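    // Taken together, these dimension comments and checks assume an NHWC
    // input layout, padded_shape = [batch, rows, cols, in_depth], and an HWIO
    // filter layout, filter = [filter_rows, filter_cols, in_depth, out_depth].
    // For instance (illustrative numbers only), a 3x3 convolution over RGB
    // input producing 16 output channels would use a [3, 3, 3, 16] filter.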
const int64 padded_cols_raw = padded_shape.dim_size(2); - OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(padded_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int padded_cols = static_cast(padded_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); const int resized_cols = static_cast(resized_shape.dim_size(2)); @@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp); }; -#define REGISTER_FUSED(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("FusedResizeAndPadConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ - FusedResizeConv2DUsingGemmOp< \ - T, FusedResizeAndPadConvFunctor, \ - BILINEAR>, \ +#define REGISTER_FUSED(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedResizeAndPadConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + FusedResizeConv2DUsingGemmOp< \ + T, \ + FusedResizeAndPadConvFunctor, \ + BILINEAR>, \ true>); TF_CALL_float(REGISTER_FUSED); -#define REGISTER_PAD_ONLY_FUSED(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ - FusedResizeConv2DUsingGemmOp< \ - T, FusedResizeAndPadConvFunctor, \ - NEAREST>, \ +#define REGISTER_PAD_ONLY_FUSED(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint("T"), \ + FusedResizeConv2DUsingGemmOp< \ + T, \ + FusedResizeAndPadConvFunctor, \ + NEAREST>, \ false>); TF_CALL_float(REGISTER_PAD_ONLY_FUSED); diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index 57e196c67cf..f0085be3a53 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -27,7 +27,6 @@ limitations under the License. namespace tensorflow { - // Get the Cudnn workspace limit from the environment variable, which is in MB. // Return the workspace memory limit in bytes. If no value is set, return the // default value. diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc index af6013c9747..a376534badc 100644 --- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc +++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc @@ -25,9 +25,9 @@ limitations under the License. 
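// A rough sketch of the environment handling that the GetCudnnWorkspaceLimit
// comment above describes (illustrative only, not the actual implementation;
// variable names here are made up): the environment variable is specified in
// megabytes and converted to bytes, while the compiled-in default
// (1LL << 32 in conv_ops.cc above) is already expressed in bytes, which is
// why the comment there calls out the mismatch with the "_IN_MB" suffix.
//   int64 limit_in_bytes = default_value_in_bytes;
//   const char* limit_str = getenv("TF_CUDNN_WORKSPACE_LIMIT_IN_MB");
//   if (limit_str != nullptr) {
//     int64 limit_in_mb = -1;
//     if (strings::safe_strto64(limit_str, &limit_in_mb)) {
//       limit_in_bytes = limit_in_mb * (1LL << 20);
//     }
//   }
//   return limit_in_bytes;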
#include "cuda/include/cuda.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/lib/math/math_util.h" #include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/lib/math/math_util.h" namespace tensorflow { @@ -252,11 +252,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( int x = threadIdx.x; Dimension<3> output_dims = { - input_dims[0], input_dims[2], input_dims[1], + input_dims[0], + input_dims[2], + input_dims[1], }; Dimension<3> input_dims_in_tiles = { - input_dims[0], (input_dims[1] + TileSizeI - 1) / TileSizeI, + input_dims[0], + (input_dims[1] + TileSizeI - 1) / TileSizeI, (input_dims[2] + TileSizeJ - 1) / TileSizeJ, }; @@ -264,7 +267,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( FlatToTensorIndex(blockIdx.x, input_dims_in_tiles); Index<3> input_tile_origin = { - input_tile_index[0], input_tile_index[1] * TileSizeI, + input_tile_index[0], + input_tile_index[1] * TileSizeI, input_tile_index[2] * TileSizeJ, }; @@ -322,11 +326,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles( __syncthreads(); Index<3> output_tile_index = { - input_tile_index[0], input_tile_index[2], input_tile_index[1], + input_tile_index[0], + input_tile_index[2], + input_tile_index[1], }; Index<3> output_tile_origin = { - output_tile_index[0], output_tile_index[1] * TileSizeJ, + output_tile_index[0], + output_tile_index[1] * TileSizeJ, output_tile_index[2] * TileSizeI, }; @@ -641,8 +648,9 @@ struct BatchNarrowMatrixTransposeDispatcher { static_assert( (TileLongSide & (TileLongSide - 1)) == 0, "The length of the longer side of the tile is always a power of 2."); - bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide && - min(tile_size_i, tile_size_j) <= TileShortSide; + bool request_satisfied = + std::max(tile_size_i, tile_size_j) <= TileLongSide && + std::min(tile_size_i, tile_size_j) <= TileShortSide; if (request_satisfied) { LaunchBatchNarrowMatrixTransposeKernel( @@ -655,7 +663,7 @@ struct BatchNarrowMatrixTransposeDispatcher { // determine whether it is the long side or the short side that falls short // of the request and increase that parameter accordingly. const bool long_side_request_not_satisfied = - max(tile_size_i, tile_size_j) > TileLongSide; + std::max(tile_size_i, tile_size_j) > TileLongSide; if (long_side_request_not_satisfied) { BatchNarrowMatrixTransposeDispatcher< @@ -683,8 +691,9 @@ struct BatchNarrowMatrixTransposeDispatcher< static_assert( (TileLongSide & (TileLongSide - 1)) == 0, "The length of the longer side of the tile is always a power of 2."); - bool request_satisfied = max(tile_size_i, tile_size_j) <= TileLongSide && - min(tile_size_i, tile_size_j) <= TileShortSide; + bool request_satisfied = + std::max(tile_size_i, tile_size_j) <= TileLongSide && + std::min(tile_size_i, tile_size_j) <= TileShortSide; if (request_satisfied) { LaunchBatchNarrowMatrixTransposeKernel( @@ -799,7 +808,7 @@ struct TransposeElemType<16> { // A helper function to make RunSwapDimension1And2InTensor3 concise. This // helper function looks at the data type and input matrix sizes and decides // the thread numbers and tile sizes to use. 
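// Both spellings of the tile-count computation in this file, the explicit
// (n + TileSize - 1) / TileSize form above and MathUtil::CeilOfRatio(n,
// kTileSize) below, are integer ceiling division: the number of fixed-size
// tiles needed to cover n elements. As a quick illustrative check, covering a
// dimension of 70 elements with 32-wide tiles needs (70 + 31) / 32 = 3 tiles,
// the last of which is only partially filled.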
-template +template void SwapDimension1And2InTensor3WithNarrowMatrices( const GPUDevice& d, const T* input, const Dimension<3>& input_dims, T* output, const int kMinDimensionToUseTiles) { @@ -809,7 +818,7 @@ void SwapDimension1And2InTensor3WithNarrowMatrices( int tile_long_side_len = 0; int tile_short_side_len = 0; float lowest_cost = std::numeric_limits::max(); - int data_long_side = max(input_dims[1], input_dims[2]); + int data_long_side = std::max(input_dims[1], input_dims[2]); for (auto tile_size_pair : tile_spec) { int proposed_tile_long_side_len = tile_size_pair.first; @@ -854,12 +863,14 @@ void SwapDimension1And2InTensor3WithNarrowMatrices( // Truncate the shorter size requested according to the manual limit set in // tile_spec to make sure that we do not launch configurations violating // hardware limits. - requested_tile_size_i = requested_tile_size_i == tile_long_side_len - ? tile_long_side_len - : min(requested_tile_size_i, tile_short_side_len); - requested_tile_size_j = requested_tile_size_j == tile_long_side_len - ? tile_long_side_len - : min(requested_tile_size_j, tile_short_side_len); + requested_tile_size_i = + requested_tile_size_i == tile_long_side_len + ? tile_long_side_len + : std::min(requested_tile_size_i, tile_short_side_len); + requested_tile_size_j = + requested_tile_size_j == tile_long_side_len + ? tile_long_side_len + : std::min(requested_tile_size_j, tile_short_side_len); Dimension<3> input_dims_in_tiles = { input_dims[0], @@ -902,19 +913,21 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input, constexpr int kNumThreads = 256; Dimension<3> input_dims_in_tiles = { - input_dims[0], MathUtil::CeilOfRatio(input_dims[1], kTileSize), + input_dims[0], + MathUtil::CeilOfRatio(input_dims[1], kTileSize), MathUtil::CeilOfRatio(input_dims[2], kTileSize), }; int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2]; - SwapDimension1And2InTensor3UsingTiles + SwapDimension1And2InTensor3UsingTiles <<>>(input, input_dims, output); } else if (narrow_matrix) { - SwapDimension1And2InTensor3WithNarrowMatrices(d, input, input_dims, output, - kMinDimensionToUseTiles); + SwapDimension1And2InTensor3WithNarrowMatrices( + d, input, input_dims, output, kMinDimensionToUseTiles); } else { int total_element_count = input_dims[0] * input_dims[1] * input_dims[2]; CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d); diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc index 20da77c36f6..af0a9fa82ee 100644 --- a/tensorflow/core/kernels/conv_ops_using_gemm.cc +++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc @@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp { filter.shape().DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } // The last dimension for input is in_depth. It must be the same as the // filter's in_depth. 
const int64 in_depth = GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp { // The second dimension for input is rows/height. // The first dimension for filter is rows/height. const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int input_rows = static_cast(input_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); // The third dimension for input is columns/width. // The second dimension for filter is columns/width. const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int input_cols = static_cast(input_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); diff --git a/tensorflow/core/kernels/cross_op_gpu.cu.cc b/tensorflow/core/kernels/cross_op_gpu.cu.cc index 7ea0b3be0ca..4a37f6cfbbc 100644 --- a/tensorflow/core/kernels/cross_op_gpu.cu.cc +++ b/tensorflow/core/kernels/cross_op_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/cross_op.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/cross_op.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc index 73ee3106048..96bdb6a241b 100644 --- a/tensorflow/core/kernels/ctc_decoder_ops.cc +++ b/tensorflow/core/kernels/ctc_decoder_ops.cc @@ -19,13 +19,13 @@ limitations under the License. #include -#include "tensorflow/core/util/ctc/ctc_beam_search.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/ctc/ctc_beam_search.h" #include "tensorflow/core/util/sparse/sparse_tensor.h" namespace tensorflow { @@ -80,16 +80,17 @@ class CTCDecodeHelper { if (!(batch_size == (*seq_len)->dim_size(0))) { return errors::FailedPrecondition( - "len(sequence_length) != batch_size. ", "len(sequence_length): ", - (*seq_len)->dim_size(0), " batch_size: ", batch_size); + "len(sequence_length) != batch_size. 
", + "len(sequence_length): ", (*seq_len)->dim_size(0), + " batch_size: ", batch_size); } auto seq_len_t = (*seq_len)->vec(); for (int b = 0; b < batch_size; ++b) { if (!(seq_len_t(b) <= max_time)) { - return errors::FailedPrecondition("sequence_length(", b, ") <= ", - max_time); + return errors::FailedPrecondition("sequence_length(", b, + ") <= ", max_time); } } diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc index fb03adb7a53..b38d838bf1e 100644 --- a/tensorflow/core/kernels/ctc_loss_op.cc +++ b/tensorflow/core/kernels/ctc_loss_op.cc @@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel { const int64 batch_indices = g.group()[0]; OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size), errors::InvalidArgument("labels batch index must be between ", - 0, " and ", batch_size, " but saw: ", - batch_indices)); + 0, " and ", batch_size, + " but saw: ", batch_indices)); auto values = g.values(); std::vector* b_values = &labels_t[batch_indices]; diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc index 5fd38d9dc25..1466f24202f 100644 --- a/tensorflow/core/kernels/cwise_op_abs.cc +++ b/tensorflow/core/kernels/cwise_op_abs.cc @@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc index 12cc6c8bdd4..49191226074 100644 --- a/tensorflow/core/kernels/cwise_op_acos.cc +++ b/tensorflow/core/kernels/cwise_op_acos.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc index 39c88140733..c2b355ab7f4 100644 --- a/tensorflow/core/kernels/cwise_op_acosh.cc +++ b/tensorflow/core/kernels/cwise_op_acosh.cc @@ -17,12 +17,12 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc index 608a6dce3d2..bf32c8a54b3 100644 --- a/tensorflow/core/kernels/cwise_op_add_1.cc +++ b/tensorflow/core/kernels/cwise_op_add_1.cc @@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2") BinaryOp>); #endif - #if TENSORFLOW_USE_SYCL #define REGISTER_KERNEL(type) \ REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \ @@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc index ac21ca06c92..e8acbac2853 100644 --- a/tensorflow/core/kernels/cwise_op_add_2.cc +++ b/tensorflow/core/kernels/cwise_op_add_2.cc @@ -22,8 +22,8 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. #if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, - uint8, complex128, string); +REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8, + complex128, string); // Notice: String is excluded to allow marking AddV2 is_commutative and // is_aggregate. REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8, diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc index c28e27d95ae..fe8dfea1173 100644 --- a/tensorflow/core/kernels/cwise_op_asin.cc +++ b/tensorflow/core/kernels/cwise_op_asin.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc index 0aec6aac344..7cf0405f524 100644 --- a/tensorflow/core/kernels/cwise_op_asinh.cc +++ b/tensorflow/core/kernels/cwise_op_asinh.cc @@ -1,10 +1,10 @@ - /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -17,8 +17,8 @@ limitations under the License. 
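// The REGISTERn(...) helpers used throughout these cwise kernels are
// shorthand for registering the same functor once per listed element type.
// Conceptually (a sketch of the expansion, not its literal text),
//   REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
// behaves like two REGISTER_KERNEL_BUILDER calls, one constrained to
// TypeConstraint<float>("T") and one to TypeConstraint<double>("T"), both
// binding the "Asinh" op on DEVICE_SYCL to the corresponding
// UnaryOp<SYCLDevice, functor::asinh<T>> kernel.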
#include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc index 7d73de48102..09f0448874f 100644 --- a/tensorflow/core/kernels/cwise_op_atan.cc +++ b/tensorflow/core/kernels/cwise_op_atan.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc index 7b688db4c58..6170683fa64 100644 --- a/tensorflow/core/kernels/cwise_op_atanh.cc +++ b/tensorflow/core/kernels/cwise_op_atanh.cc @@ -17,8 +17,8 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_gradients.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64, + complex128); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc index 0111e9d5fd1..816eadc80eb 100644 --- a/tensorflow/core/kernels/cwise_op_ceil.cc +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double); #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc index d4b3b0e3935..71ad0ff0dc2 100644 --- a/tensorflow/core/kernels/cwise_op_cos.cc +++ b/tensorflow/core/kernels/cwise_op_cos.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc index bca99a4f897..31b4bb3cadd 100644 --- a/tensorflow/core/kernels/cwise_op_cosh.cc +++ b/tensorflow/core/kernels/cwise_op_cosh.cc @@ -16,20 +16,18 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64, + complex128); #if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Cosh") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); +#define REGISTER_SYCL_KERNEL(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Cosh").Device(DEVICE_SYCL).TypeConstraint("T"), \ + UnaryOp>); REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc index d44c1bf473e..c71c756e446 100644 --- a/tensorflow/core/kernels/cwise_op_div.cc +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc index 66d7b7d22eb..8f4ac98016c 100644 --- a/tensorflow/core/kernels/cwise_op_exp.cc +++ b/tensorflow/core/kernels/cwise_op_exp.cc @@ -26,5 +26,5 @@ REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double, #if TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc index 4f723080060..ce03ad5de62 100644 --- a/tensorflow/core/kernels/cwise_op_expm1.cc +++ b/tensorflow/core/kernels/cwise_op_expm1.cc @@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc index 5a142b9ce9f..d554d41c412 100644 --- a/tensorflow/core/kernels/cwise_op_floor.cc +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc index fa81ef0872d..fecbf859897 100644 --- a/tensorflow/core/kernels/cwise_op_floor_div.cc +++ b/tensorflow/core/kernels/cwise_op_floor_div.cc @@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc index 55f8a30461f..29340b88506 100644 --- a/tensorflow/core/kernels/cwise_op_floor_mod.cc +++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc @@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // 
TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc index e7dff5d0ac5..77723b3169f 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc @@ -19,8 +19,8 @@ limitations under the License. namespace tensorflow { namespace functor { - DEFINE_UNARY1(conj, complex64); - DEFINE_UNARY1(conj, complex128); +DEFINE_UNARY1(conj, complex64); +DEFINE_UNARY1(conj, complex128); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc index 3675398126f..26748ef0e72 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc @@ -20,7 +20,7 @@ limitations under the License. namespace tensorflow { namespace functor { DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64, - complex64, complex128, bool); + complex64, complex128, bool); DEFINE_APPROXIMATE_EQUAL2(float, double); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc index a54dbdfc247..627ecc8c802 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc @@ -15,8 +15,10 @@ limitations under the License. #if GOOGLE_CUDA -#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" +#define EIGEN_USE_GPU + #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" namespace tensorflow { namespace functor { @@ -38,19 +40,17 @@ struct SelectScalarFunctor { typename TTypes::ConstScalar cond, typename TTypes::ConstFlat then_flat, typename TTypes::ConstFlat else_flat) { - #if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array rank1{1}; + Eigen::array rank1{1}; #else - Eigen::IndexList> rank1; + Eigen::IndexList > rank1; #endif - const int size = then_flat.dimension(0); - Eigen::array broadcast_dims{size}; - - To32Bit(out).device(d) = cond.reshape(rank1) - .broadcast(broadcast_dims) - .select(then_flat, else_flat); + const int size = then_flat.dimension(0); + Eigen::array broadcast_dims{size}; + To32Bit(out).device(d) = cond.reshape(rank1) + .broadcast(broadcast_dims) + .select(then_flat, else_flat); } }; @@ -89,8 +89,8 @@ struct BatchSelectFunctor { } }; -#define SELECT_FUNCTOR(T) \ - template struct SelectFunctor; \ +#define SELECT_FUNCTOR(T) \ + template struct SelectFunctor; \ template struct SelectScalarFunctor; \ template struct BatchSelectFunctor; diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc index ba89899fb32..a4ea4088369 100644 --- a/tensorflow/core/kernels/cwise_op_greater.cc +++ b/tensorflow/core/kernels/cwise_op_greater.cc @@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc index 8f0c483aecd..3f34d6269ef 100644 --- a/tensorflow/core/kernels/cwise_op_greater_equal.cc +++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc @@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") #endif #ifdef TENSORFLOW_USE_SYCL 
-REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double); +REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, + double); REGISTER_KERNEL_BUILDER(Name("GreaterEqual") .Device(DEVICE_SYCL) @@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc index df2c02e42e1..f5cafcc7809 100644 --- a/tensorflow/core/kernels/cwise_op_invert.cc +++ b/tensorflow/core/kernels/cwise_op_invert.cc @@ -21,7 +21,7 @@ REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64, #ifdef TENSORFLOW_USE_SYCL REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64, - uint8, uint16); + uint8, uint16); #endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc index 53ec1c1c63f..ae1e590d242 100644 --- a/tensorflow/core/kernels/cwise_op_isfinite.cc +++ b/tensorflow/core/kernels/cwise_op_isfinite.cc @@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half, #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc index 4b34744304f..f22ca21e1ca 100644 --- a/tensorflow/core/kernels/cwise_op_isinf.cc +++ b/tensorflow/core/kernels/cwise_op_isinf.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc index ad2dd3f722c..aa180c247e7 100644 --- a/tensorflow/core/kernels/cwise_op_isnan.cc +++ b/tensorflow/core/kernels/cwise_op_isnan.cc @@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc index 136c3666dfc..00cdecdbd18 100644 --- a/tensorflow/core/kernels/cwise_op_less.cc +++ b/tensorflow/core/kernels/cwise_op_less.cc @@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc index 97a2508d129..11806c5fc77 100644 --- a/tensorflow/core/kernels/cwise_op_less_equal.cc +++ b/tensorflow/core/kernels/cwise_op_less_equal.cc @@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc index 7fdfdff0e38..98936e0f960 100644 --- 
a/tensorflow/core/kernels/cwise_op_log.cc +++ b/tensorflow/core/kernels/cwise_op_log.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc index 25ad7b24bb1..162ca9e07cd 100644 --- a/tensorflow/core/kernels/cwise_op_log1p.cc +++ b/tensorflow/core/kernels/cwise_op_log1p.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc index 87d54e380b4..8c54f22f108 100644 --- a/tensorflow/core/kernels/cwise_op_maximum.cc +++ b/tensorflow/core/kernels/cwise_op_maximum.cc @@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc index 442171193bf..dff83df828f 100644 --- a/tensorflow/core/kernels/cwise_op_minimum.cc +++ b/tensorflow/core/kernels/cwise_op_minimum.cc @@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc index 023eb07ca3f..0e8d2e37350 100644 --- a/tensorflow/core/kernels/cwise_op_mul_1.cc +++ b/tensorflow/core/kernels/cwise_op_mul_1.cc @@ -17,8 +17,8 @@ limitations under the License. namespace tensorflow { -REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, - uint8, int32); +REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8, + int32); #if defined(__ANDROID_TYPES_SLIM__) // We only register the first type when we have multi-argument calls in the // case where we're trying to reduce executable size, but it turns out that the @@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32); #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double, - uint8); + uint8); // A special GPU kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel // registration requires all int32 inputs and outputs to be in host memory. @@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc index 7be5857cc06..6aa8f883640 100644 --- a/tensorflow/core/kernels/cwise_op_mul_2.cc +++ b/tensorflow/core/kernels/cwise_op_mul_2.cc @@ -22,11 +22,11 @@ namespace tensorflow { // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__. 
#if !defined(__ANDROID_TYPES_SLIM__) -REGISTER6(BinaryOp, CPU, "Mul", functor::mul, - int8, uint16, int16, int64, complex64, complex128); +REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64, + complex64, complex128); #if GOOGLE_CUDA REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64, - complex64, complex128); + complex64, complex128); #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc index 536891b548f..a136769b912 100644 --- a/tensorflow/core/kernels/cwise_op_neg.cc +++ b/tensorflow/core/kernels/cwise_op_neg.cc @@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64, diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc index 7bd81ee1271..02cd2987457 100644 --- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc +++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc @@ -17,7 +17,7 @@ limitations under the License. namespace tensorflow { REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half, - double, uint8, int8, int16); + double, uint8, int8, int16); #if GOOGLE_CUDA REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half, double, uint8); diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc index 7d4ecec59f1..05bdea66367 100644 --- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc +++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc @@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64, #endif // GOOGLE_CUDA -#endif // !defined(__ANDROID_TYPES_SLIM__) +#endif // !defined(__ANDROID_TYPES_SLIM__) } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc index 8c0e21f9cf3..aee25747b86 100644 --- a/tensorflow/core/kernels/cwise_op_reciprocal.cc +++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc @@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float, Eigen::half, double, complex64, complex128); @@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc index 3dd9de8d897..e259daaba47 100644 --- a/tensorflow/core/kernels/cwise_op_select.cc +++ b/tensorflow/core/kernels/cwise_op_select.cc @@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SelectOp : public OpKernel { @@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double); REGISTER_SELECT_SYCL(int32); REGISTER_SELECT_SYCL(int64); #undef REGISTER_SELECT_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif 
// TENSORFLOW_USE_SYCL namespace functor { @@ -201,13 +201,11 @@ struct SelectFunctorBase { }; template -struct SelectFunctor - : SelectFunctorBase {}; +struct SelectFunctor : SelectFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template -struct SelectFunctor - : SelectFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +struct SelectFunctor : SelectFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL template struct SelectScalarFunctorBase { @@ -222,12 +220,12 @@ struct SelectScalarFunctorBase { // CPU Specializations of Select functors with scalar template struct SelectScalarFunctor - : SelectScalarFunctorBase {}; + : SelectScalarFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct SelectScalarFunctor - : SelectScalarFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL + : SelectScalarFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL template struct BatchSelectFunctorBase { @@ -240,8 +238,8 @@ struct BatchSelectFunctorBase { const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1); #if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array broadcast_dims{{ 1, all_but_batch }}; - Eigen::Tensor::Dimensions reshape_dims{{ batch, 1 }}; + Eigen::array broadcast_dims{{1, all_but_batch}}; + Eigen::Tensor::Dimensions reshape_dims{{batch, 1}}; #else Eigen::IndexList, Eigen::DenseIndex> broadcast_dims; broadcast_dims.set(1, all_but_batch); @@ -257,13 +255,13 @@ struct BatchSelectFunctorBase { }; template -struct BatchSelectFunctor - : BatchSelectFunctorBase {}; +struct BatchSelectFunctor : BatchSelectFunctorBase { +}; #ifdef TENSORFLOW_USE_SYCL template struct BatchSelectFunctor - : BatchSelectFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL + : BatchSelectFunctorBase {}; +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc index a76a088ac8f..c132fdb63f2 100644 --- a/tensorflow/core/kernels/cwise_op_sigmoid.cc +++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc @@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float, Eigen::half, double, complex64, complex128); @@ -35,6 +35,6 @@ REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float, #endif #ifdef TENSORFLOW_USE_SYCL REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc index a4084d5ad17..02915ff4ce4 100644 --- a/tensorflow/core/kernels/cwise_op_sign.cc +++ b/tensorflow/core/kernels/cwise_op_sign.cc @@ -41,6 +41,6 @@ REGISTER_KERNEL_BUILDER(Name("Sign") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc index b91ff1ac30b..16c60578640 100644 --- a/tensorflow/core/kernels/cwise_op_sin.cc +++ b/tensorflow/core/kernels/cwise_op_sin.cc @@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double); -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow 
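// Relating to the BatchSelectFunctorBase code in cwise_op_select.cc above:
// the condition vector holds one bool per batch entry, so it is reshaped to
// [batch, 1] and broadcast across the remaining flattened dimensions before
// the element-wise select. With illustrative shapes only, batch = 2 and
// all_but_batch = 3, a condition of {true, false} selects the entire first
// row from the "then" tensor and the entire second row from the "else"
// tensor, both of shape [2, 3].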
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc index 055f0b12e14..26b7a940aa8 100644 --- a/tensorflow/core/kernels/cwise_op_sinh.cc +++ b/tensorflow/core/kernels/cwise_op_sinh.cc @@ -16,20 +16,18 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, - complex64, complex128); +REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64, + complex128); #if TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Sinh") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T"), \ - UnaryOp>); +#define REGISTER_SYCL_KERNEL(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Sinh").Device(DEVICE_SYCL).TypeConstraint("T"), \ + UnaryOp>); REGISTER_SYCL_KERNEL(float); REGISTER_SYCL_KERNEL(double); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC #if GOOGLE_CUDA REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double); diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc index 00efbb00f15..497756133d0 100644 --- a/tensorflow/core/kernels/cwise_op_sqrt.cc +++ b/tensorflow/core/kernels/cwise_op_sqrt.cc @@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float, Eigen::half, double, complex64, complex128); @@ -36,5 +36,5 @@ REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float, #ifdef TENSORFLOW_USE_SYCL REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc index 07a4b0b084d..7fc2f6bf08b 100644 --- a/tensorflow/core/kernels/cwise_op_square.cc +++ b/tensorflow/core/kernels/cwise_op_square.cc @@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Square") .HostMemory("y") .TypeConstraint("T"), UnaryOp>); -#endif // TENSORFLOW_USE_SYC +#endif // TENSORFLOW_USE_SYC } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc index 6adaecba04b..025041946ac 100644 --- a/tensorflow/core/kernels/cwise_op_sub.cc +++ b/tensorflow/core/kernels/cwise_op_sub.cc @@ -53,5 +53,5 @@ REGISTER_KERNEL_BUILDER(Name("Sub") .HostMemory("z") .TypeConstraint("T"), BinaryOp>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc index 7891b1183dd..c1a25767d31 100644 --- a/tensorflow/core/kernels/cwise_op_tan.cc +++ b/tensorflow/core/kernels/cwise_op_tan.cc @@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc index 8b3900892c3..c5005f5ea8a 100644 --- a/tensorflow/core/kernels/cwise_op_tanh.cc +++ b/tensorflow/core/kernels/cwise_op_tanh.cc @@ -26,7 +26,7 @@ REGISTER3(UnaryOp, GPU, "Tanh", 
functor::tanh, float, Eigen::half, double); #ifdef TENSORFLOW_USE_SYCL REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float, Eigen::half, double, complex64, complex128); diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc index e561e59cf5a..980edffceb3 100644 --- a/tensorflow/core/kernels/cwise_ops_common.cc +++ b/tensorflow/core/kernels/cwise_ops_common.cc @@ -57,9 +57,9 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx) in1(ctx->input(1)), bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) { if (!bcast.IsValid()) { - ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ", - in0.shape().DebugString(), " vs. ", - in1.shape().DebugString())); + ctx->SetStatus(errors::InvalidArgument( + "Incompatible shapes: ", in0.shape().DebugString(), " vs. ", + in1.shape().DebugString())); return; } const TensorShape output_shape = BCast::ToShape(bcast.output_shape()); diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h index 43947707089..e81b840a509 100644 --- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h +++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h @@ -50,16 +50,16 @@ struct SimpleBinaryFunctor { // Macros to explicitly instantiate kernels on GPU for multiple types // (T0, T1, etc.) for SimpleBiaryFunctor (e.g., functor::tanh_grad). -#define DEFINE_SIMPLE_BINARY1(F, T) \ +#define DEFINE_SIMPLE_BINARY1(F, T) \ template struct SimpleBinaryFunctor > -#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \ - DEFINE_SIMPLE_BINARY1(F, T0); \ +#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \ + DEFINE_SIMPLE_BINARY1(F, T0); \ DEFINE_SIMPLE_BINARY1(F, T1) -#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \ - DEFINE_SIMPLE_BINARY2(F, T0, T1); \ +#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ DEFINE_SIMPLE_BINARY1(F, T2) -#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \ - DEFINE_SIMPLE_BINARY2(F, T0, T1); \ +#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \ + DEFINE_SIMPLE_BINARY2(F, T0, T1); \ DEFINE_SIMPLE_BINARY2(F, T2, T3) #define DEFINE_SIMPLE_BINARY5(F, T0, T1, T2, T3, T4) \ DEFINE_SIMPLE_BINARY2(F, T0, T1); \ diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h index 77b330f5899..82cdae9a348 100644 --- a/tensorflow/core/kernels/cwise_ops_gradients.h +++ b/tensorflow/core/kernels/cwise_ops_gradients.h @@ -171,7 +171,6 @@ struct SimpleBinaryFunctor { } }; - #ifdef TENSORFLOW_USE_SYCL // Partial specialization of BinaryFunctor for SYCL devices typedef Eigen::SyclDevice SYCLDevice; @@ -184,7 +183,7 @@ struct SimpleBinaryFunctor { } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template struct tanh_grad : base> {}; diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h index 3f6ff7303d6..3e107cee04c 100644 --- a/tensorflow/core/kernels/cwise_ops_sycl_common.h +++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h @@ -51,7 +51,8 @@ struct BinaryFunctor { void operator()(const SYCLDevice& d, typename Functor::tout_type out, typename Functor::tin_type in0, typename Functor::tin_type in1, bool* error) { - To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func()); + To32Bit(out).device(d) = + 
To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func()); } void Left(const SYCLDevice& d, typename Functor::tout_type out, @@ -61,7 +62,9 @@ struct BinaryFunctor { constexpr int NumDims = Functor::tin_type::NumDimensions; static_assert(NumDims == 1, "Unexpected size"); Eigen::Sizes<1> scalar_dim; - out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary()); + out.device(d) = scalar.reshape(scalar_dim) + .broadcast(in.dimensions()) + .binaryExpr(in, Binary()); } void Right(const SYCLDevice& d, typename Functor::tout_type out, @@ -71,7 +74,8 @@ struct BinaryFunctor { constexpr int NumDims = Functor::tin_type::NumDimensions; static_assert(NumDims == 1, "Unexpected size"); Eigen::Sizes<1> scalar_dim; - out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary()); + out.device(d) = in.binaryExpr( + scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary()); } void BCast(const SYCLDevice& d, diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc index bca0f1004d5..39f497e7161 100644 --- a/tensorflow/core/kernels/cwise_ops_test.cc +++ b/tensorflow/core/kernels/cwise_ops_test.cc @@ -54,36 +54,36 @@ int ColsFromArg(int arg) { return (arg % kRows); } BM_UNARY(cpu, Floor, float, DT_FLOAT); #if GOOGLE_CUDA BM_UNARY(gpu, Floor, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_UNARY(sycl, Floor, float, DT_FLOAT); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_UNARY(cpu, Floor, double, DT_DOUBLE); #if GOOGLE_CUDA BM_UNARY(gpu, Floor, double, DT_DOUBLE); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_UNARY(sycl, Floor, double, DT_DOUBLE); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX64); #if GOOGLE_CUDA BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX64); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX128); #if GOOGLE_CUDA BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX128); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Rint, double, DT_DOUBLE); #if GOOGLE_CUDA BM_UNARY(gpu, Rint, double, DT_DOUBLE); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_UNARY(cpu, Rint, float, DT_FLOAT); #if GOOGLE_CUDA BM_UNARY(gpu, Rint, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA // data func scalar. 
Graph* BinaryScalar(int num, const string& func) { @@ -113,18 +113,18 @@ Graph* BinaryScalar(int num, const string& func) { BM_BINARY_SCALAR(cpu, Less); #if GOOGLE_CUDA BM_BINARY_SCALAR(gpu, Less); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(sycl, Less); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(cpu, Add); #if GOOGLE_CUDA BM_BINARY_SCALAR(gpu, Add); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BINARY_SCALAR(sycl, Add); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BINARY_SCALAR template @@ -163,11 +163,11 @@ using Eigen::half; BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT); #if GOOGLE_CUDA BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_ALL(cpu, half, DT_HALF); #if GOOGLE_CUDA BM_BIAS_ADD_ALL(gpu, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #undef BM_BIAS_ADD_ALL #undef BM_BIAS_ADD @@ -217,15 +217,15 @@ using Eigen::half; #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT); BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT); #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF); #if GOOGLE_CUDA BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #undef BM_BIAS_ADD_GRAD_ALL #undef BM_BIAS_ADD_GRAD @@ -265,10 +265,10 @@ Graph* BcastAdd(int rows, int cols, int dim) { BM_BCAST_ADD_ROW_ALL(cpu); #if GOOGLE_CUDA BM_BCAST_ADD_ROW_ALL(gpu); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BCAST_ADD_ROW_ALL(sycl); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BCAST_ADD_ROW_ALL #undef BM_BCAST_ADD_ROW @@ -291,10 +291,10 @@ BM_BCAST_ADD_ROW_ALL(sycl); BM_BCAST_ADD_COL_ALL(cpu); #if GOOGLE_CUDA BM_BCAST_ADD_COL_ALL(gpu); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL BM_BCAST_ADD_COL_ALL(sycl); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef BM_BCAST_ADD_COL_ALL #undef BM_BCAST_ADD_COL diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 500ee7b43f2..c4e21257ffc 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -49,6 +49,7 @@ cc_library( srcs = ["dataset.cc"], hdrs = ["dataset.h"], deps = [ + "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", @@ -81,9 +82,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:session_options", "//tensorflow/core/kernels:variable_ops", ], ) @@ -122,6 +121,7 @@ tf_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/kernels:batch_util", ], ) @@ -402,6 +402,19 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "tensor_queue_dataset_op", + srcs = ["tensor_queue_dataset_op.cc"], + deps = [ + ":dataset", + "//tensorflow/core:dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/kernels:batch_util", + ], +) + tf_kernel_library( name = "tensor_slice_dataset_op", srcs = 
["tensor_slice_dataset_op.cc"], @@ -540,6 +553,7 @@ tf_kernel_library( ":stats_dataset_ops", ":take_dataset_op", ":tensor_dataset_op", + ":tensor_queue_dataset_op", ":tensor_slice_dataset_op", ":unique_dataset_op", ":zip_dataset_op", diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc index 2d6e06398f6..7fa67efb9e2 100644 --- a/tensorflow/core/kernels/data/batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/batch_dataset_op.cc @@ -92,7 +92,6 @@ class BatchDatasetOp : public UnaryDatasetOpKernel { } private: - class Iterator : public DatasetIterator { public: explicit Iterator(const Params& params) @@ -145,7 +144,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel { const Tensor& first_element = batch_elements[0][component_index]; TensorShape batch_component_shape({num_batch_elements}); batch_component_shape.AppendShape(first_element.shape()); - Tensor batch_component(cpu_allocator(), first_element.dtype(), + Tensor batch_component(ctx->allocator({}), first_element.dtype(), batch_component_shape); // Build the output tuple component by copying one slice // from each input element in the batch. diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index 1f6d32f8df3..f3e4f1cd3fd 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/notification.h" - namespace tensorflow { /* static */ @@ -185,8 +184,7 @@ Status CapturedFunction::MaybeInstantiate( return Status::OK(); } -Status CapturedFunction::Run(IteratorContext* ctx, - std::vector&& args, +Status CapturedFunction::Run(IteratorContext* ctx, std::vector&& args, std::vector* rets) { FunctionLibraryRuntime::Handle handle; TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &handle)); diff --git a/tensorflow/core/kernels/data/dataset.cc b/tensorflow/core/kernels/data/dataset.cc index 2ea68755676..d18cb160189 100644 --- a/tensorflow/core/kernels/data/dataset.cc +++ b/tensorflow/core/kernels/data/dataset.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/core/kernels/data/dataset.h" +#include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/graph/node_builder.h" @@ -264,6 +265,10 @@ void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx, MakeDataset(ctx, input, another_input, output); } +Allocator* IteratorContext::allocator(AllocatorAttributes attrs) { + return params_.lib->device()->GetAllocator(attrs); +} + const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH"; const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] = "_DATASET_GRAPH_OUTPUT_NODE"; diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h index 2ef31ddfaaa..2c6fc8d5b4f 100644 --- a/tensorflow/core/kernels/data/dataset.h +++ b/tensorflow/core/kernels/data/dataset.h @@ -15,595 +15,6 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_ #define TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_ -#include - -#include "tensorflow/core/framework/attr_value.pb.h" -#include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/dataset.h" -#include "tensorflow/core/framework/function.h" -#include "tensorflow/core/framework/graph.pb.h" -#include "tensorflow/core/framework/node_def.pb.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/framework/variant_encode_decode.h" -#include "tensorflow/core/framework/variant_tensor_data.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/tracing.h" - -// Polymorphic datasets should support all primitive TensorFlow -// types. Use this macro to expand `m(T)` once for each primitive type -// `T`, e.g. to build a `switch` statement. -#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m) - -namespace tensorflow { - -// Interface for reading values from a key-value store. -// Used for restoring iterator state. -class IteratorStateReader { - public: - virtual Status ReadScalar(StringPiece key, int64* val) = 0; - virtual Status ReadScalar(StringPiece key, string* val) = 0; - virtual Status ReadTensor(StringPiece key, Tensor* val) = 0; - virtual bool Contains(StringPiece key) = 0; - - virtual ~IteratorStateReader() {} -}; - -// Interface for writing values to a key-value store. -// Used for saving iterator state. -class IteratorStateWriter { - public: - virtual Status WriteScalar(StringPiece key, const int64 val) = 0; - virtual Status WriteScalar(StringPiece key, const string& val) = 0; - virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0; - - virtual ~IteratorStateWriter() {} -}; - -// Forward declarations to avoid introducing a dependency on headers in -// "tensorflow/core/graph/...". -class GraphDefBuilder; -class GraphDatasetBase; -class Node; - -// Wrapper around GraphDefBuilder. Used to serialize Dataset graph. -class GraphDefBuilderWrapper { - public: - explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {} - - // Adds a Const node with scalar value to the Graph. - // `*output` contains a pointer to the output `Node`. It is guaranteed to be - // non-null if the method returns with an OK status. - // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. - template - Status AddScalar(const T& val, Node** output) { - Tensor val_t = Tensor(DataTypeToEnum::v(), TensorShape({})); - val_t.scalar()() = val; - AddTensorInternal(val_t, output); - if (*output == nullptr) { - return errors::Internal("AddScalar: Failed to build Const op."); - } - return Status::OK(); - } - - // Adds a Const node with vector value to the Graph. - // `*output` contains a pointer to the output `Node`. It is guaranteed to be - // non-null if the method returns with an OK status. - // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. - // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice? 
- template - Status AddVector(const std::vector& val, Node** output) { - Tensor val_t = Tensor(DataTypeToEnum::v(), - TensorShape({static_cast(val.size())})); - for (int i = 0; i < val.size(); i++) { - val_t.flat()(i) = val[i]; - } - AddTensorInternal(val_t, output); - if (*output == nullptr) { - return errors::Internal("AddVector: Failed to build Const op."); - } - return Status::OK(); - } - - // Adds a Const node with Tensor value to the Graph. - // `*output` contains a pointer to the output `Node`. It is guaranteed to be - // non-null if the method returns with an OK status. - // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. - Status AddTensor(const Tensor& val, Node** output) { - AddTensorInternal(val, output); - if (*output == nullptr) { - return errors::Internal("AddTensor: Failed to build Const op."); - } - return Status::OK(); - } - - Status AddDataset(const GraphDatasetBase* dataset, - const std::vector& inputs, Node** output) { - return AddDataset(dataset, inputs, {}, output); - } - - // Adds a node corresponding to the `DatasetType` to the Graph. - // Return value of `DatasetType::op_name()` is used as the op type for the - // node. - // Values for the output_types and output_shapes node attributes are also - // written if those attributes are defined in the OpDef. - // `*output` contains a pointer to the output `Node`. It is guaranteed to be - // non-null if the method returns with an OK status. - // The returned Node pointer is owned by the backing Graph of GraphDefBuilder. - Status AddDataset(const GraphDatasetBase* dataset, - const std::vector& inputs, - const std::vector>& attrs, - Node** output) { - std::vector> enumerated_inputs(inputs.size()); - for (int i = 0; i < inputs.size(); i++) { - enumerated_inputs[i] = std::make_pair(i, inputs[i]); - } - return AddDataset(dataset, enumerated_inputs, {}, attrs, output); - } - - Status AddDataset( - const GraphDatasetBase* dataset, - const std::vector>& inputs, - const std::vector>>& list_inputs, - const std::vector>& attrs, - Node** output); - - // Adds a user-defined function with name `function_name` to the graph and - // recursively adds all functions it references. If a function with a matching - // name has already been added, returns with OK status. If a user-defined with - // name `function_name` is not found in the FunctionLibraryDefinition, returns - // an InvalidArgumentError. If the function with name `function_name` or any - // of its dependent functions are stateful, returns an InvalidArgument error. - Status AddFunction(OpKernelContext* ctx, const string& function_name); - - template - void BuildAttrValue(const T& value, AttrValue* attr) { - SetAttrValue(value, attr); - } - - private: - void AddTensorInternal(const Tensor& val, Node** output); - - Status EnsureFunctionIsStateless(OpKernelContext* ctx, - const string& function_name) const { - const FunctionLibraryDefinition* lib_def = - ctx->function_library()->GetFunctionLibraryDefinition(); - const FunctionDef* function_def = lib_def->Find(function_name); - if (!function_def) { - return errors::InvalidArgument("Unable to find FunctionDef for ", - function_name, " in registry."); - } - for (const NodeDef& node_def : function_def->node_def()) { - const OpDef* op_def; - TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def)); - // TODO(b/65524810): Hack to allow functions to capture Dataset op - // nodes needed for FlatMap. 
Currently, source datasets nodes have been - // marked stateful to avoid constant folding since we do not have a - // good way of serializing them. - if (IsOpWhitelisted(op_def)) { - continue; - } - if (op_def->is_stateful()) { - return errors::InvalidArgument( - "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ", - "in function ", function_name, " is stateful. ", - "Saving stateful functions is not supported yet."); - } - } - return Status::OK(); - } - - // Returns whether an op has been whitelisted for use inside map_fns. - // Uses a heuristic to whitelist source dataset ops which have been - // marked stateful due to b/65524810. - // Also looks up the `op_def->name` in the global - // `WhitelistedStatefulOpRegistry`. - bool IsOpWhitelisted(const OpDef* op_def) const { - return (StringPiece(op_def->name()).ends_with("Dataset") && - op_def->output_arg_size() == 1 && - op_def->output_arg(0).type() == DT_VARIANT) || - dataset::WhitelistedStatefulOpRegistry::Global()->Contains( - op_def->name()); - } - - bool HasAttr(const string& op_type_name, const string& attr_name) const; - - bool HasAttr(const OpDef* op_def, const string& attr_name) const { - for (auto attr : op_def->attr()) { - if (attr.name() == attr_name) { - return true; - } - } - return false; - } - - Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) { - if (attr_value.has_func()) { - TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name())); - } else if (attr_value.has_list()) { - for (const NameAttrList& name_attr_list : attr_value.list().func()) { - TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name())); - } - } - return Status::OK(); - } - - GraphDefBuilder* b_; -}; - -class StatsAggregator; - -// A cut-down version of OpKernelContext for running computations in -// iterators. Note that we cannot simply use OpKernelContext here -// because we might run computation in an iterator whose lifetime is -// not nested within the lifetime of a single OpKernelContext -// (e.g. asynchronous prefetching). -// -// TODO(mrry): We will probably need to support more of -// OpKernelContext here. For example, should allocation be handled by -// the IteratorContext? -// TODO(mrry): We're making some daring assumptions about the lifetime -// of the runner passed in here. A runner will be deleted when the original -// step ends, but all existing runners only close over session-lifetime (or -// longer-lived) state, so we can make a copy of the function. There's nothing -// in the definition of the API from which we took the runner to guarantee that -// what we are doing is safe. We should formalize the properties here. -class IteratorContext { - public: - struct Params { - // Interface to operating system functionality. - Env* env; - - // Function call support. - std::function)> runner = nullptr; - - // A function that returns the current `StatsAggregator` instance to be - // used when recording statistics about the iterator. - // - // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator` - // is a property of the `IteratorResource` (which this class does not know - // about), and (ii) it can change after the `IteratorContext` has been - // created. Better suggestions are welcome! - std::function()> stats_aggregator_getter = - nullptr; - - // The FunctionLibraryRuntime object to be used to make function calls. 
- FunctionLibraryRuntime* lib = nullptr; - std::shared_ptr function_library = nullptr; - }; - - explicit IteratorContext(Params params) : params_(std::move(params)) {} - - Env* env() const { return params_.env; } - - std::function)>* runner() { - return ¶ms_.runner; - } - - std::shared_ptr stats_aggregator() { - if (params_.stats_aggregator_getter) { - return params_.stats_aggregator_getter(); - } else { - return nullptr; - } - } - - std::shared_ptr function_library() { - return params_.function_library; - } - - FunctionLibraryRuntime* lib() { return params_.lib; } - - void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; } - - private: - Params params_; -}; - -// Represents the current position in a range of outputs, where the -// range of outputs is typically represented by an `DatasetBase`, -// defined below. -class IteratorBase { - public: - virtual ~IteratorBase() {} - - // Gets the next output from the range that this iterator is traversing. - // - // If at least one output remains in this iterator's range, that - // output will be stored in `*out_tensors` and `false` will be - // stored in `*end_of_sequence`. - // - // If no more outputs remain in this iterator's range, `true` will - // be stored in `*end_of_sequence`, and the content of - // `*out_tensors` will be undefined. - // - // This method is thread-safe. - // - // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and - // potentially remove this method. - virtual Status GetNext(IteratorContext* ctx, std::vector* out_tensors, - bool* end_of_sequence) = 0; - - // Returns a vector of DataType values, representing the respective - // element types of each tuple component in the outputs of this - // iterator. - virtual const DataTypeVector& output_dtypes() const = 0; - - // Returns a vector of tensor shapes, representing the respective - // (and possibly partially defined) shapes of each tuple component - // in the outputs of this iterator. - virtual const std::vector& output_shapes() const = 0; - - // Saves the state of this iterator. - virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) { - return SaveInternal(writer); - } - - // Restores the state of this iterator. - virtual Status Restore(IteratorContext* ctx, IteratorStateReader* reader) { - return RestoreInternal(ctx, reader); - } - - protected: - // This is needed so that sub-classes of IteratorBase can call - // `SaveInternal` on their parent iterators, e.g., in - // `RepeatDataasetOp::Dataset`. - Status SaveParent(IteratorStateWriter* writer, - const std::unique_ptr& parent) { - return parent->SaveInternal(writer); - } - - // This is needed so that sub-classes of IteratorBase can call - // `RestoreInternal` on their parent iterators, e.g., in - // `RepeatDataasetOp::Dataset`. - Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader, - const std::unique_ptr& parent) { - return parent->RestoreInternal(ctx, reader); - } - - // Saves the state of this iterator recursively. - virtual Status SaveInternal(IteratorStateWriter* writer) { - return errors::Unimplemented("SaveInternal"); - } - - // Restores the state of this iterator recursively. - virtual Status RestoreInternal(IteratorContext* ctx, - IteratorStateReader* reader) { - return errors::Unimplemented("RestoreInternal"); - } -}; - -// Represents a (potentially infinite) range of outputs, where each -// output is a tuple of tensors. 
-class DatasetBase : public core::RefCounted { - public: - // Returns a new iterator for iterating over the range of elements in - // this dataset. - // - // This method may be called multiple times on the same instance, - // and the resulting iterators will have distinct state. Each - // iterator will traverse all elements in this dataset from the - // start. - // - // Ownership of the created iterator will be transferred to the caller. - // - // The prefix identifies the sequence of iterators leading up to the newly - // created iterator. - virtual std::unique_ptr MakeIterator( - const string& prefix) const = 0; - - // Returns a vector of DataType values, representing the respective - // element types of each tuple component in the outputs of this - // dataset. - virtual const DataTypeVector& output_dtypes() const = 0; - - // Returns a vector of tensor shapes, representing the respective - // (and possibly partially defined) shapes of each tuple component - // in the outputs of this dataset. - virtual const std::vector& output_shapes() const = 0; - - // A human-readable debug string for this dataset. - virtual string DebugString() = 0; - - // Serializes the dataset and writes it to the `writer`. - virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const { - return errors::Unimplemented("DatasetBase::Save"); - } - - protected: - // TODO(srbs): Ideally all graph related logic should reside in - // GraphDatasetBase. However, that would require Datasets defined in all ops - // to derive from GraphDatasetBase. Once that is done we can move - // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase. - class DatasetGraphDefBuilder : public GraphDefBuilderWrapper { - public: - DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {} - Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset, - Node** output) { - return dataset->AsGraphDefInternal(ctx, this, output); - } - }; - - virtual Status AsGraphDefInternal(OpKernelContext* ctx, - DatasetGraphDefBuilder* b, - Node** node) const { - return AsGraphDefInternal(b, node); - } - - virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b, - Node** node) const { - return errors::Unimplemented("AsGraphDefInternal"); - } -}; - -// Base-class for datasets that are built by ops. -class GraphDatasetBase : public DatasetBase { - public: - GraphDatasetBase(OpKernelContext* ctx) - : op_name_(ctx->op_kernel().type_string()) {} - - const string op_name() const { return op_name_; } - - Status Save(OpKernelContext* ctx, - IteratorStateWriter* writer) const override { - string serialized_graph_def; - string output_node; - TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node)); - TF_RETURN_IF_ERROR( - writer->WriteScalar(kDatasetGraphKey, serialized_graph_def)); - TF_RETURN_IF_ERROR( - writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node)); - return Status::OK(); - } - - // Key for storing the Dataset graph in the serialized format. - static const char kDatasetGraphKey[]; - - // Key for storing the output node of the Dataset graph in the serialized - // format. - static const char kDatasetGraphOutputNodeKey[]; - - private: - Status Serialize(OpKernelContext* ctx, string* serialized_graph_def, - string* output_node) const; - - const string op_name_; -}; - -// Represents an iterator that is associated with a particular parent dataset. -template -class DatasetIterator : public IteratorBase { - public: - struct Params { - // Owns one reference on the shared dataset resource. 
- const DatasetType* dataset; - - // Identifies the sequence of iterators leading up to this iterator. - const string prefix; - }; - - explicit DatasetIterator(const Params& params) : params_(params) { - params_.dataset->Ref(); - } - - ~DatasetIterator() override { params_.dataset->Unref(); } - - // The dataset from which this iterator was created. - const DatasetType* dataset() const { return params_.dataset; } - - // The sequence of iterators leading up to this iterator. - const string prefix() const { return params_.prefix; } - - const DataTypeVector& output_dtypes() const override { - return params_.dataset->output_dtypes(); - } - - const std::vector& output_shapes() const override { - return params_.dataset->output_shapes(); - } - - Status GetNext(IteratorContext* ctx, std::vector* out_tensors, - bool* end_of_sequence) final { - port::Tracing::TraceMe activity(params_.prefix); - Status s = GetNextInternal(ctx, out_tensors, end_of_sequence); - if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) { - s = errors::Internal( - "Iterator \"", params_.prefix, - "\" returned OutOfRange without setting `*end_of_sequence`. This " - "indicates that an error may have occurred. Original message: ", - s.error_message()); - LOG(ERROR) << s; - } - return s; - } - - Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final { - TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer)); - return IteratorBase::Save(ctx, writer); - } - - protected: - // Internal implementation of GetNext that is wrapped in tracing logic. - virtual Status GetNextInternal(IteratorContext* ctx, - std::vector* out_tensors, - bool* end_of_sequence) = 0; - - string full_name(const string& name) const { - return strings::StrCat(prefix(), ":", name); - } - - private: - Params params_; -}; - -// Encapsulates the work required to plug a DatasetBase into the core TensorFlow -// graph execution engine. -class DatasetOpKernel : public OpKernel { - public: - DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {} - void Compute(OpKernelContext* ctx) final; - - protected: - // Subclasses should implement this method. It will be called during Compute - // execution. - virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0; - - template - Status ParseScalarArgument(OpKernelContext* ctx, - const StringPiece& argument_name, T* output) { - const Tensor* argument_t; - TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t)); - if (!TensorShapeUtils::IsScalar(argument_t->shape())) { - return errors::InvalidArgument(argument_name, " must be a scalar"); - } - *output = argument_t->scalar()(); - return Status::OK(); - } -}; - -// Encapsulates the work required to plug unary Datasets into the core -// TensorFlow graph execution engine. -class UnaryDatasetOpKernel : public DatasetOpKernel { - public: - UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {} - - protected: - void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; - virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, - DatasetBase** output) = 0; -}; - -// Encapsulates the work required to plug binary Datasets into the core -// TensorFlow graph execution engine. 
-class BinaryDatasetOpKernel : public DatasetOpKernel { - public: - BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {} - - protected: - void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final; - virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input, - DatasetBase* another_input, - DatasetBase** output) = 0; -}; - -// Validates and extracts a `DatasetBase` object from `tensor`. -// -// `tensor` must have been written by a call to SetVariantTensorToDataset(). -// -// The retrieved pointer is a borrowed reference to the dataset, which is owned -// by the tensor. The consumer must either acquire its own reference to the -// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not -// destroyed or mutated while the retrieved pointer is in use. -Status GetDatasetFromVariantTensor(const Tensor& tensor, - DatasetBase** out_dataset); - -// Stores a `DatasetBase` object in `tensor`. -// -// The ownership of `dataset` is transferred to `tensor`. -Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor); - -} // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_ diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc index e7224bb547f..132808a5f14 100644 --- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc @@ -155,7 +155,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { // Determine the size of the output tensors: // * dense_shape will be [`row_shape + 1`]. - Tensor dense_shape(cpu_allocator(), DT_INT64, {row_ndims + 1}); + Tensor dense_shape(ctx->allocator({}), DT_INT64, {row_ndims + 1}); auto dense_shape_vec = dense_shape.vec(); for (size_t i = 0; i < row_ndims; ++i) { if (row_shape.dim_size(i) == -1) { @@ -215,10 +215,10 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel { // * indices will be [`total_elements`, `row_shape + 1`]. // * values will be [`total_elements`]. - Tensor indices(cpu_allocator(), DT_INT64, + Tensor indices(ctx->allocator({}), DT_INT64, {total_elements, row_ndims + 1}); Tensor values( - cpu_allocator(), + ctx->allocator({}), DatasetIterator>::dataset()->input_->output_dtypes()[0], {total_elements}); auto indices_matrix = indices.matrix(); diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc index eb047e10ecf..834c06bb930 100644 --- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/core/lib/random/random.h" namespace tensorflow { - namespace { // See documentation in ../ops/dataset_ops.cc for a high-level @@ -510,10 +509,6 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } - // A resource name for the temporary window dataset that is - // created as the input to the reduce function. 
- static constexpr const char* kWindowResourceName = "__window_dataset"; - const DatasetBase* const input_; const NameAttrList key_func_; const NameAttrList reduce_func_; @@ -537,5 +532,4 @@ REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU), GroupByWindowDatasetOp); } // namespace - } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 56044a3d41a..dd5f4a4554b 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_runner.h" +#include "tensorflow/core/common_runtime/renamed_device.h" #include "tensorflow/core/common_runtime/threadpool_device.h" #include "tensorflow/core/framework/iterator.pb.h" #include "tensorflow/core/framework/partial_tensor_shape.h" @@ -82,7 +83,7 @@ class IteratorResource : public ResourceBase { public: IteratorResource(const DataTypeVector& output_dtypes, const std::vector& output_shapes, - const int graph_def_version, + const int /*unused: graph_def_version*/, std::unique_ptr device_mgr, std::unique_ptr flib_def, std::unique_ptr pflr, @@ -93,8 +94,7 @@ class IteratorResource : public ResourceBase { lib_(lib), iterator_(nullptr), output_dtypes_(output_dtypes), - output_shapes_(output_shapes), - graph_def_version_(graph_def_version) {} + output_shapes_(output_shapes) {} Status GetNext(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) { @@ -223,7 +223,6 @@ class IteratorResource : public ResourceBase { std::shared_ptr lib_def_ GUARDED_BY(mu_); const DataTypeVector output_dtypes_; const std::vector output_shapes_; - const int graph_def_version_; }; // Helper class for reading data from a VariantTensorData object. @@ -430,13 +429,10 @@ class IteratorStateVariant { REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant, kIteratorVariantTypeName); -// TODO(mrry): Can we simply use the template kernel here? class IteratorHandleOp : public OpKernel { public: explicit IteratorHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) { - OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_STRING, TensorShape({2}), - &handle_, nullptr)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_)); @@ -460,56 +456,51 @@ class IteratorHandleOp : public OpKernel { } void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) { - mutex_lock l(mu_); - FunctionLibraryRuntime* lib = context->function_library(); - std::unique_ptr device_mgr(nullptr); - std::unique_ptr flib_def(nullptr); - std::unique_ptr pflr(nullptr); - // If the iterator is shared then we construct a new FLR, and pass that in. - // NOTE(mrry,rohanj): In this case it is not possible to call remote - // functions from the iterator. We may add this functionality if there - // is sufficient demand, but it will require a significant refactoring. 
- if (!name_.empty()) { - lib = CreateFLR(context, &device_mgr, &flib_def, &pflr); - } + { + mutex_lock l(mu_); + if (resource_ == nullptr) { + FunctionLibraryRuntime* lib = context->function_library(); + std::unique_ptr device_mgr(nullptr); + std::unique_ptr flib_def(nullptr); + std::unique_ptr pflr(nullptr); + // If the iterator is shared then we construct a new FLR, and pass that + // in. NOTE(mrry,rohanj): In this case it is not possible to call remote + // functions from the iterator. We may add this functionality if there + // is sufficient demand, but it will require a significant refactoring. + if (!name_.empty()) { + lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr); + } - if (resource_ == nullptr) { - ResourceMgr* mgr = context->resource_manager(); - OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); + ResourceMgr* mgr = context->resource_manager(); + OP_REQUIRES_OK(context, cinfo_.Init(mgr, def())); - IteratorResource* resource; - OP_REQUIRES_OK( - context, - mgr->LookupOrCreate( - cinfo_.container(), cinfo_.name(), &resource, - [lib, &device_mgr, &flib_def, &pflr, this](IteratorResource** ret) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - *ret = new IteratorResource( - output_dtypes_, output_shapes_, graph_def_version_, - std::move(device_mgr), std::move(flib_def), - std::move(pflr), lib); - return Status::OK(); - })); + IteratorResource* resource; + OP_REQUIRES_OK( + context, + mgr->LookupOrCreate( + cinfo_.container(), cinfo_.name(), &resource, + [lib, &device_mgr, &flib_def, &pflr, + this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + *ret = new IteratorResource( + output_dtypes_, output_shapes_, graph_def_version_, + std::move(device_mgr), std::move(flib_def), + std::move(pflr), lib); + return Status::OK(); + })); - Status s = VerifyResource(resource); - if (TF_PREDICT_FALSE(!s.ok())) { - resource->Unref(); - context->SetStatus(s); - return; + Status s = VerifyResource(resource); + if (TF_PREDICT_FALSE(!s.ok())) { + resource->Unref(); + context->SetStatus(s); + return; + } + + resource_ = resource; } - - auto h = handle_.AccessTensor(context)->template flat(); - h(0) = cinfo_.container(); - h(1) = cinfo_.name(); - resource_ = resource; - } - if (context->expected_output_dtype(0) == DT_RESOURCE) { - OP_REQUIRES_OK(context, MakeResourceHandleToOutput( - context, 0, cinfo_.container(), cinfo_.name(), - MakeTypeIndex())); - } else { - context->set_output_ref(0, &mu_, handle_.AccessTensor(context)); } + OP_REQUIRES_OK(context, MakeResourceHandleToOutput( + context, 0, cinfo_.container(), cinfo_.name(), + MakeTypeIndex())); } private: @@ -526,15 +517,32 @@ class IteratorHandleOp : public OpKernel { return Status::OK(); } - FunctionLibraryRuntime* CreateFLR( + template // use like this: down_cast(foo); + static inline To down_cast(From* f) { // so we only accept pointers + static_assert( + (std::is_base_of::type>::value), + "target type not derived from source type"); + + // We skip the assert and hence the dynamic_cast if RTTI is disabled. +#if !defined(__GNUC__) || defined(__GXX_RTTI) + // Uses RTTI in dbg and fastbuild. asserts are disabled in opt builds. 
+ assert(f == nullptr || dynamic_cast(f) != nullptr); +#endif // !defined(__GNUC__) || defined(__GXX_RTTI) + return static_cast(f); + } + + FunctionLibraryRuntime* CreatePrivateFLR( OpKernelContext* ctx, std::unique_ptr* device_mgr, std::unique_ptr* flib_def, std::unique_ptr* pflr) { - Device* device = new ThreadPoolDevice( - SessionOptions(), ctx->device()->attributes().name(), Bytes(256 << 20), - DeviceLocality(), cpu_allocator()); - - device_mgr->reset(new DeviceMgr({device})); + // Wrap the existing device in order to see any captured resources + // in its resource manager. The existing device will outlive the + // IteratorResource, because we are storing the IteratorResource + // in that device's resource manager. + Device* wrapped_device = RenamedDevice::NewRenamedDevice( + ctx->device()->name(), down_cast(ctx->device()), + false /* owns_underlying */, false /* isolate_session_state */); + device_mgr->reset(new DeviceMgr({wrapped_device})); flib_def->reset(new FunctionLibraryDefinition( *ctx->function_library()->GetFunctionLibraryDefinition())); pflr->reset(new ProcessFunctionLibraryRuntime( @@ -542,13 +550,12 @@ class IteratorHandleOp : public OpKernel { {} /* TODO(mrry): OptimizerOptions? */, nullptr /* TODO(mrry): ClusterFLR */)); - return (*pflr)->GetFLR(device->name()); + return (*pflr)->GetFLR(ctx->device()->name()); } mutex mu_; - ContainerInfo cinfo_ GUARDED_BY(mu_); + ContainerInfo cinfo_; // Written once under mu_ then constant afterwards. IteratorResource* resource_ GUARDED_BY(mu_) = nullptr; - PersistentTensor handle_ GUARDED_BY(mu_); DataTypeVector output_dtypes_; std::vector output_shapes_; const int graph_def_version_; diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc index c529f671f2b..9ce263732f6 100644 --- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc @@ -183,7 +183,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { TensorShape component_shape( batch_results_[current_batch_index_].output[i].shape()); component_shape.set_dim(0, num_elements); - Tensor component(cpu_allocator(), output[i].dtype(), + Tensor component(ctx->allocator({}), output[i].dtype(), component_shape); TF_RETURN_IF_ERROR( CopyPartialBatch(&component, output[i], num_elements)); @@ -244,7 +244,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { return Status::OK(); } - void EnsureOutputAllocated(BatchResult* batch_result, + void EnsureOutputAllocated(IteratorContext* ctx, + BatchResult* batch_result, const std::vector& return_values) { mutex_lock l(batch_result->mu); if (batch_result->output_allocated) { @@ -254,7 +255,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { for (size_t i = 0; i < num_components; ++i) { TensorShape component_shape({dataset()->batch_size_}); component_shape.AppendShape(return_values[i].shape()); - Tensor component(cpu_allocator(), return_values[i].dtype(), + Tensor component(ctx->allocator({}), return_values[i].dtype(), component_shape); batch_result->output.emplace_back(std::move(component)); } @@ -285,10 +286,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { dataset()->captured_func_->RunAsync( ctx, std::move(input_element), &result->return_values, [this, ctx, result, batch_result, offset](Status ret_status) { - delete ctx; result->status.Update(ret_status); if (ret_status.ok()) { - EnsureOutputAllocated(batch_result, + EnsureOutputAllocated(ctx, batch_result,
result->return_values); const size_t num_components = result->return_values.size(); @@ -318,6 +318,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel { } } } + delete ctx; // NOTE(mrry): We clear the return values here to release // any memory associated with them and to paralellize the // destruction of the tensors (which can be surprisingly diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc index 346eca0bb2a..cfb4efda9a5 100644 --- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc +++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/kernels/batch_util.h" #include "tensorflow/core/kernels/data/dataset.h" namespace tensorflow { @@ -24,102 +25,6 @@ namespace { // See documentation in ../ops/dataset_ops.cc for a high-level // description of the following op. -// The following five functions are copied from padding_fifo_queue.cc. -// TODO(mrry): Reconcile these functions with the similar methods in the -// queue implementation. -Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) { - DCHECK_NE(parent->dim_size(0), 0); - if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) { - TensorShape chip_shape = parent->shape(); - chip_shape.RemoveDim(0); - return errors::Internal( - "HandleElementToLargerSlice Cannot copy slice: number of entries in " - "element is greater than number of elements in parent slice. ", - "Shapes are: [element]: ", element.shape().DebugString(), - ", [parent slice]: ", chip_shape.DebugString()); - } - return Status::OK(); -} - -template -Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent, - int index) { - TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent)); - if (element.NumElements() == 0) { - return Status::OK(); - } - auto element_t = element.tensor(); - auto parent_t = parent->tensor(); - Eigen::DSizes slice_indices; - slice_indices[0] = index; - Eigen::DSizes slice_size; - slice_size[0] = 1; - for (size_t i = 1; i < slice_size.size(); ++i) { - slice_size[i] = element_t.dimension(i - 1); - } - parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size); - return Status::OK(); -} - -template -Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent, - int index) { -#define HANDLE_TYPE(T) \ - case DataTypeToEnum::value: { \ - return HandleElementToLargerSlice(element, parent, index); \ - } - - switch (element.dtype()) { - TF_CALL_DATASET_TYPES(HANDLE_TYPE); -#undef HANDLE_TYPE - default: - return errors::Unimplemented( - "HandleElementToLargerSliceWithRank Unhandled data type: ", - element.dtype()); - } -} - -Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent, - int index) { - if (parent->dims() != element.dims() + 1) { - return errors::Internal( - "Mismatched ranks. 
Element's rank is: ", element.dims(), - " but element is meant to be a slice in output Tensor having rank: ", - parent->dims(), " (should be: ", element.dims() + 1, ")"); - } - -#define HANDLE_DIMS(NDIMS) \ - case NDIMS: { \ - TF_RETURN_IF_ERROR( \ - HandleElementToLargerSliceWithRank(element, parent, index)); \ - return Status::OK(); \ - } - - switch (element.dims()) { - HANDLE_DIMS(0); - HANDLE_DIMS(1); - HANDLE_DIMS(2); - HANDLE_DIMS(3); - HANDLE_DIMS(4); -#undef HANDLE_DIMS - default: - return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ", - element.dims()); - } -} - -Status SetElementZero(Tensor* element, const Tensor& padding) { -#define HANDLE_TYPE(T) \ - if (element->dtype() == DataTypeToEnum::value) { \ - element->flat().setConstant(padding.scalar()()); \ - return Status::OK(); \ - } - TF_CALL_DATASET_TYPES(HANDLE_TYPE); -#undef HANDLE_TYPE - return errors::Unimplemented("SetElementZero Unhandled data type: ", - element->dtype()); -} - class PaddedBatchDatasetOp : public UnaryDatasetOpKernel { public: explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx) @@ -376,20 +281,27 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel { // 2. Copy each batch element to the appropriate location in // the output component tensor. - Tensor batch_component(cpu_allocator(), + Tensor batch_component(ctx->allocator({}), output_dtypes()[component_index], batch_component_shape); - TF_RETURN_IF_ERROR(SetElementZero( + TF_RETURN_IF_ERROR(batch_util::SetElementZero( &batch_component, dataset()->padding_values_[component_index])); // Build the output tuple component by copying one slice // from each input element in the batch. + TensorShape component_shape({}); + for (int i = 1; i < batch_component_shape.dims(); ++i) { + component_shape.AddDim(batch_component_shape.dim_size(i)); + } for (int64 i = 0; i < num_batch_elements; ++i) { - TF_RETURN_IF_ERROR(ValidateElementToLargerSlice( - batch_elements[i][component_index], &batch_component)); - - TF_RETURN_IF_ERROR(CopyElementToLargerSlice( - batch_elements[i][component_index], &batch_component, i)); + // Take the fast path if possible. 
+ if (batch_elements[i][component_index].shape() == component_shape) { + TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice( + batch_elements[i][component_index], &batch_component, i)); + } else { + TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice( + batch_elements[i][component_index], &batch_component, i)); + } } out_tensors->push_back(std::move(batch_component)); } diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc index bc638864b01..210b9ad1b84 100644 --- a/tensorflow/core/kernels/data/random_dataset_op.cc +++ b/tensorflow/core/kernels/data/random_dataset_op.cc @@ -99,7 +99,7 @@ class RandomDatasetOp : public DatasetOpKernel { std::vector* out_tensors, bool* end_of_sequence) override { mutex_lock l(mu_); - Tensor value_tensor(cpu_allocator(), DT_INT64, {}); + Tensor value_tensor(ctx->allocator({}), DT_INT64, {}); value_tensor.scalar()() = Random(); out_tensors->emplace_back(std::move(value_tensor)); *end_of_sequence = false; diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc index d0bc61acd99..b57518e678e 100644 --- a/tensorflow/core/kernels/data/range_dataset_op.cc +++ b/tensorflow/core/kernels/data/range_dataset_op.cc @@ -100,7 +100,7 @@ class RangeDatasetOp : public DatasetOpKernel { *end_of_sequence = true; return Status::OK(); } - Tensor value_tensor(cpu_allocator(), DT_INT64, {}); + Tensor value_tensor(ctx->allocator({}), DT_INT64, {}); value_tensor.scalar()() = next_; out_tensors->emplace_back(std::move(value_tensor)); *end_of_sequence = false; diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc index aa39fffc2e3..34d7d9f914d 100644 --- a/tensorflow/core/kernels/data/reader_dataset_ops.cc +++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc @@ -141,7 +141,7 @@ class TextLineDatasetOp : public DatasetOpKernel { if (s.ok()) { // Produce the line as output. - Tensor line_tensor(cpu_allocator(), DT_STRING, {}); + Tensor line_tensor(ctx->allocator({}), DT_STRING, {}); line_tensor.scalar()() = line_contents; out_tensors->emplace_back(std::move(line_tensor)); *end_of_sequence = false; @@ -384,7 +384,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel { TF_RETURN_IF_ERROR( input_buffer_->ReadNBytes(dataset()->record_bytes_, &record)); // Produce the record as output. - Tensor record_tensor(cpu_allocator(), DT_STRING, {}); + Tensor record_tensor(ctx->allocator({}), DT_STRING, {}); record_tensor.scalar()() = record; out_tensors->emplace_back(std::move(record_tensor)); *end_of_sequence = false; @@ -589,7 +589,7 @@ class TFRecordDatasetOp : public DatasetOpKernel { do { // We are currently processing a file, so try to read the next record. if (reader_) { - Tensor result_tensor(cpu_allocator(), DT_STRING, {}); + Tensor result_tensor(ctx->allocator({}), DT_STRING, {}); Status s = reader_->ReadRecord(&result_tensor.scalar()()); if (s.ok()) { out_tensors->emplace_back(std::move(result_tensor)); diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc index 13c2501bbbd..d636c37afe2 100644 --- a/tensorflow/core/kernels/data/skip_dataset_op.cc +++ b/tensorflow/core/kernels/data/skip_dataset_op.cc @@ -128,8 +128,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel { while (i_ < dataset()->count_) { // Fetch and throw away Tensors. 
std::vector dummy_out_tensors; - TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &dummy_out_tensors, - end_of_sequence)); + TF_RETURN_IF_ERROR( + input_impl_->GetNext(ctx, &dummy_out_tensors, end_of_sequence)); if (*end_of_sequence) { // We reached the end before the count was reached. input_impl_.reset(); @@ -140,8 +140,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel { } // Return GetNext() on the underlying iterator. - TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors, - end_of_sequence)); + TF_RETURN_IF_ERROR( + input_impl_->GetNext(ctx, out_tensors, end_of_sequence)); if (*end_of_sequence) { input_impl_.reset(); } @@ -184,8 +184,7 @@ class SkipDatasetOp : public UnaryDatasetOpKernel { }; }; -REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU), - SkipDatasetOp); +REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU), SkipDatasetOp); } // namespace diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD index 0286825af3e..f4698bdaf7a 100644 --- a/tensorflow/core/kernels/data/sql/BUILD +++ b/tensorflow/core/kernels/data/sql/BUILD @@ -33,6 +33,7 @@ cc_library( deps = [ "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/kernels/data:dataset", "//tensorflow/core/lib/db:sqlite", ], ) diff --git a/tensorflow/core/kernels/data/sql/query_connection.h b/tensorflow/core/kernels/data/sql/query_connection.h index f31017bd198..e9ffca202ff 100644 --- a/tensorflow/core/kernels/data/sql/query_connection.h +++ b/tensorflow/core/kernels/data/sql/query_connection.h @@ -19,6 +19,8 @@ limitations under the License. namespace tensorflow { +class IteratorContext; + namespace sql { // This interface allows a user to connect to a database, execute a query, and // iterate over the result set, putting the results into an output tensor. @@ -56,7 +58,7 @@ class QueryConnection { // If there are no more rows in the result set, then instead `true` will be // stored in `*end_of_sequence`, and the content of `*out_tensors` will be // undefined. - virtual Status GetNext(std::vector* out_tensors, + virtual Status GetNext(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) = 0; }; diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc index 029a0aab972..7cd07bd8eca 100644 --- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc +++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/data/dataset.h" #include "tensorflow/core/lib/strings/stringprintf.h" namespace tensorflow { @@ -48,14 +49,16 @@ Status SqliteQueryConnection::Close() { return Status::OK(); } -Status SqliteQueryConnection::GetNext(std::vector* out_tensors, +Status SqliteQueryConnection::GetNext(IteratorContext* ctx, + std::vector* out_tensors, bool* end_of_sequence) { if (!stmt_) TF_RETURN_IF_ERROR(PrepareQuery()); TF_RETURN_IF_ERROR(stmt_.Step(end_of_sequence)); if (!*end_of_sequence) { for (int i = 0; i < column_count_; i++) { DataType dt = output_types_[i]; - Tensor tensor(cpu_allocator(), dt, {}); + // TODO(mrry): Pass in the `IteratorContext::allocator()`. 
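// Throughout this patch, per-element output tensors switch from the global
// cpu_allocator() to the allocator carried by the per-call context
// (ctx->allocator({})), which is why GetNext now receives an IteratorContext*.
// A minimal standalone sketch of that pattern with placeholder types (not the
// real TensorFlow classes); the caller owns the returned buffer:
#include <cstddef>
#include <cstdlib>

struct Allocator {                      // Placeholder allocator interface.
  virtual ~Allocator() = default;
  virtual void* Allocate(std::size_t bytes) = 0;
};

struct MallocAllocator : Allocator {
  void* Allocate(std::size_t bytes) override { return std::malloc(bytes); }
};

struct Context {                        // Stand-in for IteratorContext.
  Allocator* allocator = nullptr;
};

// GetNext-style call: allocation is routed through the caller-supplied
// context instead of a process-wide default.
void* ProduceNextBuffer(Context* ctx, std::size_t bytes) {
  return ctx->allocator->Allocate(bytes);
}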
+ Tensor tensor(ctx->allocator({}), dt, {}); FillTensorWithResultSetEntry(dt, i, &tensor); out_tensors->emplace_back(std::move(tensor)); } diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h index 787c17d6c00..81b19530b7d 100644 --- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h +++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h @@ -32,7 +32,7 @@ class SqliteQueryConnection : public QueryConnection { Status Open(const string& data_source_name, const string& query, const DataTypeVector& output_types) override; Status Close() override; - Status GetNext(std::vector* out_tensors, + Status GetNext(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) override; private: diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc index 72302190802..d50e9c9cf97 100644 --- a/tensorflow/core/kernels/data/sql_dataset_ops.cc +++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc @@ -116,7 +116,7 @@ class SqlDatasetOp : public DatasetOpKernel { } } - Status GetNextInternal(IteratorContext* /*ctx*/, + Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, bool* end_of_sequence) override { mutex_lock l(mu_); @@ -132,7 +132,7 @@ class SqlDatasetOp : public DatasetOpKernel { return s; } } - return query_connection_->GetNext(out_tensors, end_of_sequence); + return query_connection_->GetNext(ctx, out_tensors, end_of_sequence); } private: diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc new file mode 100644 index 00000000000..ff412a4671b --- /dev/null +++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc @@ -0,0 +1,646 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/core/framework/partial_tensor_shape.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/kernels/batch_util.h" +#include "tensorflow/core/kernels/data/dataset.h" + +namespace tensorflow { + +namespace { + +bool IsGreaterEqualToOrCompatibleWith(const PartialTensorShape& a, + const PartialTensorShape& b) { + // Returns true if dims[a] >= dims[b], or are compatible. + if (a.unknown_rank()) return true; + if (a.dims() != b.dims()) return false; + for (int d = 0; d < a.dims(); ++d) { + if (a.dim_size(d) == -1 || b.dim_size(d) == -1) continue; + if (a.dim_size(d) < b.dim_size(d)) return false; + } + return true; +} + +DataTypeVector PrependQueueType(const DataTypeVector& dtypes) { + DataTypeVector out; + out.reserve(dtypes.size() + 1); + out.push_back(DT_VARIANT); // The queue component. 
+ for (const DataType& d : dtypes) out.push_back(d); + return out; +} + +std::vector PrependQueueShapeWithBatch( + const std::vector& shapes) { + std::vector out; + out.reserve(shapes.size() + 1); + out.emplace_back(PartialTensorShape({-1})); // The queue component. + for (PartialTensorShape s : shapes) { + s.InsertDim(0, -1); // Unknown batch size. + out.push_back(std::move(s)); + } + return out; +} + +class EnqueueInQueueDatasetOp; + +class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase { + public: + PrependFromQueueAndPaddedBatchDataset( + OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input, + const DataTypeVector& dtypes, + const std::vector& shapes, + std::vector padding_values) + : GraphDatasetBase(ctx), + batch_size_(batch_size), + input_(input), + dtypes_(dtypes), + shapes_(shapes), + padding_values_(std::move(padding_values)), + dtypes_with_queue_(PrependQueueType(dtypes)), + batched_shapes_with_queue_(PrependQueueShapeWithBatch(shapes)) { + input_->Ref(); + } + + ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); } + + std::unique_ptr MakeIterator( + const string& prefix) const override { + return std::unique_ptr(new Iterator( + {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")})); + } + + const DataTypeVector& output_dtypes() const override { + return dtypes_with_queue_; + } + const std::vector& output_shapes() const override { + return batched_shapes_with_queue_; + } + + string DebugString() override { + return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset"; + } + + protected: + Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, + Node** output) const override { + Node* input_graph = nullptr; + TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph)); + Node* batch_size = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size)); + + std::vector padded_shapes; + padded_shapes.reserve(shapes_.size()); + for (int i = 0; i < shapes_.size(); i++) { + Node* node; + Tensor t(DT_INT64, TensorShape({shapes_[i].dims()})); + for (int j = 0; j < shapes_[i].dims(); j++) { + t.vec()(j) = shapes_[i].dim_size(j); + } + TF_RETURN_IF_ERROR(b->AddTensor(t, &node)); + padded_shapes.emplace_back(node); + } + + std::vector padding_values; + padding_values.reserve(padding_values_.size()); + for (const Tensor& t : padding_values_) { + Node* node; + TF_RETURN_IF_ERROR(b->AddTensor(t, &node)); + padding_values.emplace_back(node); + } + + AttrValue output_types; + b->BuildAttrValue(dtypes_, &output_types); + + AttrValue output_shapes; + b->BuildAttrValue(batched_shapes_with_queue_, &output_shapes); + + AttrValue N; + b->BuildAttrValue(shapes_.size(), &N); + + TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, input_graph}, {1, batch_size}}, + {{2, padded_shapes}, {3, padding_values}}, + {{"Toutput_types", output_types}, + {"output_shapes", output_shapes}, + {"N", N}}, + output)); + + return Status::OK(); + } + + private: + friend class EnqueueInQueueDatasetOp; + + class Iterator + : public DatasetIterator { + public: + explicit Iterator(const Params& params) + : DatasetIterator(params), + queue_(new TensorQueue(/*input_impl*/ + params.dataset->input_->MakeIterator( + params.prefix), + params.dataset->dtypes_, + params.dataset->shapes_)) {} + + ~Iterator() override { queue_->Unref(); } + + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + std::vector> batch; + TF_RETURN_IF_ERROR(queue_->GetNext(ctx, dataset()->batch_size_, &batch, + 
end_of_sequence)); + const auto& dtypes = dataset()->dtypes_; + const auto& shapes = dataset()->shapes_; + const auto& input_shapes = dataset()->input_->output_shapes(); + const auto& padding_values = dataset()->padding_values_; + const int64 batch_size = batch.size(); + out_tensors->reserve(dtypes.size()); + + std::vector max_shapes; // Of non-queue components. + for (int i = 0; i < dtypes.size(); ++i) { + const PartialTensorShape& shape = shapes[i]; + TensorShape out_shape({batch_size}); + for (int r = 0; r < shape.dims(); ++r) { + if (shape.dim_size(r) >= 0) { + // padded_shape[r] is known. + out_shape.AddDim(shape.dim_size(r)); + } else { + // padded_shape[r] is unknown, find the maximum across + // the batch. + int64 dim = 0; + for (int b = 0; b < batch.size(); ++b) { + dim = std::max(dim, batch[b][i].dim_size(r)); + } + out_shape.AddDim(dim); + } + } + max_shapes.push_back(std::move(out_shape)); + } + + Tensor queues_t(cpu_allocator(), DT_VARIANT, TensorShape({batch_size})); + if (!batch.empty()) { + auto queues = queues_t.flat(); + Variant& queue_inserter = queues(0); + queue_inserter = TensorQueueInserter(); + queue_inserter.get()->set_queue(queue_); + for (int b = 1; b < batch.size(); ++b) { + // Copy the TensorQueueInserter. Each copy increments the + // Ref on the queue_. + queues(b) = queues(0); + } + } + out_tensors->push_back(std::move(queues_t)); + + for (int i = 0; i < max_shapes.size(); ++i) { + Tensor component(cpu_allocator(), dtypes[i], max_shapes[i]); + // Try hard to take the fast path. + if (shapes[i].IsFullyDefined() && + shapes[i].IsIdenticalTo(input_shapes[i])) { + // Take the fast path if we know all the shapes statically. + for (int64 b = 0; b < batch.size(); ++b) { + TF_RETURN_IF_ERROR( + batch_util::CopyElementToSlice(batch[b][i], &component, b)); + } + } else { + TF_RETURN_IF_ERROR( + batch_util::SetElementZero(&component, padding_values[i])); + for (int64 b = 0; b < batch.size(); ++b) { + if (batch[b][i].shape() == max_shapes[i]) { + TF_RETURN_IF_ERROR( + batch_util::CopyElementToSlice(batch[b][i], &component, b)); + } else { + TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice( + batch[b][i], &component, b)); + } + } + } + out_tensors->push_back(std::move(component)); + } + + // end_of_sequence was set before we populated out_tensors, so + // it's ok to return now. + return Status::OK(); + } + + protected: + // Work around bug in MSVC that disallows access to protected + // members of Iterator from within TensorQueue. + class TensorQueue; + friend class TensorQueue; + + class TensorQueue : public core::RefCounted { + public: + TensorQueue(std::unique_ptr input_impl, + const DataTypeVector& dtypes, + const std::vector& shapes) + : dtypes_(dtypes), + shapes_(shapes), + input_impl_(std::move(input_impl)) {} + + void MaybeWaitForNotificationLocked(mutex_lock* lock) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + // This essentially just releases the lock and immediately relocks. + cv_.wait_for(*lock, std::chrono::milliseconds(0)); + } + + void NotifyLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { cv_.notify_all(); } + + Status GetNext(IteratorContext* ctx, const int64 batch_size, + std::vector>* batch, + bool* end_of_sequence) { + mutex_lock lock(mu_); + + *end_of_sequence = false; + + for (int64 b = 0; b < batch_size;) { + if (!entries_.empty()) { + batch->push_back(std::move(entries_.front())); + entries_.pop_front(); + ++b; + continue; + } else { + if (input_impl_) { + // There's still input coming in. 
+ std::vector tensors; + bool input_end; + TF_RETURN_IF_ERROR( + input_impl_->GetNext(ctx, &tensors, &input_end)); + if (!input_end) { + batch->push_back(std::move(tensors)); + ++b; + continue; + } else { + input_impl_.reset(); + } + } + if (!input_impl_) { + // There's no more input coming in. + if (RefCountIsOne()) { + // No TensorQueueInserters in the wild. + if (batch->empty()) { + *end_of_sequence = true; + } + break; + } else { + MaybeWaitForNotificationLocked(&lock); + // If there's data available, try to add entries again. + // Otherwise return a smaller batch and hope the next + // iterator request has a non-empty or unused queue_. + if (entries_.empty()) { + break; + } + } + } + } + } // for (int64 b = ... batch_size) + return Status::OK(); + } + + Status Insert(const std::vector& tensors) { + if (tensors.size() != dtypes_.size()) { + return errors::InvalidArgument( + "TensorQueue::Insert: mismatched number of tensors. Queue " + "expects ", + dtypes_.size(), " tensors but tried to insert ", tensors.size()); + } + for (int i = 0; i < tensors.size(); ++i) { + if (tensors[i].dtype() != dtypes_[i]) { + return errors::InvalidArgument( + "TensorQueue::Insert: mismatched dtypes at component ", i, + ". Attempted " + "to insert tensor of type ", + DataTypeString(tensors[i].dtype()), + " but queue expected type: ", DataTypeString(dtypes_[i])); + } + if (!shapes_[i].IsCompatibleWith(tensors[i].shape())) { + return errors::InvalidArgument( + "TensorQueue::Insert: mismatched shapes at component ", i, + ". Attempted " + "to insert tensor with shape ", + tensors[i].shape().DebugString(), + " but queue expected shape: ", shapes_[i].DebugString()); + } + } + mutex_lock lock(mu_); + entries_.push_back(tensors); + NotifyLocked(); + return Status::OK(); + } + + Status Save(Iterator* iter, IteratorStateWriter* writer) { + mutex_lock lock(mu_); + if (input_impl_) { + TF_RETURN_IF_ERROR(iter->SaveParent(writer, input_impl_)); + } else { + TF_RETURN_IF_ERROR( + writer->WriteScalar(iter->full_name("input_exhausted"), "")); + } + TF_RETURN_IF_ERROR(writer->WriteScalar(iter->full_name("entries_size"), + entries_.size())); + for (int64 b = 0; b < entries_.size(); ++b) { + for (int i = 0; i < dtypes_.size(); ++i) { + TF_RETURN_IF_ERROR( + writer->WriteTensor(strings::StrCat(iter->full_name("entries"), + "[", b, "][", i, "]"), + entries_[b][i])); + } + } + return Status::OK(); + } + + Status Restore(Iterator* iter, IteratorContext* ctx, + IteratorStateReader* reader) { + mutex_lock l(mu_); + if (reader->Contains(iter->full_name("input_exhausted"))) { + input_impl_.reset(); + } else { + input_impl_ = iter->dataset_input()->MakeIterator(iter->prefix()); + TF_RETURN_IF_ERROR(iter->RestoreParent(ctx, reader, input_impl_)); + } + entries_.clear(); + int64 entries_size = -1; + TF_RETURN_IF_ERROR( + reader->ReadScalar(iter->full_name("entries_size"), &entries_size)); + if (entries_size < 0) { + return errors::DataLoss( + "Expected entries_size key '", iter->full_name("entries_size"), + "' to have nonnegative value, but saw: ", entries_size); + } + for (int64 b = 0; b < entries_size; ++b) { + std::vector entry; + for (int i = 0; i < dtypes_.size(); ++i) { + Tensor value; + TF_RETURN_IF_ERROR( + reader->ReadTensor(strings::StrCat(iter->full_name("entries"), + "[", b, "][", i, "]"), + &value)); + entry.push_back(std::move(value)); + } + entries_.push_back(std::move(entry)); + } + return Status::OK(); + } + + mutex* mu() { return &mu_; } + + private: + DataTypeVector dtypes_; + std::vector shapes_; + + mutex mu_; + 
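// A simplified, standalone model of the batching loop in TensorQueue::GetNext
// above: explicitly enqueued entries are consumed first, then the wrapped
// input iterator, and a short (possibly empty) batch marks end of sequence.
// The real implementation additionally waits on cv_ while live inserters may
// still add entries. Placeholder types; Element is assumed default-constructible.
#include <deque>
#include <functional>
#include <vector>

template <typename Element>
std::vector<Element> NextBatch(
    std::deque<Element>* queued,
    const std::function<bool(Element*)>& next_from_input, int batch_size,
    bool* end_of_sequence) {
  std::vector<Element> batch;
  while (static_cast<int>(batch.size()) < batch_size) {
    if (!queued->empty()) {                        // Prefer enqueued entries.
      batch.push_back(std::move(queued->front()));
      queued->pop_front();
      continue;
    }
    Element e;
    if (next_from_input && next_from_input(&e)) {  // Fall back to the input.
      batch.push_back(std::move(e));
      continue;
    }
    break;  // Both sources exhausted: return whatever was collected.
  }
  *end_of_sequence = batch.empty();
  return batch;
}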
std::unique_ptr input_impl_ GUARDED_BY(mu_); + std::deque> entries_ GUARDED_BY(mu_); + condition_variable cv_ GUARDED_BY(mu_); + }; + + const DatasetBase* dataset_input() const { return dataset()->input_; } + + Status SaveInternal(IteratorStateWriter* writer) override { + return queue_->Save(this, writer); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return queue_->Restore(this, ctx, reader); + } + + public: + class TensorQueueInserter { + public: + TensorQueueInserter() : queue_(nullptr) {} + + void set_queue(TensorQueue* queue) { + queue_ = queue; + queue_->Ref(); + } + + TensorQueueInserter(const TensorQueueInserter& rhs) { + queue_ = rhs.queue_; + queue_->Ref(); + }; + + TensorQueueInserter(TensorQueueInserter&& rhs) { + queue_ = rhs.queue_; + rhs.queue_ = nullptr; + } + + TensorQueueInserter& operator=(const TensorQueueInserter& rhs) = delete; + + string TypeName() const { return "tensorflow::TensorQueueInserter"; } + string DebugString() const { return TypeName(); } + + void Encode(VariantTensorData*) const {} + bool Decode(const VariantTensorData&) { return false; } + + ~TensorQueueInserter() { + if (queue_) { + mutex_lock lock(*queue_->mu()); + queue_->Unref(); + queue_->NotifyLocked(); + queue_ = nullptr; + } + } + + Status Insert(const std::vector& tensors) const { + CHECK(queue_); + return queue_->Insert(tensors); + } + + private: + mutable TensorQueue* queue_; + }; + + private: + TensorQueue* const queue_; + }; + + private: + const int64 batch_size_; + const DatasetBase* input_; + const DataTypeVector dtypes_; + const std::vector shapes_; + const std::vector padding_values_; + const DataTypeVector dtypes_with_queue_; + const std::vector batched_shapes_with_queue_; +}; + +class PrependFromQueueAndPaddedBatchDatasetOp : public UnaryDatasetOpKernel { + public: + explicit PrependFromQueueAndPaddedBatchDatasetOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_)); + } + + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override { + int64 batch_size = 0; + OP_REQUIRES_OK(ctx, + ParseScalarArgument(ctx, "batch_size", &batch_size)); + OP_REQUIRES( + ctx, batch_size > 0, + errors::InvalidArgument("Batch size must be greater than zero.")); + + OpInputList padded_shape_tensors; + OP_REQUIRES_OK(ctx, + ctx->input_list("padded_shapes", &padded_shape_tensors)); + std::vector padded_shapes; + padded_shapes.reserve(padded_shape_tensors.size()); + OP_REQUIRES(ctx, + padded_shape_tensors.size() == input->output_shapes().size(), + errors::InvalidArgument("Number of padded shapes (", + padded_shape_tensors.size(), + ") must match the number of components " + "in the input dataset's elements (", + input->output_shapes().size(), ")")); + for (const Tensor& padded_shape_t : padded_shape_tensors) { + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()), + errors::InvalidArgument("All padded shapes must be vectors")); + PartialTensorShape padded_shape; + OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape( + padded_shape_t.vec().data(), + padded_shape_t.NumElements(), &padded_shape)); + padded_shapes.push_back(std::move(padded_shape)); + } + + OP_REQUIRES( + ctx, input->output_dtypes() == output_types_, + errors::InvalidArgument("Input dataset and this dataset " + "have different output_types: ", + DataTypeVectorString(input->output_dtypes()), + " and ", DataTypeVectorString(output_types_))); + + for (int i = 0; i < 
input->output_shapes().size(); ++i) { + // Exclude the queue from the tensor_shapes calculation. + const PartialTensorShape& tensor_shape = padded_shapes[i]; + OP_REQUIRES( + ctx, + IsGreaterEqualToOrCompatibleWith(tensor_shape, + input->output_shapes()[i]), + errors::InvalidArgument("Incompatible input shapes at component ", i, + " between input dataset this dataset: ", + input->output_shapes()[i].DebugString(), + " vs. ", tensor_shape.DebugString())); + } + + OpInputList padding_values_list; + OP_REQUIRES_OK(ctx, + ctx->input_list("padding_values", &padding_values_list)); + std::vector padding_values; + OP_REQUIRES(ctx, + padding_values_list.size() == input->output_shapes().size(), + errors::InvalidArgument( + "Number of padding values (", padding_values_list.size(), + ") must match the number of components in the input " + "dataset's elements (", + input->output_shapes().size(), ")")); + for (int i = 0; i < padding_values_list.size(); ++i) { + const Tensor& padding_value_t = padding_values_list[i]; + OP_REQUIRES( + ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()), + errors::InvalidArgument( + "All padding values must be scalars; but at component ", i, + " saw shape: ", padding_value_t.shape().DebugString())); + OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i], + errors::InvalidArgument( + "Mismatched type between padding value ", i, + " and input dataset's component ", i, ": ", + DataTypeString(padding_value_t.dtype()), " vs. ", + DataTypeString(input->output_dtypes()[i]))); + padding_values.push_back(padding_value_t); + } + + *output = new PrependFromQueueAndPaddedBatchDataset( + ctx, batch_size, input, output_types_, padded_shapes, + std::move(padding_values)); + } + + private: + DataTypeVector output_types_; +}; + +REGISTER_KERNEL_BUILDER( + Name("PrependFromQueueAndPaddedBatchDataset").Device(DEVICE_CPU), + PrependFromQueueAndPaddedBatchDatasetOp); + +class EnqueueInQueueDatasetOp : public OpKernel { + public: + explicit EnqueueInQueueDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + void Compute(OpKernelContext* ctx) override { + using TensorQueueInserter = + PrependFromQueueAndPaddedBatchDataset::Iterator::TensorQueueInserter; + + // TODO(ebrevdo): accept list of sequence lengths to do proper + // sub-slicing of tensors for placement into the queue? + const Tensor& tensor_queue_t = ctx->input(0); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_queue_t.shape()), + errors::InvalidArgument("queue must be a vector, saw shape: ", + tensor_queue_t.shape().DebugString())); + std::vector inserters; + const int64 batch_size = tensor_queue_t.NumElements(); + inserters.reserve(batch_size); + const Variant* variants = tensor_queue_t.flat().data(); + for (int i = 0; i < batch_size; ++i) { + const auto* inserter = variants[i].get(); + OP_REQUIRES(ctx, inserter != nullptr, + errors::InvalidArgument( + "Could not access TensorQueueInserter from queue[", i, + "]. 
Received variant: ", variants[i].DebugString())); + inserters.push_back(inserter); + } + + OpInputList components; + OP_REQUIRES_OK(ctx, ctx->input_list("components", &components)); + for (int i = 0; i < components.size(); ++i) { + OP_REQUIRES( + ctx, + components[i].dims() > 0 && components[i].dim_size(0) == batch_size, + errors::InvalidArgument( + "Expected component ", i, " to have batched shape [", batch_size, + ",...], but saw shape: ", components[i].shape().DebugString())); + } + std::vector element_shapes; + for (int i = 0; i < components.size(); ++i) { + TensorShape element_shape = components[i].shape(); + element_shape.RemoveDim(0); + element_shapes.push_back(std::move(element_shape)); + } + for (int64 b = 0; b < batch_size; ++b) { + std::vector tensors; + tensors.reserve(components.size()); + for (int i = 0; i < components.size(); ++i) { + Tensor t(components[i].dtype(), element_shapes[i]); + OP_REQUIRES_OK(ctx, + batch_util::CopySliceToElement(components[i], &t, b)); + tensors.push_back(std::move(t)); + } + // TODO(ebrevdo): Acquire the lock once for all inserters with + // the same underlying queue? Add InsertLocked? + OP_REQUIRES_OK(ctx, inserters[b]->Insert(tensors)); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("EnqueueInQueueDataset").Device(DEVICE_CPU), + EnqueueInQueueDatasetOp); + +} // namespace + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc index 18adae1ea32..d5be4c77807 100644 --- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc +++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc @@ -117,7 +117,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel { out_tensors->reserve(dataset()->tensors_.size()); for (int i = 0; i < dataset()->tensors_.size(); ++i) { const Tensor& t = dataset()->tensors_[i]; - Tensor t_slice(cpu_allocator(), t.dtype(), + Tensor t_slice(ctx->allocator({}), t.dtype(), TensorShape(dataset()->shapes_[i].dim_sizes())); TF_RETURN_IF_ERROR(batch_util::CopySliceToElement(t, &t_slice, i_)); out_tensors->emplace_back(std::move(t_slice)); diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc index 965a60c7e05..1b94ea05440 100644 --- a/tensorflow/core/kernels/debug_ops.cc +++ b/tensorflow/core/kernels/debug_ops.cc @@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("CopyHost") .HostMemory("input") .HostMemory("output"), CopyOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug identity (non-ref and ref) ops. REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU), @@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(Name("DebugIdentity") .HostMemory("input") .HostMemory("output"), DebugIdentityOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug NaN-counter (non-ref and ref) ops. #define REGISTER_DEBUG_NAN_COUNT(type) \ @@ -98,7 +98,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double); DebugNanCountOp); REGISTER_GPU_DEBUG_NAN_COUNT(float); REGISTER_GPU_DEBUG_NAN_COUNT(double); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Register debug numeric summary ops. #define REGISTER_DEBUG_NUMERIC_SUMMARY_COUNT(type) \ diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 381add3fb3b..53a23b13060 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -21,7 +21,7 @@ limitations under the License. 
#endif #ifdef TENSORFLOW_USE_SYCL #include "tensorflow/core/common_runtime/sycl/sycl_util.h" -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #include "tensorflow/core/debug/debug_io_utils.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" @@ -91,7 +91,7 @@ class CopyOp : public OpKernel { Device* device = static_cast(context->device()); // Determine if the input tensor is not on CPU (e.g., on GPU). const bool off_host_input = device->device_type() == DEVICE_SYCL && - !context->input_alloc_attr(0).on_host(); + !context->input_alloc_attr(0).on_host(); if (off_host_input) { SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor); diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc index c4555db453b..0c42f632521 100644 --- a/tensorflow/core/kernels/decode_csv_op.cc +++ b/tensorflow/core/kernels/decode_csv_op.cc @@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel { } else { int32 value; OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid int32: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid int32: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel { } else { int64 value; OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid int64: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid int64: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel { } else { float value; OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid float: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid float: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel { } else { double value; OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value), - errors::InvalidArgument("Field ", f, " in record ", i, - " is not a valid double: ", - fields[f])); + errors::InvalidArgument( + "Field ", f, " in record ", i, + " is not a valid double: ", fields[f])); output[f]->flat()(i) = value; } break; @@ -208,9 +208,10 @@ class DecodeCSVOp : public OpKernel { if (!quoted) { while (static_cast(current_idx) < input.size() && input[current_idx] != delim_) { - OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') && - input[current_idx] != '\n' && - input[current_idx] != '\r', + OP_REQUIRES(ctx, + (!use_quote_delim_ || input[current_idx] != '"') && + input[current_idx] != '\n' && + input[current_idx] != '\r', errors::InvalidArgument( "Unquoted fields cannot have quotes/CRLFs inside")); field += input[current_idx]; @@ -238,10 +239,11 @@ class DecodeCSVOp : public OpKernel { } OP_REQUIRES( - ctx, (static_cast(current_idx) < input.size() && - input[current_idx] == '"' && - (static_cast(current_idx) == input.size() - 1 || - input[current_idx + 1] == delim_)), + ctx, + (static_cast(current_idx) < input.size() && + input[current_idx] == '"' && + (static_cast(current_idx) == input.size() - 1 || + input[current_idx + 1] == delim_)), errors::InvalidArgument("Quoted field has to end with quote " "followed by delim or end")); diff --git 
a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc index 44dcbf834ce..912d04c1536 100644 --- a/tensorflow/core/kernels/decode_image_op.cc +++ b/tensorflow/core/kernels/decode_image_op.cc @@ -87,10 +87,11 @@ class DecodeImageOp : public OpKernel { channels_ = 3; } else { OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_)); - OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 || - channels_ == 4, - errors::InvalidArgument( - "channels must be 0, 1, 3, or 4, got ", channels_)); + OP_REQUIRES( + context, + channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4, + errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ", + channels_)); } flags_.components = channels_; @@ -114,8 +115,9 @@ class DecodeImageOp : public OpKernel { if (format_ == kJpgFormat) { OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio)); - OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 || - flags_.ratio == 4 || flags_.ratio == 8, + OP_REQUIRES(context, + flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 || + flags_.ratio == 8, errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ", flags_.ratio)); OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling", @@ -130,8 +132,9 @@ class DecodeImageOp : public OpKernel { string dct_method; OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method)); OP_REQUIRES( - context, (dct_method.empty() || dct_method == "INTEGER_FAST" || - dct_method == "INTEGER_ACCURATE"), + context, + (dct_method.empty() || dct_method == "INTEGER_FAST" || + dct_method == "INTEGER_ACCURATE"), errors::InvalidArgument("dct_method must be one of " "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}")); if (dct_method == "INTEGER_FAST") { @@ -157,9 +160,9 @@ class DecodeImageOp : public OpKernel { errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ", FileFormatString(magic, input))); OP_REQUIRES(context, input.size() <= std::numeric_limits::max(), - errors::InvalidArgument(FileFormatString(magic, input), - " contents are too large for int: ", - input.size())); + errors::InvalidArgument( + FileFormatString(magic, input), + " contents are too large for int: ", input.size())); OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8, errors::InvalidArgument(FileFormatString(magic, input), " does not support uint16 output")); @@ -212,9 +215,10 @@ class DecodeImageOp : public OpKernel { input.data(), input.size(), flags, nullptr /* nwarn */, [=, &output](int width, int height, int channels) -> uint8* { Status status(context->allocate_output( - 0, format_ == kGifFormat - ? TensorShape({1, height, width, channels}) - : TensorShape({height, width, channels}), + 0, + format_ == kGifFormat + ? 
TensorShape({1, height, width, channels}) + : TensorShape({height, width, channels}), &output)); if (!status.ok()) { VLOG(1) << status; diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc index 8e9b8a7e2e7..829155fb313 100644 --- a/tensorflow/core/kernels/deep_conv2d.cc +++ b/tensorflow/core/kernels/deep_conv2d.cc @@ -120,9 +120,9 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows, VLOG(2) << "CanUseDeepConv2D" << " deep_conv_cost: " << deep_conv_cost - << " direct_conv_cost: " << direct_conv_cost - << " deep_direct_ratio: " << (static_cast(deep_conv_cost) / - static_cast(direct_conv_cost)) + << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: " + << (static_cast(deep_conv_cost) / + static_cast(direct_conv_cost)) << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost); return deep_conv_cost < direct_conv_cost; } diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc index 6d44a92fa3c..6497c8f3719 100644 --- a/tensorflow/core/kernels/dense_update_ops.cc +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -89,7 +89,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -113,14 +113,14 @@ TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(type) \ -REGISTER_KERNEL_BUILDER( \ - Name("Assign").Device(DEVICE_SYCL).TypeConstraint("T"), \ - AssignOpT); +#define REGISTER_SYCL_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Assign").Device(DEVICE_SYCL).TypeConstraint("T"), \ + AssignOpT); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -146,7 +146,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #endif // end GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNELS(type) \ +#define REGISTER_SYCL_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint("T"), \ DenseUpdateOp); \ @@ -156,5 +156,5 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index 9347978d515..91a9587174b 100644 --- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -400,7 +400,7 @@ struct LaunchDepthwiseConvBackpropInputOp { // Computes one shard of depthwise conv2d backprop input. auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop]( - int64 start, int64 limit) { + int64 start, int64 limit) { static const int64 kPacketSize = (sizeof(Packet) / sizeof(T)); const int64 input_image_size = @@ -750,7 +750,7 @@ struct LaunchDepthwiseConvBackpropFilterOp { // Computes one shard of depthwise conv2d backprop filter. 
auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data]( - int64 start, int64 limit) { + int64 start, int64 limit) { static const int64 kPacketSize = (sizeof(Packet) / sizeof(T)); const int64 filter_spatial_size = args.filter_rows * args.filter_cols; const int64 padded_out_depth_size = diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index a5fd07fbe17..c060b2e14d2 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp { // in_depth for input and filter must match. const int64 in_depth = GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is depth multiplier. const int32 depth_multiplier = filter.dim_size(3); @@ -430,9 +430,10 @@ TF_CALL_double(REGISTER_CPU_KERNEL); #endif #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER( - Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint("T"), - DepthwiseConv2dNativeOp); +REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative") + .Device(DEVICE_GPU) + .TypeConstraint("T"), + DepthwiseConv2dNativeOp); REGISTER_KERNEL_BUILDER( Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint("T"), diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index 5493e335328..126b64f73df 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -17,12 +17,12 @@ limitations under the License. #define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "external/cub_archive/cub/util_ptx.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/util/tensor_format.h" -#include "external/cub_archive/cub/util_ptx.cuh" #if !defined(_MSC_VER) #define UNROLL _Pragma("unroll") @@ -1021,7 +1021,7 @@ __global__ void __launch_bounds__(640, 2) // Device function to compute sub-warp sum reduction for a power-of-two group of // neighboring threads. -template +template __device__ __forceinline__ T WarpSumReduce(T val) { // support only power-of-two widths. assert(__popc(kWidth) == 1); diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc index 86fa7dce36a..d228153d4c7 100644 --- a/tensorflow/core/kernels/diag_op.cc +++ b/tensorflow/core/kernels/diag_op.cc @@ -29,8 +29,8 @@ limitations under the License. 
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -47,8 +47,9 @@ class DiagOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& diagonal = context->input(0); const int num_dims = diagonal.dims(); - OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument( - "Input must be at least rank 1, got 0")); + OP_REQUIRES( + context, 0 != num_dims, + errors::InvalidArgument("Input must be at least rank 1, got 0")); TensorShape out_shape; for (int i = 0; i < num_dims; ++i) { out_shape.AddDim(diagonal.dim_size(i)); @@ -60,10 +61,9 @@ class DiagOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output_tensor)); functor::DiagFunctor diagFunc; - Status s = diagFunc(context, - diagonal.NumElements(), - diagonal.flat().data(), - output_tensor->flat().data()); + Status s = + diagFunc(context, diagonal.NumElements(), diagonal.flat().data(), + output_tensor->flat().data()); OP_REQUIRES_OK(context, s); } }; @@ -82,12 +82,12 @@ class DiagPartOp : public OpKernel { errors::InvalidArgument("The rank of the tensor should be \ even and positive, got shape ", tensor.shape().DebugString())); - for (int i = 0; i < out_dims; i++){ - OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims), - errors::InvalidArgument( - "Invalid shape ", tensor.shape().DebugString(), - ": dimensions ", i, " and ", i + out_dims, " do not match.") - ); + for (int i = 0; i < out_dims; i++) { + OP_REQUIRES( + context, tensor.dim_size(i) == tensor.dim_size(i + out_dims), + errors::InvalidArgument("Invalid shape ", + tensor.shape().DebugString(), ": dimensions ", + i, " and ", i + out_dims, " do not match.")); } TensorShape out_shape; @@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel { } Tensor* output = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0, out_shape, &output)); + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); functor::DiagPartFunctor diagPartFunc; - Status s = diagPartFunc(context, - out_shape.num_elements(), - tensor.flat().data(), - output->flat().data()); + Status s = diagPartFunc(context, out_shape.num_elements(), + tensor.flat().data(), output->flat().data()); OP_REQUIRES_OK(context, s); } }; @@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel { namespace functor { template struct DiagFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // This subprocess is responsible for writing values in index range // [start*size, limit*size) auto subDiag = [in, out, size](int64 start, int64 limit) { @@ -143,17 +139,16 @@ struct DiagFunctor { // Here, 5 is a empirical factor of cost_per_unit. 
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, size, - 5 * size, subDiag); + Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size, + subDiag); return Status::OK(); } }; template struct DiagPartFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // This subprocess is responsible for extracting values in index range // [start, limit) auto subDiagPart = [in, out, size](int64 start, int64 limit) { @@ -164,14 +159,13 @@ struct DiagPartFunctor { // Here, 5 is a empirical factor of cost_per_unit. auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, size, - 5, subDiagPart); + Shard(worker_threads.num_threads, worker_threads.workers, size, 5, + subDiagPart); return Status::OK(); } }; } // namespace functor - // Register the CPU kernels. #define REGISTER_DIAGOP(T) \ REGISTER_KERNEL_BUILDER( \ @@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU); #endif // GOOGLE_CUDA - } // namespace tensorflow - diff --git a/tensorflow/core/kernels/diag_op.h b/tensorflow/core/kernels/diag_op.h index c6ca6a20474..baf16ddb4b9 100644 --- a/tensorflow/core/kernels/diag_op.h +++ b/tensorflow/core/kernels/diag_op.h @@ -26,14 +26,14 @@ namespace functor { template struct DiagFunctor { - Status operator() (OpKernelContext* context, const int64 size, - const T* in, T* out); + Status operator()(OpKernelContext* context, const int64 size, const T* in, + T* out); }; template struct DiagPartFunctor { - Status operator() (OpKernelContext* context, const int64 size, - const T* in, T* out); + Status operator()(OpKernelContext* context, const int64 size, const T* in, + T* out); }; } // namespace functor diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc index d3c529d784e..910f3093b23 100644 --- a/tensorflow/core/kernels/diag_op_gpu.cu.cc +++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc @@ -19,8 +19,8 @@ limitations under the License. #include #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" #include "tensorflow/core/kernels/diag_op.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" namespace tensorflow { namespace functor { @@ -28,10 +28,8 @@ namespace functor { typedef Eigen::GpuDevice GPUDevice; template -__global__ void DiagCudaKernel(const int num_threads, - const int64 size, - const T* in, - T* out) { +__global__ void DiagCudaKernel(const int num_threads, const int64 size, + const T* in, T* out) { CUDA_1D_KERNEL_LOOP(index, num_threads) { // Fill the diagonal elements or set to zero in other place. if (index % (1 + size) == 0) { @@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads, template struct DiagFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // Empty tensor couldn't launch the kernel. if (size == 0) { return Status::OK(); @@ -56,25 +53,22 @@ struct DiagFunctor { // so this may overflow for `size*size` in extreme cases, // here is checking the multiplication overflow for integer. 
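// One conventional overflow-safe form of the size * size guard used just
// below, shown standalone: for a non-negative size, the square fits in int
// exactly when size <= INT_MAX / size.
#include <limits>

inline bool SquareFitsInInt(long long size) {
  return size == 0 || size <= std::numeric_limits<int>::max() / size;
}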
if (size && (int(size * size) / size) != size) { - return errors::Internal( - "DiagOp got input size too large."); + return errors::Internal("DiagOp got input size too large."); } int virtual_thread_count = int(size * size); // Launch the GPU kernel. const GPUDevice& device = context->eigen_device(); - CudaLaunchConfig diag_config = GetCudaLaunchConfig( - virtual_thread_count, device); - DiagCudaKernel<<>>( - diag_config.virtual_thread_count, size, in, out); + CudaLaunchConfig diag_config = + GetCudaLaunchConfig(virtual_thread_count, device); + DiagCudaKernel<<>>(diag_config.virtual_thread_count, size, + in, out); auto err = cudaGetLastError(); if (err != cudaSuccess) { return errors::Internal( - "Could not launch DiagOp kernel: ", - cudaGetErrorString(err), "."); + "Could not launch DiagOp kernel: ", cudaGetErrorString(err), "."); } return Status::OK(); } @@ -87,12 +81,9 @@ template struct DiagFunctor; template struct DiagFunctor; template struct DiagFunctor; - template -__global__ void DiagPartCudaKernel(const int num_threads, - const int64 size, - const T* in, - T* out) { +__global__ void DiagPartCudaKernel(const int num_threads, const int64 size, + const T* in, T* out) { CUDA_1D_KERNEL_LOOP(index, num_threads) { out[index] = in[(1 + size) * index]; } @@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads, template struct DiagPartFunctor { - EIGEN_ALWAYS_INLINE Status - operator() (OpKernelContext* context, const int64 size, - const T* in, T* out) { + EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context, + const int64 size, const T* in, T* out) { // Empty tensor couldn't launch the kernel. if (size == 0) { return Status::OK(); @@ -111,16 +101,14 @@ struct DiagPartFunctor { // Extract the diagonal elements. CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device); - DiagPartCudaKernel<<>>( - diag_config.virtual_thread_count, size, in, out); + DiagPartCudaKernel<<>>(diag_config.virtual_thread_count, + size, in, out); auto err = cudaGetLastError(); if (err != cudaSuccess) { return errors::Internal( - "Could not launch DiagPartOp kernel: ", - cudaGetErrorString(err), "."); + "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), "."); } return Status::OK(); } diff --git a/tensorflow/core/kernels/diag_op_test.cc b/tensorflow/core/kernels/diag_op_test.cc index 2d1417854cc..a708e53dd01 100644 --- a/tensorflow/core/kernels/diag_op_test.cc +++ b/tensorflow/core/kernels/diag_op_test.cc @@ -30,8 +30,8 @@ static Graph* Diag(int n, DataType type) { return g; } -#define BM_DiagDev(N, T, TFTYPE, DEVICE) \ - static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \ +#define BM_DiagDev(N, T, TFTYPE, DEVICE) \ + static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \ testing::UseRealTime(); \ testing::ItemsProcessed(static_cast(iters) * N * N); \ test::Benchmark(#DEVICE, Diag(N, TFTYPE)).Run(iters); \ @@ -51,4 +51,3 @@ BM_Diag(128); BM_Diag(512); } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/dilation_ops.cc b/tensorflow/core/kernels/dilation_ops.cc index 6f5c0e91569..441a63465c8 100644 --- a/tensorflow/core/kernels/dilation_ops.cc +++ b/tensorflow/core/kernels/dilation_ops.cc @@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector& strides, filter.shape().DebugString())); const int filter_rows = filter.dim_size(0); const int filter_cols = filter.dim_size(1); - OP_REQUIRES( - context, depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - depth, " vs 
", filter.dim_size(2))); + OP_REQUIRES(context, depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", depth, " vs ", + filter.dim_size(2))); // Effective filter size, after introducing rate - 1 zeros between each // non-zero filter element. @@ -234,10 +234,11 @@ class DilationBackpropInputOp : public OpKernel { // [ batch, out_rows, out_cols, depth ] const int batch = input.dim_size(0); const int depth = input.dim_size(3); - OP_REQUIRES(context, batch == out_backprop.dim_size(0) && - out_rows == out_backprop.dim_size(1) && - out_cols == out_backprop.dim_size(2) && - depth == out_backprop.dim_size(3), + OP_REQUIRES(context, + batch == out_backprop.dim_size(0) && + out_rows == out_backprop.dim_size(1) && + out_cols == out_backprop.dim_size(2) && + depth == out_backprop.dim_size(3), errors::InvalidArgument("out_backprop has incompatible size.")); // The computed in_backprop has the same dimensions as the input: @@ -353,10 +354,11 @@ class DilationBackpropFilterOp : public OpKernel { // [ batch, out_rows, out_cols, depth ] const int batch = input.dim_size(0); const int depth = input.dim_size(3); - OP_REQUIRES(context, batch == out_backprop.dim_size(0) && - out_rows == out_backprop.dim_size(1) && - out_cols == out_backprop.dim_size(2) && - depth == out_backprop.dim_size(3), + OP_REQUIRES(context, + batch == out_backprop.dim_size(0) && + out_rows == out_backprop.dim_size(1) && + out_cols == out_backprop.dim_size(2) && + depth == out_backprop.dim_size(3), errors::InvalidArgument("out_backprop has incompatible size.")); // The computed filter_backprop has the same dimensions as the filter: diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc index ac0775fbefe..c63806a7f68 100644 --- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc @@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr, const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; @@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel( const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; @@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel( const int w_in = w_beg + w * rate_cols; if (w_in >= 0 && w_in < input_cols) { const T val = - input_ptr[d + - depth * - (w_in + input_cols * (h_in + input_rows * b))] + + input_ptr[d + depth * (w_in + + input_cols * (h_in + input_rows * b))] + filter_ptr[d + depth * (w + filter_cols * h)]; if (val > cur_val) { cur_val = val; diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc index a8818b7385d..b5d5b880bbb 100644 --- a/tensorflow/core/kernels/draw_bounding_box_op.cc +++ b/tensorflow/core/kernels/draw_bounding_box_op.cc @@ -29,8 +29,7 @@ template class DrawBoundingBoxesOp : public OpKernel { public: explicit DrawBoundingBoxesOp(OpKernelConstruction* context) - : OpKernel(context) { - } + : 
OpKernel(context) {} void Compute(OpKernelContext* context) override { const Tensor& images = context->input(0); @@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel { int64 color_index = bb % color_table_length; const int64 min_box_row = static_cast(tboxes(b, bb, 0)) * (height - 1); - const int64 min_box_row_clamp = - std::max(min_box_row, 0); + const int64 min_box_row_clamp = std::max(min_box_row, 0); const int64 max_box_row = static_cast(tboxes(b, bb, 2)) * (height - 1); const int64 max_box_row_clamp = std::min(max_box_row, height - 1); const int64 min_box_col = static_cast(tboxes(b, bb, 1)) * (width - 1); - const int64 min_box_col_clamp = - std::max(min_box_col, 0); + const int64 min_box_col_clamp = std::max(min_box_col, 0); const int64 max_box_col = static_cast(tboxes(b, bb, 3)) * (width - 1); - const int64 max_box_col_clamp = - std::min(max_box_col, width - 1); + const int64 max_box_col_clamp = std::min(max_box_col, width - 1); if (min_box_row > max_box_row || min_box_col > max_box_col) { - LOG(WARNING) << "Bounding box (" << min_box_row - << "," << min_box_col - << "," << max_box_row - << "," << max_box_col + LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col + << "," << max_box_row << "," << max_box_col << ") is inverted and will not be drawn."; continue; } - if (min_box_row >= height || max_box_row < 0 || - min_box_col >= width || max_box_col < 0) { - LOG(WARNING) << "Bounding box (" << min_box_row - << "," << min_box_col - << "," << max_box_row - << "," << max_box_col + if (min_box_row >= height || max_box_row < 0 || min_box_col >= width || + max_box_col < 0) { + LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col + << "," << max_box_row << "," << max_box_col << ") is completely outside the image" << " and will not be drawn."; continue; diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc index 861e16b2fd0..3c988db5e61 100644 --- a/tensorflow/core/kernels/dynamic_partition_op.cc +++ b/tensorflow/core/kernels/dynamic_partition_op.cc @@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared { // Walk through data and copy the data to the appropriate output tensor const auto data_flat = data->flat(); std::vector, - Eigen::Aligned> > out_vec; + Eigen::Aligned> > + out_vec; out_vec.reserve(num_partitions_); for (int p = 0; p < num_partitions_; p++) { out_vec.push_back(outputs[p]->vec()); @@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared { } else { // If data has extra dimensions, use Eigen slices std::vector, - Eigen::Aligned> > out_flat; + Eigen::Aligned> > + out_flat; out_flat.reserve(num_partitions_); for (int p = 0; p < num_partitions_; p++) { out_flat.push_back(outputs[p]->flat_outer_dims()); diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc index 9bb58b13f38..9dfeccff0e8 100644 --- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc +++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc @@ -79,9 +79,9 @@ template void RangeInit(const GPUDevice& d, const T start, const T delta, const int32 size, typename TTypes::Flat out) { CudaLaunchConfig config = GetCudaLaunchConfig(size, d); - RangeInitKernel< - T><<>>( - start, delta, size, out.data()); + RangeInitKernel + <<>>( + start, delta, size, out.data()); } // Given *num_runs pairs (key, value), this function moves the value @@ -103,11 +103,10 @@ void CallGatherKernel(const GPUDevice& d, const 
T* params, const int32* indices, T* out, int64 gather_dim_size, int64 indices_size, int64 slice_size, int64 out_size) { CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d); - GatherOpKernel< - T, int32, - true><<>>( - params, indices, out, gather_dim_size, indices_size, slice_size, - out_size); + GatherOpKernel + <<>>( + params, indices, out, gather_dim_size, indices_size, slice_size, + out_size); } struct IdentityOp { @@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { OP_REQUIRES_ASYNC( c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()), - errors::InvalidArgument("data.shape must start with partitions.shape, ", - "got data.shape = ", data.shape().DebugString(), - ", partitions.shape = ", - partitions.shape().DebugString()), + errors::InvalidArgument( + "data.shape must start with partitions.shape, ", + "got data.shape = ", data.shape().DebugString(), + ", partitions.shape = ", partitions.shape().DebugString()), done); Tensor partition_count; @@ -245,8 +244,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { AllocatorAttributes alloc_attr; alloc_attr.set_on_host(true); OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &partition_count, alloc_attr), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &partition_count, alloc_attr), done); auto e_part_count = partition_count.flat(); for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0; @@ -259,8 +259,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { // Prepare for counting. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &partition_count), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &partition_count), done); Tensor indices_out; // Count how many times each partition index occurs. @@ -280,8 +281,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { alloc_attr.set_on_host(true); alloc_attr.set_gpu_compatible(true); OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(partition_count.dtype(), partition_count.shape(), - &cpu_tensor, alloc_attr), + c, + c->allocate_temp(partition_count.dtype(), partition_count.shape(), + &cpu_tensor, alloc_attr), done); perftools::gputools::DeviceMemoryBase wrapped( partition_count.flat().data(), num_partitions_ * sizeof(int32)); @@ -340,9 +342,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream); // Allocate temporary storage. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp( - DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), - &cub_temp_storage), + c, + c->allocate_temp(DT_INT8, + TensorShape({static_cast(temp_storage_bytes)}), + &cub_temp_storage), done); // Radix-sort the partition information. cub::DeviceRadixSort::SortPairs( @@ -376,8 +379,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { zero_functor(device, partition_count->flat()); // Allocate memory for aggregates_out. OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), - &aggregates_out), + c, + c->allocate_temp(DT_INT32, TensorShape({num_partitions_}), + &aggregates_out), done); // Obtain the pointers to inner buffers. int32* keys_in_ptr = partitions_out.flat().data(); @@ -408,9 +412,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel { num_runs_ptr, reduction_op, N, cu_stream); // Allocate temporary storage. 
OP_REQUIRES_OK_ASYNC( - c, c->allocate_temp( - DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), - &cub_temp_storage), + c, + c->allocate_temp(DT_INT8, + TensorShape({static_cast(temp_storage_bytes)}), + &cub_temp_storage), done); // Run reduce-by-key. The effect is that we count how many times // each index appears in partitions. The distinct indices are stored diff --git a/tensorflow/core/kernels/eigen_activations.h b/tensorflow/core/kernels/eigen_activations.h index 99b4b2abe66..302033e47c5 100644 --- a/tensorflow/core/kernels/eigen_activations.h +++ b/tensorflow/core/kernels/eigen_activations.h @@ -21,13 +21,13 @@ limitations under the License. namespace Eigen { /** scalar_sigmoid_fast_derivative_op - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to compute the fast derivative of a sigmoid - * - * Input should be the backpropagated gradient. - * - * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a sigmoid + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative() + */ template struct scalar_sigmoid_fast_derivative_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op) @@ -55,13 +55,13 @@ struct functor_traits > { } // namespace internal /** scalar_tanh_fast_derivative_op - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to compute the fast derivative of a tanh - * - * Input should be the backpropagated gradient. - * - * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to compute the fast derivative of a tanh + * + * Input should be the backpropagated gradient. + * + * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative() + */ template struct scalar_tanh_fast_derivative_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op) @@ -89,11 +89,11 @@ struct functor_traits > { } // namespace internal /** - * \ingroup CXX11_NeuralNetworks_Module - * \brief Template functor to clip the magnitude of the first scalar. - * - * \sa class CwiseBinaryOp, MatrixBase::Clip - */ + * \ingroup CXX11_NeuralNetworks_Module + * \brief Template functor to clip the magnitude of the first scalar. + * + * \sa class CwiseBinaryOp, MatrixBase::Clip + */ template struct scalar_clip_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op) diff --git a/tensorflow/core/kernels/eigen_activations_test.cc b/tensorflow/core/kernels/eigen_activations_test.cc index 907233103d8..34952f5abb8 100644 --- a/tensorflow/core/kernels/eigen_activations_test.cc +++ b/tensorflow/core/kernels/eigen_activations_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) { const ptrdiff_t depth = 3; diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h index 3a94b8c9933..4d86f9deb99 100644 --- a/tensorflow/core/kernels/eigen_attention.h +++ b/tensorflow/core/kernels/eigen_attention.h @@ -21,35 +21,47 @@ limitations under the License. namespace Eigen { /** ExtractGlimpses - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Extract glimpses from an input tensor. - * - * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch). 
- * The width and height parameters specify the extension of the returned glimpses. - * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension. - * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension. - * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center. - * - * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch). - * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size. - */ + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Extract glimpses from an input tensor. + * + * The input parameter is expected to be a col-major tensor with a rank of 4 + * (depth, x, y, and batch). The width and height parameters specify the + * extension of the returned glimpses. The offsets parameter specifies the x, y + * locations of the center of the glimpses relative to the center of the input + * image. The vector is expected to contain one IndexPair for each image in the + * batch dimension. The normalized boolean indicates if incoming coordinates are + * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each + * height and width dimension. The centered boolean indicates if incoming + * coordinates are centered relative to the image, in which case -1.0 and 1.0 + * correspond to minimum and maximum of each dimension while 0.0 corresponds to + * the center. + * + * The result can be assigned to a tensor of rank equal to that of the input. + * The result will be laid out in col-major order (depth, x, y, batch). The + * dimensions of the result will be equal to the dimensions of the input except + * for width and height which will be equal to the requested glimpse size. 
+ */ namespace { template struct GlimpseExtractionOp { GlimpseExtractionOp(const Index width, const Index height, const std::vector >& offsets, - const bool normalized, - const bool centered, - const bool uniform_noise) : - width_(width), height_(height), offsets_(offsets), - normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { } + const bool normalized, const bool centered, + const bool uniform_noise) + : width_(width), + height_(height), + offsets_(offsets), + normalized_(normalized), + centered_(centered), + uniform_noise_(uniform_noise) {} template DSizes dimensions(const Input& input) const { typedef typename internal::traits::Index IndexType; typedef TensorRef::Scalar, 4, - internal::traits::Layout, IndexType> > Ref; + internal::traits::Layout, IndexType> > + Ref; Ref in(input); DSizes dims = in.dimensions(); @@ -62,12 +74,12 @@ struct GlimpseExtractionOp { } template - EIGEN_DEVICE_FUNC - void eval(const Input& input, Output& output, const Device& device) const - { + EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output, + const Device& device) const { typedef typename internal::traits::Index IndexType; typedef TensorRef::Scalar, 4, - internal::traits::Layout, IndexType> > Ref; + internal::traits::Layout, IndexType> > + Ref; Ref in(input); const Index num_channels = in.dimension(0); const Index input_width = in.dimension(1); @@ -97,8 +109,8 @@ struct GlimpseExtractionOp { x -= width_ / 2.0f; y -= height_ / 2.0f; - const Index offset_x = (Index) x; - const Index offset_y = (Index) y; + const Index offset_x = (Index)x; + const Index offset_y = (Index)y; Index glimpse_width = width_; Index glimpse_height = height_; bool partial_overlap = false; @@ -135,7 +147,7 @@ struct GlimpseExtractionOp { if (uniform_noise_) { // Initialize the glimpse with uniform noise. 
typedef typename internal::remove_const< - typename internal::traits::Scalar>::type Scalar; + typename internal::traits::Scalar>::type Scalar; TensorFixedSize > mini; mini.device(device) = input.template chip<3>(i).minimum(); TensorFixedSize > range; @@ -215,21 +227,22 @@ struct GlimpseExtractionOp { const bool centered_; const bool uniform_noise_; }; -} - +} // namespace template -EIGEN_ALWAYS_INLINE -static const TensorCustomUnaryOp::Index>, const Input> +EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp< + const GlimpseExtractionOp::Index>, + const Input> ExtractGlimpses(const Input& input, const typename internal::traits::Index width, const typename internal::traits::Index height, const std::vector >& offsets, const bool normalized = true, const bool centered = true, - const bool uniform_noise = true) -{ - EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + const bool uniform_noise = true) { + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, + YOU_MADE_A_PROGRAMMING_MISTAKE); typedef typename internal::traits::Index Index; const GlimpseExtractionOp op(width, height, offsets, normalized, @@ -237,6 +250,6 @@ ExtractGlimpses(const Input& input, return input.customOp(op); } -} // end namespace Eigen +} // end namespace Eigen #endif // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_ diff --git a/tensorflow/core/kernels/eigen_attention_test.cc b/tensorflow/core/kernels/eigen_attention_test.cc index 3a2eeb05959..08f61877182 100644 --- a/tensorflow/core/kernels/eigen_attention_test.cc +++ b/tensorflow/core/kernels/eigen_attention_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenAttentionTest, Simple) { const ptrdiff_t depth = 3; diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h index aec76978102..099696105b6 100644 --- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h @@ -21,29 +21,29 @@ limitations under the License. namespace Eigen { /** SpatialConvolutionBackwardInput - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Computes the backprop for the input of a 2D convolution. - * - * The output_backward parameter is expected to be a tensor with a rank of 3 or + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the input of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or * more (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The output_backward and the kernel must both be in col-major layout. The + * The output_backward and the kernel must both be in col-major layout. The * result will also be in col-major layout. - * - * If row_in_stride, col_in_stride > 1, then applies convolution with holes + * + * If row_in_stride, col_in_stride > 1, then applies convolution with holes * (aka atrous convolution), sampling every row_in_stride, col_in_stride input * pixels. 
- * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * output_backward. The dimensions of the result will be filters, height, width * (and others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. - * - */ + * + */ #ifdef EIGEN_HAS_INDEX_LIST typedef IndexList, type2index<0>, type2index<1>, type2index<1> > ReverseColMajor; @@ -293,29 +293,29 @@ SpatialConvolutionBackwardInput( } /** SpatialConvolutionBackwardKernel - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Computes the backprop for the filter of a 2D convolution. - * - * The output_backward parameter is expected to be a tensor with a rank of 3 or + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Computes the backprop for the filter of a 2D convolution. + * + * The output_backward parameter is expected to be a tensor with a rank of 3 or * more (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The output_backward and the kernel must both be in col-major layout. The + * The output_backward and the kernel must both be in col-major layout. The * result will also be in col-major layout. - * - * If row_in_stride, col_stride > 1, then applies convolution with holes (aka + * + * If row_in_stride, col_stride > 1, then applies convolution with holes (aka * atrous convolution), sampling every row_in_stride, col_in_stride input * pixels. - * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * output_backward. The dimensions of the result will be filters, height, width * (and others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. 
- * - */ + * + */ template EIGEN_ALWAYS_INLINE static const typename internal::conditional< diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc index 1758067829e..2229ec96594 100644 --- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc +++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc @@ -25,7 +25,7 @@ void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } static int ceil_div(int a, int b) { return (a + b - 1) / b; } -} +} // namespace TEST(EigenBackwardSpatialConvolutionsTest, test_simple_spatial_convolution_backward_input_valid) { diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h index 972036833ff..896c9957616 100644 --- a/tensorflow/core/kernels/eigen_pooling.h +++ b/tensorflow/core/kernels/eigen_pooling.h @@ -309,10 +309,10 @@ struct AvgPoolMeanReducer { _mm512_castsi512_ps( \ _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1)) -// The ternarylogic function immediate determines the values in the result -// In the case below, 0xd8 implies (false_mask) ? (b) : (a) -// For details, refer to the vpternlogd instruction table at -// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf + // The ternarylogic function immediate determines the values in the result + // In the case below, 0xd8 implies (false_mask) ? (b) : (a) + // For details, refer to the vpternlogd instruction table at + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf #define psel(a, b, false_mask) \ _mm512_castsi512_ps(_mm512_ternarylogic_epi32( \ diff --git a/tensorflow/core/kernels/eigen_pooling_test.cc b/tensorflow/core/kernels/eigen_pooling_test.cc index 9383972b9ff..47b6665e680 100644 --- a/tensorflow/core/kernels/eigen_pooling_test.cc +++ b/tensorflow/core/kernels/eigen_pooling_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenPoolingTest, Simple) { const int depth = 10; diff --git a/tensorflow/core/kernels/eigen_softmax.h b/tensorflow/core/kernels/eigen_softmax.h index a2930a726f9..12148c54b36 100644 --- a/tensorflow/core/kernels/eigen_softmax.h +++ b/tensorflow/core/kernels/eigen_softmax.h @@ -21,19 +21,21 @@ limitations under the License. namespace Eigen { /** SoftMax - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Applies a softmax - * - * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other). - * - * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order. - * -*/ + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a softmax + * + * The input parameter is expected to be a col-major tensor with a rank of 2 + * (depth and other). + * + * The result can be assigned to a tensor of rank and dimensions equal to that + * of the input. The result will be laid out in col-major order. 
+ * + */ namespace { struct SoftmaxOp { - SoftmaxOp(const float beta) : beta_(beta) { } + SoftmaxOp(const float beta) : beta_(beta) {} template typename Input::Dimensions dimensions(const Input& input) const { @@ -41,8 +43,7 @@ struct SoftmaxOp { } template - void eval(const Input& input, Output& output, const Device& device) const - { + void eval(const Input& input, Output& output, const Device& device) const { #if !defined(EIGEN_HAS_INDEX_LIST) // nvcc doesn't support cxx11 Eigen::array::Index, 1> depth_dim; @@ -56,35 +57,43 @@ struct SoftmaxOp { #else // Take advantage of cxx11 to give the compiler information it can use to // optimize the code. - Eigen::IndexList> depth_dim; - Eigen::IndexList> bcast; + Eigen::IndexList > depth_dim; + Eigen::IndexList > bcast; bcast.set(0, dimensions(input)[0]); - Eigen::IndexList, typename internal::traits::Index> dims2d; + Eigen::IndexList, + typename internal::traits::Index> + dims2d; dims2d.set(1, dimensions(input)[1]); #endif - output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp(); - output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + output.device(device) = + ((input - + input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * + beta_) + .exp(); + output.device(device) = + output / + (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); } private: const float beta_; }; -} - +} // namespace template -EIGEN_ALWAYS_INLINE -static const TensorCustomUnaryOp -SoftMax(const Input& input, const float beta) -{ - EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE); +EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp +SoftMax(const Input& input, const float beta) { + EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, + YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 2, + YOU_MADE_A_PROGRAMMING_MISTAKE); const SoftmaxOp op(beta); return input.customOp(op); } -} // end namespace Eigen +} // end namespace Eigen #endif // TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_ diff --git a/tensorflow/core/kernels/eigen_softmax_test.cc b/tensorflow/core/kernels/eigen_softmax_test.cc index ba681d68ab0..7f985d71366 100644 --- a/tensorflow/core/kernels/eigen_softmax_test.cc +++ b/tensorflow/core/kernels/eigen_softmax_test.cc @@ -23,7 +23,7 @@ namespace { void EigenApprox(float a, float b) { ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3); } -} +} // namespace TEST(EigenSoftmaxTest, Simple) { const int depth = 1024; diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h index 2fe64cd72ac..1acbe3a6580 100644 --- a/tensorflow/core/kernels/eigen_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h @@ -877,29 +877,29 @@ struct gemm_pack_rhs< } // end namespace internal /** SpatialConvolution - * \ingroup CXX11_NeuralNetworks_Module - * - * \brief Applies a 2D convolution over a multichannel input image. - * - * The input parameter is expected to be a tensor with a rank of 3 or more + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 2D convolution over a multichannel input image. 
+ * + * The input parameter is expected to be a tensor with a rank of 3 or more * (channels, height, width, and optionally others) - * The kernel parameter is expected to be a 4D tensor (filters, channels, + * The kernel parameter is expected to be a 4D tensor (filters, channels, * kernel_height, kernel_width) - * The input and the kernel must both be in col-major layout. The result will + * The input and the kernel must both be in col-major layout. The result will * also be in col-major layout. - * - * If col_in_stride, row_in_stride > 1, then applies convolution with holes + * + * If col_in_stride, row_in_stride > 1, then applies convolution with holes * (aka atrous convolution), sampling every col_in_stride, row_in_stride input * pixels. - * - * The result can be assigned to a tensor of rank equal to the rank of the + * + * The result can be assigned to a tensor of rank equal to the rank of the * input. The dimensions of the result will be filters, height, width (and * others if applicable). - * - * It is possible to swap the order of the width and height dimensions provided + * + * It is possible to swap the order of the width and height dimensions provided * that the same order is used in the input, the kernel, and the output. - * - */ + * + */ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< @@ -993,7 +993,7 @@ EIGEN_DEVICE_FUNC default: // Initialize unused variables to avoid a compiler warning out_height = 0; - out_width = 0; + out_width = 0; eigen_assert(false && "unexpected padding"); } diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc index 4fcae25aa6e..1a5b0f2b675 100644 --- a/tensorflow/core/kernels/encode_jpeg_op.cc +++ b/tensorflow/core/kernels/encode_jpeg_op.cc @@ -80,10 +80,11 @@ class EncodeJpegOp : public OpKernel { errors::InvalidArgument("image must be 3-dimensional", image.shape().DebugString())); - OP_REQUIRES(context, FastBoundsCheck(image.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument( - "Cannot encode images with >= max int32 elements")); + OP_REQUIRES( + context, + FastBoundsCheck(image.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument( + "Cannot encode images with >= max int32 elements")); const int32 dim_size0 = static_cast(image.dim_size(0)); const int32 dim_size1 = static_cast(image.dim_size(1)); @@ -100,9 +101,10 @@ class EncodeJpegOp : public OpKernel { } else if (channels == 3) { adjusted_flags.format = jpeg::FORMAT_RGB; } else { - OP_REQUIRES(context, false, errors::InvalidArgument( - "image must have 1 or 3 channels, got ", - image.shape().DebugString())); + OP_REQUIRES( + context, false, + errors::InvalidArgument("image must have 1 or 3 channels, got ", + image.shape().DebugString())); } } else { if (flags_.format == jpeg::FORMAT_GRAYSCALE) { diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc index 268a059275a..83cd0e9b47e 100644 --- a/tensorflow/core/kernels/example_parsing_ops.cc +++ b/tensorflow/core/kernels/example_parsing_ops.cc @@ -346,8 +346,9 @@ class SingleSequenceExampleParserOp : public OpKernel { feature_list_sparse_keys[di].scalar()(); } OP_REQUIRES( - ctx, TensorShapeUtils::IsVector( - feature_list_dense_missing_assumed_empty->shape()), + ctx, + TensorShapeUtils::IsVector( + feature_list_dense_missing_assumed_empty->shape()), errors::InvalidArgument( "Expected feature_list_dense_missing_assumed_empty ", "to be a vector, got shape: ", @@ -386,12 
+387,12 @@ class SingleSequenceExampleParserOp : public OpKernel { required[d] = (def_value.NumElements() == 0); // No default provided. if (def_value.NumElements() > 0) { - OP_REQUIRES( - ctx, def_value.shape() == attrs_.context_dense_shapes[d], - errors::InvalidArgument( - "def_value[", d, "].shape() == ", - def_value.shape().DebugString(), " != context_dense_shapes_[", - d, "] == ", attrs_.context_dense_shapes[d].DebugString())); + OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d], + errors::InvalidArgument( + "def_value[", d, + "].shape() == ", def_value.shape().DebugString(), + " != context_dense_shapes_[", d, + "] == ", attrs_.context_dense_shapes[d].DebugString())); OP_REQUIRES( ctx, def_value.dtype() == attrs_.context_dense_types[d], errors::InvalidArgument( @@ -576,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel { const Feature& f = fl.feature(t); bool types_match; OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); - OP_REQUIRES( - ctx, types_match, - errors::InvalidArgument( - "Name: ", name, ", Feature list: ", key, ", Index: ", t, - ". Data types don't match. ", "Expected type: ", - DataTypeString(dtype), " Feature is: ", ProtoDebugString(f))); + OP_REQUIRES(ctx, types_match, + errors::InvalidArgument( + "Name: ", name, ", Feature list: ", key, ", Index: ", t, + ". Data types don't match. ", + "Expected type: ", DataTypeString(dtype), + " Feature is: ", ProtoDebugString(f))); OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f, feature_list_dense_values[d])); } diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc index 4fbf76d2d0d..4a1aa433bc9 100644 --- a/tensorflow/core/kernels/fact_op.cc +++ b/tensorflow/core/kernels/fact_op.cc @@ -122,13 +122,9 @@ static string D(const char* s) { return ret; } -REGISTER_KERNEL_BUILDER(Name("Fact") - .Device(DEVICE_CPU) - .Label(D("Yoxmos").c_str()), - FactOpKernel2); -REGISTER_KERNEL_BUILDER(Name("Fact") - .Device(DEVICE_CPU) - .Label(D("yoxmos").c_str()), - FactOpKernel2); +REGISTER_KERNEL_BUILDER( + Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2); +REGISTER_KERNEL_BUILDER( + Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2); } // namespace tensorflow diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc index 5953db14768..af3a42135d1 100644 --- a/tensorflow/core/kernels/fake_quant_ops_test.cc +++ b/tensorflow/core/kernels/fake_quant_ops_test.cc @@ -378,9 +378,8 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) { Tensor* output = GetOutput(0); auto input_flat = GetInput(0).flat(); Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); - FillValues(&expected, - {0.0f, input_flat(1), input_flat(2), - input_flat(3), input_flat(4), 0.0f}); + FillValues(&expected, {0.0f, input_flat(1), input_flat(2), + input_flat(3), input_flat(4), 0.0f}); ExpectClose(expected, *output); } @@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest, Tensor* output_bprop_wrt_input = GetOutput(0); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); auto grad_flat = GetInput(0).flat(); - FillValues(&expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), - grad_flat(3), grad_flat(4), 0.0f}); + FillValues( + &expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); Tensor 
expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_min, - {grad_flat(0), 0.0f, 0.0f}); + FillValues(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f}); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); Tensor* output_bprop_wrt_max = GetOutput(2); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_max, - {0.0f, 0.0f, grad_flat(5)}); + FillValues(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)}); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); } @@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) { Tensor* output_bprop_wrt_input = GetOutput(0); Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3})); auto grad_flat = GetInput(0).flat(); - FillValues(&expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), - grad_flat(3), grad_flat(4), 0.0f}); + FillValues( + &expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_min, - {grad_flat(0), 0.0f, 0.0f}); + FillValues(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f}); ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min); Tensor* output_bprop_wrt_max = GetOutput(2); Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3})); - FillValues(&expected_bprop_wrt_max, - {0.0f, 0.0f, grad_flat(5)}); + FillValues(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)}); ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max); } @@ -2270,14 +2265,13 @@ TEST_F(QuantOpsTest, Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4})); auto grad_flat = GetInput(0).flat(); - FillValues( - &expected_bprop_wrt_input, - {0.0f, grad_flat(1), grad_flat(2), 0.0f, - 0.0f, grad_flat(5), grad_flat(6), 0.0f, - 0.0f, grad_flat(9), grad_flat(10), 0.0f, - 0.0f, grad_flat(13), grad_flat(14), 0.0f, - 0.0f, grad_flat(17), grad_flat(18), 0.0f, - 0.0f, grad_flat(21), grad_flat(22), 0.0f}); + FillValues(&expected_bprop_wrt_input, + {0.0f, grad_flat(1), grad_flat(2), 0.0f, + 0.0f, grad_flat(5), grad_flat(6), 0.0f, + 0.0f, grad_flat(9), grad_flat(10), 0.0f, + 0.0f, grad_flat(13), grad_flat(14), 0.0f, + 0.0f, grad_flat(17), grad_flat(18), 0.0f, + 0.0f, grad_flat(21), grad_flat(22), 0.0f}); ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input); Tensor* output_bprop_wrt_min = GetOutput(1); diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc index 82ec8791198..479f7be4b50 100644 --- a/tensorflow/core/kernels/fifo_queue.cc +++ b/tensorflow/core/kernels/fifo_queue.cc @@ -255,97 +255,96 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx, // TODO(josh11b): This makes two copies of callback, avoid this if possible. dequeue_attempts_.emplace_back( num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token, - [callback, allow_small_batch, this](Attempt* attempt) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - int64 queue_size = queues_[0].size(); + [callback, allow_small_batch, + this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int64 queue_size = queues_[0].size(); - if (closed_ && queue_size < attempt->elements_requested) { - // If we don't have enough for a full dequeue, we have - // to reset the attempt tuple. 
- if (!attempt->tuple.empty()) { - // Restore already-dequeued elements to the front of the - // queue. - for (int64 i = attempt->tuple[0].dim_size(0) - - attempt->elements_requested - 1; - i >= 0; --i) { - for (int j = 0; j < num_components(); ++j) { - PersistentTensor element; - Status s = GetElementComponentFromBatch( - attempt->tuple, i, j, attempt->context, &element); - if (!s.ok()) { - attempt->context->SetStatus( - errors::DataLoss("Failed to restore element from " - "partially-dequeued batch " - "to FIFOQueue: ", - s.error_message())); - } - queues_[j].push_front(element); - } - } - } - if (allow_small_batch && !queues_[0].empty()) { - // Request all remaining elements in the queue. - queue_size = queues_[0].size(); - attempt->tuple.clear(); - attempt->elements_requested = queue_size; - } else { - if (allow_small_batch) { - // There may be some other attempts containing - // values. If so, we'll yield and wait for them - // to add elements to the queue. - if (!enqueue_attempts_.empty()) return kProgress; - } - if (attempt->context->status().ok()) { - attempt->context->SetStatus(errors::OutOfRange( - "FIFOQueue '", name_, "' is closed and has ", - "insufficient elements (requested ", - attempt->elements_requested, ", current size ", - queue_size, ")")); - } - return kComplete; - } - } - - RunResult result = kNoProgress; - for (; queue_size > 0; --queue_size) { - if (attempt->tuple.empty()) { - // Only allocate tuple when we have something to dequeue - // so we don't use excessive memory when there are many - // blocked dequeue attempts waiting. - attempt->tuple.reserve(num_components()); - for (int i = 0; i < num_components(); ++i) { - const TensorShape shape = - ManyOutShape(i, attempt->elements_requested); - Tensor element; + if (closed_ && queue_size < attempt->elements_requested) { + // If we don't have enough for a full dequeue, we have + // to reset the attempt tuple. + if (!attempt->tuple.empty()) { + // Restore already-dequeued elements to the front of the + // queue. + for (int64 i = attempt->tuple[0].dim_size(0) - + attempt->elements_requested - 1; + i >= 0; --i) { + for (int j = 0; j < num_components(); ++j) { + PersistentTensor element; + Status s = GetElementComponentFromBatch( + attempt->tuple, i, j, attempt->context, &element); + if (!s.ok()) { attempt->context->SetStatus( - attempt->context->allocate_temp(component_dtypes_[i], - shape, &element)); - if (!attempt->context->status().ok()) return kComplete; - attempt->tuple.emplace_back(element); + errors::DataLoss("Failed to restore element from " + "partially-dequeued batch " + "to FIFOQueue: ", + s.error_message())); } - } - result = kProgress; - Tuple tuple; - DequeueLocked(attempt->context, &tuple); - const int64 index = attempt->tuple[0].dim_size(0) - - attempt->elements_requested; - for (int i = 0; i < num_components(); ++i) { - attempt->context->SetStatus(batch_util::CopyElementToSlice( - std::move(tuple[i]), &attempt->tuple[i], index)); - if (!attempt->context->status().ok()) return kComplete; - } - tuple.clear(); - --attempt->elements_requested; - if (attempt->elements_requested == 0) { - tuple = attempt->tuple; - attempt->done_callback = [callback, tuple]() { - callback(tuple); - }; - return kComplete; + queues_[j].push_front(element); } } - return result; - }); + } + if (allow_small_batch && !queues_[0].empty()) { + // Request all remaining elements in the queue. 
+ queue_size = queues_[0].size(); + attempt->tuple.clear(); + attempt->elements_requested = queue_size; + } else { + if (allow_small_batch) { + // There may be some other attempts containing + // values. If so, we'll yield and wait for them + // to add elements to the queue. + if (!enqueue_attempts_.empty()) return kProgress; + } + if (attempt->context->status().ok()) { + attempt->context->SetStatus(errors::OutOfRange( + "FIFOQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", + attempt->elements_requested, ", current size ", + queue_size, ")")); + } + return kComplete; + } + } + + RunResult result = kNoProgress; + for (; queue_size > 0; --queue_size) { + if (attempt->tuple.empty()) { + // Only allocate tuple when we have something to dequeue + // so we don't use excessive memory when there are many + // blocked dequeue attempts waiting. + attempt->tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + const TensorShape shape = + ManyOutShape(i, attempt->elements_requested); + Tensor element; + attempt->context->SetStatus(attempt->context->allocate_temp( + component_dtypes_[i], shape, &element)); + if (!attempt->context->status().ok()) return kComplete; + attempt->tuple.emplace_back(element); + } + } + result = kProgress; + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + const int64 index = + attempt->tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + attempt->context->SetStatus(batch_util::CopyElementToSlice( + std::move(tuple[i]), &attempt->tuple[i], index)); + if (!attempt->context->status().ok()) return kComplete; + } + tuple.clear(); + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + tuple = attempt->tuple; + attempt->done_callback = [callback, tuple]() { + callback(tuple); + }; + return kComplete; + } + } + return result; + }); } } if (!already_cancelled) { diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc index bde39770dee..7090417dfdb 100644 --- a/tensorflow/core/kernels/fill_functor.cc +++ b/tensorflow/core/kernels/fill_functor.cc @@ -18,8 +18,8 @@ limitations under the License. 
#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/variant_encode_decode.h" @@ -60,7 +60,7 @@ DEFINE_SETZERO_CPU(Variant); template void SetZeroFunctor::operator()( const Eigen::SyclDevice& d, typename TTypes::Flat out) { - To32Bit(out).device(d) = To32Bit(out).constant(T(0)); + To32Bit(out).device(d) = To32Bit(out).constant(T(0)); } #define DEFINE_SETZERO_SYCL(T) \ @@ -118,7 +118,8 @@ DEFINE_SETONE_SYCL(double); template struct FillFunctor { - void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes::Flat out, + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes::Flat out, typename TTypes::ConstScalar in) { out.device(d) = out.constant(in()); } @@ -150,8 +151,7 @@ struct FillFunctor { } }; -#define DEFINE_FILL_SYCL(T) \ - template struct FillFunctor; +#define DEFINE_FILL_SYCL(T) template struct FillFunctor; DEFINE_FILL_SYCL(float); DEFINE_FILL_SYCL(double); TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL) diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc index 47f4189c30f..135d0023458 100644 --- a/tensorflow/core/kernels/fractional_avg_pool_op.cc +++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc @@ -232,8 +232,9 @@ class FractionalAvgPoolGradOp : public OpKernel { // Grab the inputs. const Tensor& orig_input_tensor_shape = context->input(0); - OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 && - orig_input_tensor_shape.NumElements() == 4, + OP_REQUIRES(context, + orig_input_tensor_shape.dims() == 1 && + orig_input_tensor_shape.NumElements() == 4, errors::InvalidArgument("original input tensor shape must be" "1-dimensional and 4 elements")); const Tensor& out_backprop = context->input(1); diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h index df0bbbfa066..2d7a230fc00 100644 --- a/tensorflow/core/kernels/fractional_pool_common.h +++ b/tensorflow/core/kernels/fractional_pool_common.h @@ -57,7 +57,7 @@ static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) { // * sum(generated_diff_pooling_sequence) = input_length // * Let's define floor(input_length / output_length) = K, then // K <= generated_diff_pooling_sequence[i] <= K+1 -// For example, when input_length = 10, output_length = 6, the followings are +// For example, when input_length = 10, output_length = 6, the following are // valid pooling sequence: // * [1, 2, 2, 1, 2, 2] // * [1, 1, 2, 2, 2, 2] diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index ef9e8484132..9d4bc35ba89 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -253,22 +253,21 @@ class SymbolicGradientOp : public AsyncOpKernel { args.push_back(ctx->input(i)); } std::vector* rets = new std::vector; - lib->Run( - opts, handle, args, rets, [ctx, done, rets](const Status& status) { - if (!status.ok()) { - ctx->SetStatus(status); - } else if (rets->size() != ctx->num_outputs()) { - ctx->SetStatus(errors::InvalidArgument( - "SymGrad expects to return ", ctx->num_outputs(), - " tensor(s), but get ", rets->size(), " tensor(s) instead.")); - } else { - for (size_t i = 0; i < rets->size(); ++i) { - ctx->set_output(i, (*rets)[i]); - 
} - } - delete rets; - done(); - }); + lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) { + if (!status.ok()) { + ctx->SetStatus(status); + } else if (rets->size() != ctx->num_outputs()) { + ctx->SetStatus(errors::InvalidArgument( + "SymGrad expects to return ", ctx->num_outputs(), + " tensor(s), but get ", rets->size(), " tensor(s) instead.")); + } else { + for (size_t i = 0; i < rets->size(); ++i) { + ctx->set_output(i, (*rets)[i]); + } + } + delete rets; + done(); + }); } private: diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc index a8484390b92..4a67b2b3a30 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc @@ -68,7 +68,8 @@ void InvVarianceToVariance::operator()(const Eigen::GpuDevice& d, template void SetNanFunctor::operator()(const Eigen::GpuDevice& d, typename TTypes::Flat out) { - To32Bit(out).device(d) = To32Bit(out).constant(Eigen::NumTraits::quiet_NaN()); + To32Bit(out).device(d) = + To32Bit(out).constant(Eigen::NumTraits::quiet_NaN()); } template class VarianceToInvVariance; diff --git a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc index 6d4a9dfdef4..37edd1ce0f9 100644 --- a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc index b084a972049..f3b24b2341e 100644 --- a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc index 9dd795b94e8..e9ffad17861 100644 --- a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc index 4a68a5b5803..020f18b1895 100644 --- a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc index 2d6c82826cf..a8f07f4bad3 100644 --- a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc index 81b6e491248..f5dd47a052c 100644 --- a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc index d91a351c596..4d736a21602 100644 --- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h index 0c0e548a909..f1f3f199df1 100644 --- a/tensorflow/core/kernels/fuzzing/fuzz_session.h +++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_ -#define LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_ +#ifndef TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ +#define TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ #include "tensorflow/cc/framework/scope.h" #include "tensorflow/core/graph/graph.h" @@ -153,4 +153,4 @@ class FuzzStringInputOp : public FuzzSession { } // end namespace fuzzing } // end namespace tensorflow -#endif // LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_ +#endif // TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_ diff --git a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc index ac3a12aa399..5c3fc4a2795 100644 --- a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc index 978fcd10282..c90ad2cfeb7 100644 --- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc index 7d1aa1fbf3a..738d78e99a0 100644 --- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc index 94255d215e5..e98363ffbf1 100644 --- a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc +++ b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/kernels/fuzzing/fuzz_session.h" namespace tensorflow { namespace fuzzing { diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc index dde08b37eac..e6fefe643b7 100644 --- a/tensorflow/core/kernels/gather_functor.cc +++ b/tensorflow/core/kernels/gather_functor.cc @@ -25,12 +25,12 @@ typedef Eigen::GpuDevice GPUDevice; namespace functor { // Forward declarations of the functor specializations for GPU. -#define DECLARE_GPU_SPECS_INDEX(T, Index) \ - template <> \ - int64 GatherFunctor::operator()( \ +#define DECLARE_GPU_SPECS_INDEX(T, Index) \ + template <> \ + int64 GatherFunctor::operator()( \ OpKernelContext* ctx, typename TTypes::ConstTensor Tparams, \ - typename TTypes::ConstFlat Tindices, \ - typename TTypes::Tensor Tout); \ + typename TTypes::ConstFlat Tindices, \ + typename TTypes::Tensor Tout); \ extern template struct GatherFunctor; #define DECLARE_GPU_SPECS(T) \ diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h index 1e429a037e8..16ccb03b850 100644 --- a/tensorflow/core/kernels/gather_functor.h +++ b/tensorflow/core/kernels/gather_functor.h @@ -18,12 +18,12 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/platform/prefetch.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -52,21 +52,23 @@ SliceIndex HandleCopies(OpKernelContext* ctx, const size_t slice_bytes = slice_elems * sizeof(T); auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); mutex mu; - // Store the value of invalidate index for printing error information, it's a shared variable. + // Store the value of invalidate index for printing error information, it's a + // shared variable. SliceIndex result = -1; - auto work = [&] (int64 start, int64 end) { + auto work = [&](int64 start, int64 end) { SliceIndex batch_idx = static_cast(start / indices_size); SliceIndex indices_idx = static_cast(start % indices_size); SliceIndex batch_idx_end = static_cast(end / indices_size); SliceIndex indices_idx_end = static_cast(end % indices_size); while ((batch_idx < batch_idx_end) || - (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) { + (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) { SliceIndex i_next = indices_idx + 1; SliceIndex b_next = batch_idx + 1; if ((batch_idx == batch_idx_end && i_next < indices_idx_end) || - (i_next < indices_size)) { - port::prefetch(¶ms(batch_idx, indices(i_next), 0)); + (i_next < indices_size)) { + port::prefetch( + ¶ms(batch_idx, indices(i_next), 0)); port::prefetch(&out(batch_idx, i_next, 0)); b_next = batch_idx; } else if (b_next <= batch_idx_end) { @@ -85,11 +87,12 @@ SliceIndex HandleCopies(OpKernelContext* ctx, // ahead-of-time compilation binary size). if (is_simple_type::value) { // Avoid auto-promotion to Index from SliceIndex by casting. 
- memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems, - params_base + (batch_idx * static_cast(limit) + - static_cast(index)) * - slice_elems, - slice_bytes); + memcpy( + out_base + (batch_idx * indices_size + indices_idx) * slice_elems, + params_base + (batch_idx * static_cast(limit) + + static_cast(index)) * + slice_elems, + slice_bytes); } else { // For non-"simple" types (e.g. strings). out.template chip<1>(indices_idx) = params.template chip<1>(index); @@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx, } }; - Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size, - slice_elems * sizeof(T), work); + Shard(worker_threads->num_threads, worker_threads->workers, + batch_size * indices_size, slice_elems * sizeof(T), work); return result; } @@ -117,16 +120,16 @@ struct GatherFunctorCPU { bool use_large = (slice_size > std::numeric_limits::max() || params.size() > std::numeric_limits::max() || N > std::numeric_limits::max()); -#define CALL(elems) \ - do { \ - if (use_large) { \ - bad_i = HandleCopies(ctx, params, indices, \ - slice_size, out); \ - } else { \ - const int32 small_slice = static_cast(slice_size); \ - bad_i = HandleCopies(ctx, params, indices, \ - small_slice, out); \ - } \ +#define CALL(elems) \ + do { \ + if (use_large) { \ + bad_i = HandleCopies(ctx, params, indices, \ + slice_size, out); \ + } else { \ + const int32 small_slice = static_cast(slice_size); \ + bad_i = HandleCopies(ctx, params, indices, \ + small_slice, out); \ + } \ } while (0) if (slice_size == 10) @@ -143,7 +146,8 @@ struct GatherFunctorCPU { template struct GatherFunctor { - int64 operator()(OpKernelContext* ctx, typename TTypes::ConstTensor params, + int64 operator()(OpKernelContext* ctx, + typename TTypes::ConstTensor params, typename TTypes::ConstFlat indices, typename TTypes::Tensor out); }; diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc index 239d5d2e990..08adf4badbc 100644 --- a/tensorflow/core/kernels/gather_op.cc +++ b/tensorflow/core/kernels/gather_op.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/gather_functor.h" #include "tensorflow/core/platform/mem.h" @@ -106,8 +108,7 @@ class GatherOp : public OpKernel { auto out_flat = out->shaped({outer_size, N, inner_size}); functor::GatherFunctor functor; - int64 bad_i = functor(c, params_flat, - indices_flat, out_flat); + int64 bad_i = functor(c, params_flat, indices_flat, out_flat); OP_REQUIRES( c, bad_i < 0, diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc index f0d7c670a62..4040bf52bff 100644 --- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc +++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc @@ -46,7 +46,7 @@ GraphTransferUtils::GetTopNFloatResults(const float* const data, GetTopNFloatResults(data, labels, element_count); LOG(INFO) << "=== Dump ranking ==="; for (int i = 0; i < top_n; ++i) { - const std::tuple &entry = queue.top(); + const std::tuple& entry = queue.top(); LOG(INFO) << i << ": " << std::get<1>(entry) << ", " << std::get<2>(entry) << ", " << std::get<0>(entry); queue.pop(); diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h index a360d188cc2..0d43d028cdb 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer.h +++ b/tensorflow/core/kernels/hexagon/graph_transferer.h @@ -181,8 +181,8 @@ class GraphTransferer { void AppendNodeInputParams(const int id, const Node& node, const std::vector& extra_inputs); - void AppendNodeOutputParams(const ShapeRefiner& shape_refiner, - const int id, const Node& node); + void AppendNodeOutputParams(const ShapeRefiner& shape_refiner, const int id, + const Node& node); static std::array BuildShapeArray( const shape_inference::ShapeHandle& shape_handle, diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc index 536d295506c..20b09f144ba 100644 --- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc +++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc @@ -42,8 +42,7 @@ constexpr float VALUE_TOLERANCE_FLOAT = 1e-8f; class GraphTransfererTest : public ::testing::Test { protected: - void SetUp() final { - } + void SetUp() final {} GraphTransferer gt_; }; @@ -61,7 +60,7 @@ class TestGraphTransferOpsDefinitions : public IRemoteFusedGraphOpsDefinitions { } } return -1; -} + } private: const std::vector op_types_{"INPUT", "OUTPUT", "Conv2D", diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc index 71bc4187b74..3f794dfb1a0 100644 --- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc +++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc @@ -420,7 +420,7 @@ TEST(GraphTransferer, false, // is_text_proto false, // shape_inference_for_unknown_shape true // dry_run_for_unknown_shape - ); + ); ASSERT_TRUE(status.ok()) << status; prof.Stop(); prof.DumpStatistics("LoadGraphFromProtoFile"); @@ -487,7 +487,7 @@ TEST(GraphTransferer, false, // is_text_proto true, // shape_inference_for_unknown_shape false // dry_run_for_unknown_shape - ); + ); ASSERT_TRUE(status.ok()) << status; prof.Stop(); 
prof.DumpStatistics("LoadGraphFromProtoFile"); @@ -556,7 +556,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) { false, // is_text_proto false, // shape_inference_for_unknown_shape true // dry_run_for_unknown_shape - ); + ); const GraphTransferInfo& gfi0 = gt0.GetGraphTransferInfo(); ASSERT_TRUE(status.ok()); @@ -576,7 +576,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) { false, // is_text_proto true, // shape_inference_for_unknown_shape false // dry_run_for_unknown_shape - ); + ); const GraphTransferInfo& gfi1 = gt1.GetGraphTransferInfo(); ASSERT_TRUE(status.ok()); diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h index 789a7ce7a3d..d303e9c877e 100644 --- a/tensorflow/core/kernels/hinge-loss.h +++ b/tensorflow/core/kernels/hinge-loss.h @@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater { // valid value for new dual = 0 // c. new optimal value > 1.0. Then new optimal value should be set to 1.0. const double candidate_optimal_dual = - current_dual + - (label - wx) / - (num_loss_partitions * example_weight * weighted_example_norm); + current_dual + (label - wx) / (num_loss_partitions * example_weight * + weighted_example_norm); if (label * candidate_optimal_dual < 0) { return 0.0; } diff --git a/tensorflow/core/kernels/histogram_op_gpu.cu.cc b/tensorflow/core/kernels/histogram_op_gpu.cu.cc index c2bb958be8b..a88e9b0ddcd 100644 --- a/tensorflow/core/kernels/histogram_op_gpu.cu.cc +++ b/tensorflow/core/kernels/histogram_op_gpu.cu.cc @@ -17,16 +17,16 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/histogram_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "external/cub_archive/cub/device/device_histogram.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/histogram_op.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor { /* num_samples */ num_samples, /* stream */ stream); if (err != cudaSuccess) { - return errors::Internal("Could not launch HistogramRange: ", - cudaGetErrorString(err), "."); + return errors::Internal( + "Could not launch HistogramRange: ", cudaGetErrorString(err), "."); } return Status::OK(); diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc index 1db9263e5d3..a18a72c66dc 100644 --- a/tensorflow/core/kernels/identity_op.cc +++ b/tensorflow/core/kernels/identity_op.cc @@ -128,6 +128,7 @@ REGISTER_GPU_KERNEL(Variant); REGISTER_GPU_HOST_KERNEL(int32); REGISTER_GPU_HOST_KERNEL(bool); +REGISTER_GPU_HOST_KERNEL(string); #undef REGISTER_GPU_HOST_KERNEL diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h index f088315ff53..faf997be05c 100644 --- a/tensorflow/core/kernels/image_resizer_state.h +++ b/tensorflow/core/kernels/image_resizer_state.h @@ -109,8 +109,9 @@ struct ImageResizerState { ValidateAndCalculateOutputSize(context, input); if (!context->status().ok()) return; OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({input.dim_size(0), out_height, - out_width, input.dim_size(3)}), + 0, + 
TensorShape({input.dim_size(0), out_height, + out_width, input.dim_size(3)}), &output)); } @@ -168,8 +169,9 @@ struct ImageResizerGradientState { CalculateResizeScale(original_width, resized_width, align_corners_); output = nullptr; OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({batch_size, original_height, - original_width, channels}), + 0, + TensorShape({batch_size, original_height, + original_width, channels}), &output)); } diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc index e2861ae090c..c37055239c2 100644 --- a/tensorflow/core/kernels/in_topk_op.cc +++ b/tensorflow/core/kernels/in_topk_op.cc @@ -17,11 +17,11 @@ limitations under the License. #define EIGEN_USE_THREADS +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/kernels/bounds_check.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -98,36 +98,36 @@ class InTopK : public OpKernel { int k_; }; -REGISTER_KERNEL_BUILDER( - Name("InTopK").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopK").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopK") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopK") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopKV2").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("k") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); -REGISTER_KERNEL_BUILDER( - Name("InTopKV2").Device(DEVICE_CPU) - .HostMemory("predictions") - .HostMemory("targets") - .HostMemory("k") - .HostMemory("precision") - .TypeConstraint("T"), - InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopKV2") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("k") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); +REGISTER_KERNEL_BUILDER(Name("InTopKV2") + .Device(DEVICE_CPU) + .HostMemory("predictions") + .HostMemory("targets") + .HostMemory("k") + .HostMemory("precision") + .TypeConstraint("T"), + InTopK); } // namespace tensorflow diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc index 7728ba850c9..a71d047ed1a 100644 --- a/tensorflow/core/kernels/inplace_ops.cc +++ b/tensorflow/core/kernels/inplace_ops.cc @@ -27,13 +27,13 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SyclDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace functor { template -Status DoParallelConcatUpdate(const Device& d, const Tensor& value, - int32 loc, Tensor* output) { +Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc, + Tensor* output) { auto Tvalue = value.shaped({1, value.NumElements()}); auto Toutput = output->flat_outer_dims(); auto nrows = Toutput.dimension(0); @@ -74,7 +74,7 @@ Status DoParallelConcat(const SyclDevice& d, const 
Tensor& value, int32 loc, return errors::InvalidArgument("Unsupported data type: ", value.dtype()); } } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace functor @@ -207,7 +207,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate") .HostMemory("output") .TypeConstraint("T"), ParallelConcatUpdate); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc index f8ed9351579..f561287f7a1 100644 --- a/tensorflow/core/kernels/l2loss_op.cc +++ b/tensorflow/core/kernels/l2loss_op.cc @@ -17,8 +17,8 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/kernels/l2loss_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc index 36907fb5716..b58bcf58348 100644 --- a/tensorflow/core/kernels/linalg_ops_common.cc +++ b/tensorflow/core/kernels/linalg_ops_common.cc @@ -108,7 +108,6 @@ void LinearAlgebraOp::Compute(OpKernelContext* context) { auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); Shard(worker_threads.num_threads, worker_threads.workers, batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard); - } template diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc index 31a427f2c90..2474fe4d564 100755 --- a/tensorflow/core/kernels/lmdb_reader_op.cc +++ b/tensorflow/core/kernels/lmdb_reader_op.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/framework/reader_op_kernel.h" #include "tensorflow/core/framework/reader_base.h" +#include "tensorflow/core/framework/reader_op_kernel.h" #include "tensorflow/core/lib/core/errors.h" #include @@ -26,9 +26,8 @@ namespace tensorflow { class LMDBReader : public ReaderBase { public: - LMDBReader(const string& node_name, Env* env) + LMDBReader(const string& node_name, Env* /*unused*/) : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")), - env_(env), mdb_env_(nullptr), mdb_dbi_(0), mdb_txn_(nullptr), @@ -77,15 +76,13 @@ class LMDBReader : public ReaderBase { *at_end = true; return Status::OK(); } - } - else { + } else { if (Seek(MDB_NEXT) == false) { *at_end = true; return Status::OK(); } } - *key = string(static_cast(mdb_key_.mv_data), - mdb_key_.mv_size); + *key = string(static_cast(mdb_key_.mv_data), mdb_key_.mv_size); *value = string(static_cast(mdb_value_.mv_data), mdb_value_.mv_size); *produced = true; @@ -109,7 +106,6 @@ class LMDBReader : public ReaderBase { } } - Env* const env_; MDB_env* mdb_env_; MDB_dbi mdb_dbi_; @@ -123,13 +119,10 @@ class LMDBReaderOp : public ReaderOpKernel { explicit LMDBReaderOp(OpKernelConstruction* context) : ReaderOpKernel(context) { Env* env = context->env(); - SetReaderFactory([this, env]() { - return new LMDBReader(name(), env); - }); + SetReaderFactory([this, env]() { return new LMDBReader(name(), env); }); } }; -REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), - LMDBReaderOp); +REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), LMDBReaderOp); } // namespace tensorflow diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h index 2765f42bbdc..6479e6f5dc3 100644 --- a/tensorflow/core/kernels/logistic-loss.h +++ b/tensorflow/core/kernels/logistic-loss.h @@ -122,10 +122,9 @@ class LogisticLossUpdater : public DualLossUpdater { num_loss_partitions * weighted_example_norm * example_weight * (0.5 * (1 + tanhx) / label - current_dual); - const double denominator = -2 * label - - num_loss_partitions * weighted_example_norm * - example_weight * (1 - tanhx * tanhx) * 0.5 / - label; + const double denominator = + -2 * label - num_loss_partitions * weighted_example_norm * + example_weight * (1 - tanhx * tanhx) * 0.5 / label; return x - numerator / denominator; } }; diff --git a/tensorflow/core/kernels/loss_test.cc b/tensorflow/core/kernels/loss_test.cc index 89f0677e1f5..460d65c5c27 100644 --- a/tensorflow/core/kernels/loss_test.cc +++ b/tensorflow/core/kernels/loss_test.cc @@ -32,14 +32,17 @@ namespace { TEST(LogisticLoss, ComputePrimalLoss) { LogisticLossUpdater loss_updater; - EXPECT_NEAR(0.693147, loss_updater.ComputePrimalLoss( - 0 /* wx */, 1 /* label */, 1 /* example weight */), + EXPECT_NEAR(0.693147, + loss_updater.ComputePrimalLoss(0 /* wx */, 1 /* label */, + 1 /* example weight */), 1e-3); - EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */, - 1 /* example weight */), + EXPECT_NEAR(0.0, + loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */, + 1 /* example weight */), 1e-3); - EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */, - 1 /* example weight */), + EXPECT_NEAR(0.0, + loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */, + 1 /* example weight */), 1e-3); } @@ -53,31 +56,35 @@ TEST(LogisticLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1 /* current dual */, 1 /* label */, 1 /* example weight */), 
1e-3); - EXPECT_NEAR(-0.693147, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1 /* label */, - 1 /* example weight */), - 1e-3); + EXPECT_NEAR( + -0.693147, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1 /* label */, + 1 /* example weight */), + 1e-3); } TEST(LogisticLoss, ComputeUpdatedDual) { LogisticLossUpdater loss_updater; - EXPECT_NEAR(0.479, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.5 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.479, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.5 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.031, loss_updater.ComputeUpdatedDual( - 2 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, 0.1 /* current_dual */, - -0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.031, + loss_updater.ComputeUpdatedDual( + 2 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, 0.1 /* current_dual */, + -0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(SquaredLoss, ComputePrimalLoss) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(0.5, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(40.5, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -95,43 +102,50 @@ TEST(SquaredLoss, ComputePrimalLoss) { TEST(SquaredLoss, ComputeDualLoss) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); - EXPECT_NEAR(0.66, loss_updater.ComputeDualLoss(0.2 /* current dual */, - -1.0 /* label */, - 3.0 /* example weight */), - 1e-3); - EXPECT_NEAR(-0.375, loss_updater.ComputeDualLoss(1.5 /* current dual */, - 1.0 /* label */, - 1.0 /* example weight */), - 1e-3); - EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + 0.66, + loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, + 3.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + -0.375, + loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, + 1.0 /* example weight */), + 1e-3); + EXPECT_NEAR( + -1.125, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight */), + 1e-3); } TEST(SquaredLoss, ComputeUpdatedDual) { SquaredLossUpdater loss_updater; - EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.3 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.336, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.3 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual( - 5 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.427, + loss_updater.ComputeUpdatedDual( + 5 /* num partitions */, -1.0 /* label */, + 1.0 /* example 
weight */, -0.4 /* current_dual */, + 0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(HingeLoss, ComputePrimalLoss) { HingeLossUpdater loss_updater; - EXPECT_NEAR(1.0, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(1.0, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -149,10 +163,11 @@ TEST(HingeLoss, ComputePrimalLoss) { TEST(HingeLoss, ComputeDualLoss) { HingeLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); EXPECT_NEAR( std::numeric_limits::max(), loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, @@ -163,10 +178,11 @@ TEST(HingeLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, 1.0 /* example weight */), 1e-3); - EXPECT_NEAR(-1.5, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + -1.5, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight */), + 1e-3); } TEST(HingeLoss, ConvertLabel) { @@ -195,28 +211,31 @@ TEST(HingeLoss, ComputeUpdatedDual) { // weighted_example_norm=100.0, it turns out that the optimal value to update // the dual to is 0.507 which is within the permitted range and thus should be // the value returned. - EXPECT_NEAR(0.507, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.5 /* current_dual */, - 0.3 /* wx */, 100.0 /* weighted_example_norm */), + EXPECT_NEAR(0.507, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.5 /* current_dual */, + 0.3 /* wx */, 100.0 /* weighted_example_norm */), 1e-3); // When label=-1.0, example_weight=1.0, current_dual=0.4, wx=0.6, // weighted_example_norm=10.0 and num_loss_partitions=10, it turns out that // the optimal value to update the dual to is 0.384 which is within the // permitted range and thus should be the value returned. - EXPECT_NEAR(-0.416, loss_updater.ComputeUpdatedDual( - 10 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.6 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.416, + loss_updater.ComputeUpdatedDual( + 10 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, -0.4 /* current_dual */, + 0.6 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); // When label=1.0, example_weight=1.0, current_dual=-0.5, wx=0.3 and // weighted_example_norm=10.0, it turns out that the optimal value to update // the dual to is -0.43. However, this is outside the allowed [0.0, 1.0] range // and hence the closest permitted value (0.0) should be returned instead. 
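// ----------------------------------------------------------------------------
// Editor's note: illustrative sketch only, NOT part of this patch or of the
// TensorFlow sources. The expected values in the ComputeUpdatedDual tests
// above and below follow from the dual update shown in the hinge-loss.h hunk
// earlier in this section; the clamping step is inferred from the test
// comments, and the function name is hypothetical.
#include <cstdio>

double UpdatedHingeDualSketch(int num_loss_partitions, double label,
                              double example_weight, double current_dual,
                              double wx, double weighted_example_norm) {
  // Unconstrained optimum of the dual variable for the hinge loss.
  const double candidate =
      current_dual + (label - wx) / (num_loss_partitions * example_weight *
                                     weighted_example_norm);
  // Project back so that label * dual stays within [0, 1].
  if (label * candidate < 0.0) return 0.0;
  if (label * candidate > 1.0) return label;
  return candidate;
}

int main() {
  // 0.5 + (1.0 - 0.3) / (1 * 1.0 * 100.0) = 0.507, matching the
  // EXPECT_NEAR(0.507, ...) case above.
  std::printf("%.3f\n", UpdatedHingeDualSketch(1, 1.0, 1.0, 0.5, 0.3, 100.0));
  // Candidate -0.43 falls outside the permitted range, so 0.0 is returned,
  // matching the EXPECT_NEAR(0.0, ...) case below.
  std::printf("%.3f\n", UpdatedHingeDualSketch(1, 1.0, 1.0, -0.5, 0.3, 10.0));
  return 0;
}
// ----------------------------------------------------------------------------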
- EXPECT_NEAR(0.0, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, -0.5 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.0, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, -0.5 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); // When label=-1.0, example_weight=2.0, current_dual=-1.0, wx=0.3 and @@ -224,17 +243,19 @@ TEST(HingeLoss, ComputeUpdatedDual) { // the dual to is -1.065. However, this is outside the allowed [-1.0, 0.0] // range and hence the closest permitted value (-1.0) should be returned // instead. - EXPECT_NEAR(-1.0, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, -1.0 /* label */, - 2.0 /* example weight */, -1.0 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-1.0, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, -1.0 /* label */, + 2.0 /* example weight */, -1.0 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } TEST(SmoothHingeLoss, ComputePrimalLoss) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, - 1.0 /* example weight */), + EXPECT_NEAR(0.5, + loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */, + 1.0 /* example weight */), 1e-3); EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */, @@ -252,10 +273,11 @@ TEST(SmoothHingeLoss, ComputePrimalLoss) { TEST(SmoothHingeLoss, ComputeDualLoss) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */, - -1.0 /* label */, - 1.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + 0.0, + loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */, + 1.0 /* example weight */), + 1e-3); EXPECT_NEAR( std::numeric_limits::max(), loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */, @@ -266,24 +288,27 @@ TEST(SmoothHingeLoss, ComputeDualLoss) { loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */, 1.0 /* example weight */), 1e-3); - EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */, - 1.0 /* label */, - 3.0 /* example weight */), - 1e-3); + EXPECT_NEAR( + -1.125, + loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */, + 3.0 /* example weight */), + 1e-3); } TEST(SmoothHingeLoss, ComputeUpdatedDual) { SmoothHingeLossUpdater loss_updater; - EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual( - 1 /* num partitions */, 1.0 /* label */, - 1.0 /* example weight */, 0.3 /* current_dual */, - 0.3 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(0.336, + loss_updater.ComputeUpdatedDual( + 1 /* num partitions */, 1.0 /* label */, + 1.0 /* example weight */, 0.3 /* current_dual */, + 0.3 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); - EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual( - 5 /* num partitions */, -1.0 /* label */, - 1.0 /* example weight */, -0.4 /* current_dual */, - 0.8 /* wx */, 10.0 /* weighted_example_norm */), + EXPECT_NEAR(-0.427, + loss_updater.ComputeUpdatedDual( + 5 /* num partitions */, -1.0 /* label */, + 1.0 /* example weight */, -0.4 /* current_dual */, + 0.8 /* wx */, 10.0 /* weighted_example_norm */), 1e-3); } diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc index c905ebc84a6..c3a59c95762 100644 --- a/tensorflow/core/kernels/lrn_op.cc +++ 
b/tensorflow/core/kernels/lrn_op.cc @@ -229,10 +229,11 @@ class LRNOp : public OpKernel { explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); float tmp; OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp)); @@ -247,9 +248,10 @@ class LRNOp : public OpKernel { const Tensor& in = context->input(0); OP_REQUIRES(context, in.dims() == 4, errors::InvalidArgument("in must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(in.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES( + context, + FastBoundsCheck(in.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); // Cast to platform-specific int to avoid conversion warnings. const int batch = static_cast(in.dim_size(0)); const int rows = static_cast(in.dim_size(1)); @@ -448,10 +450,11 @@ class LRNGradOp : public OpKernel { explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); float tmp; OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp)); diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc index 5eb060f6641..cdff7bad5fe 100644 --- a/tensorflow/core/kernels/matching_files_op.cc +++ b/tensorflow/core/kernels/matching_files_op.cc @@ -45,15 +45,14 @@ class MatchingFilesOp : public OpKernel { int num_files = 0; std::vector> all_fnames(num_patterns); for (int i = 0; i < num_patterns; i++) { - OP_REQUIRES_OK( - context, - context->env()->GetMatchingPaths(patterns(i), &all_fnames[i])); + OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i), + &all_fnames[i])); num_files += all_fnames[i].size(); } Tensor* output_t = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output( - "filenames", TensorShape({num_files}), &output_t)); + OP_REQUIRES_OK( + context, context->allocate_output("filenames", TensorShape({num_files}), + &output_t)); auto output = output_t->vec(); int index = 0; for (int i = 0; i < num_patterns; ++i) { diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc index cb68690f284..f499ce6519d 100644 --- a/tensorflow/core/kernels/matmul_op.cc +++ b/tensorflow/core/kernels/matmul_op.cc @@ -261,12 +261,12 @@ struct LaunchMatMul { std::vector* algorithms, bool use_autotune, Tensor* out) { using perftools::gputools::blas::AlgorithmConfig; using perftools::gputools::blas::ComputationType; - using perftools::gputools::blas::ProfileResult; - using perftools::gputools::blas::Transpose; using 
perftools::gputools::blas::kDefaultAlgorithm; using perftools::gputools::blas::kDefaultBlasGemm; using perftools::gputools::blas::kDefaultBlasGemv; using perftools::gputools::blas::kNoAlgorithm; + using perftools::gputools::blas::ProfileResult; + using perftools::gputools::blas::Transpose; Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose}; const uint64 m = a.dim_size(1 - dim_pair[0].first); const uint64 k = a.dim_size(dim_pair[0].first); diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h index 6398da2fb95..628895ca86f 100644 --- a/tensorflow/core/kernels/matmul_op.h +++ b/tensorflow/core/kernels/matmul_op.h @@ -30,7 +30,8 @@ struct MatMulTypes { typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> out_type; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>, - Eigen::Aligned> in_type; + Eigen::Aligned> + in_type; }; template <typename Device, typename In0, typename In1, typename Out, diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc --- a/tensorflow/core/kernels/matrix_band_part_op.cc +++ b/tensorflow/core/kernels/matrix_band_part_op.cc @@ ... @@ class MatrixBandPartOp : public OpKernel { - const int64 num_lower = num_lower_in.scalar<int64>()(); + + auto as_int64_scalar = [](const Tensor& tensor) -> int64 { + if (tensor.dtype() == DT_INT32) { + return tensor.scalar<int32>()(); + } else { + return tensor.scalar<int64>()(); + } + }; + const int64 num_lower = as_int64_scalar(num_lower_in); OP_REQUIRES( context, num_lower <= input_reshaped.dimension(1), errors::InvalidArgument( @@ -73,7 +81,7 @@ class MatrixBandPartOp : public OpKernel { OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in.shape()), errors::InvalidArgument("num_upper must be scalar, got shape ", num_upper_in.shape().DebugString())); - const int64 num_upper = num_upper_in.scalar<int64>()(); + const int64 num_upper = as_int64_scalar(num_upper_in); OP_REQUIRES(context, num_upper <= input_reshaped.dimension(2), errors::InvalidArgument("num_upper must be negative or less or " "equal to number of columns (", diff --git a/tensorflow/core/kernels/matrix_exponential_op.cc b/tensorflow/core/kernels/matrix_exponential_op.cc index 4cc3f32f7e4..99db8983013 100644 --- a/tensorflow/core/kernels/matrix_exponential_op.cc +++ b/tensorflow/core/kernels/matrix_exponential_op.cc @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" - namespace tensorflow { template <class Scalar> @@ -40,7 +39,8 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> { MatrixMaps* outputs) final { const ConstMatrixMap& input = inputs[0]; if (input.rows() == 0) return; - using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>; + using Matrix = + Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>; Matrix tmp = input; outputs->at(0) = tmp.exp(); } @@ -51,9 +51,9 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> { REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<float>), float); REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<double>), double); -REGISTER_LINALG_OP("MatrixExponential", - (MatrixExponentialOp<complex64>), complex64); -REGISTER_LINALG_OP("MatrixExponential", - (MatrixExponentialOp<complex128>), complex128); +REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex64>), + complex64); +REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex128>), + complex128); } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_logarithm_op.cc b/tensorflow/core/kernels/matrix_logarithm_op.cc index cf0007b5b67..22ca094e243 100644 --- a/tensorflow/core/kernels/matrix_logarithm_op.cc +++ b/tensorflow/core/kernels/matrix_logarithm_op.cc @@ -26,7 +26,6 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" - namespace tensorflow { template @@ -40,7 +39,8 @@ class MatrixLogarithmOp : public LinearAlgebraOp { MatrixMaps* outputs) final { const ConstMatrixMap& input = inputs[0]; if (input.rows() == 0) return; - using Matrix = Eigen::Matrix; + using Matrix = + Eigen::Matrix; Matrix tmp = input; outputs->at(0) = tmp.log(); } @@ -53,9 +53,9 @@ class MatrixLogarithmOp : public LinearAlgebraOp { // logarithm. If all eigenvalues are positive, then this returns the correct // logarithm, however checking for positive definiteness adds significant // overhead. Therefore at present we only register this Op for complex types. -REGISTER_LINALG_OP("MatrixLogarithm", - (MatrixLogarithmOp), complex64); -REGISTER_LINALG_OP("MatrixLogarithm", - (MatrixLogarithmOp), complex128); +REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp), + complex64); +REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp), + complex128); } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc index 9dd665392bc..502d593474e 100644 --- a/tensorflow/core/kernels/matrix_set_diag_op.cc +++ b/tensorflow/core/kernels/matrix_set_diag_op.cc @@ -69,8 +69,8 @@ class MatrixSetDiagOp : public OpKernel { errors::InvalidArgument( "must have diagonal.shape == input.shape[:-2] + " "min(input.shape[-2:]), but received input shape: ", - input_shape.DebugString(), " and diagonal shape: ", - diag_shape.DebugString())); + input_shape.DebugString(), + " and diagonal shape: ", diag_shape.DebugString())); if (input.NumElements() == 0) { // This is a no-op. diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc index 2eefadad494..9be7408012b 100644 --- a/tensorflow/core/kernels/maxpooling_op.cc +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/kernels/maxpooling_op.h" #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -37,7 +38,6 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #if GOOGLE_CUDA #include "tensorflow/core/kernels/maxpooling_op_gpu.h" @@ -89,7 +89,6 @@ static void SpatialMaxPoolWithArgMaxHelper( // max value. auto shard = [¶ms, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop, &output_arg_max, &out_backprop](int64 start, int64 limit) { - const int32 depth = params.depth; const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; @@ -180,7 +179,6 @@ static void SpatialMaxPoolWithArgMaxHelper( input_backprop_flat(input_backprop_index) += out_backprop_flat(index); } } - }; const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols * @@ -567,7 +565,7 @@ class MaxPoolingGradGradOp : public OpKernel { // tensor_out_as_matrix with the corresponding values in // top_diff_as_matrix. 
auto shard = [¶ms, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat]( - int64 start, int64 limit) { + int64 start, int64 limit) { const int32 depth = params.depth; const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc index f8daaca4c94..0c7a236b2ff 100644 --- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc +++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc @@ -450,10 +450,10 @@ bool MaxPoolBackwardWithArgmax::operator()( T* bottom_diff, const Eigen::GpuDevice& d) { const int kThreadsPerBlock = 1024; SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock, - kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff); + kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff); MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, d.stream()>>>( - output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff); + output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff); return d.ok(); } diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc index 9fed01189fc..39e60c9fcef 100644 --- a/tensorflow/core/kernels/meta_support.cc +++ b/tensorflow/core/kernels/meta_support.cc @@ -98,9 +98,9 @@ typedef gemmlowp::meta::SimpleContext LocalContext; template void MultiThreadGemm(Context* context, const Params& params) { if (params.m <= 4) { - gemmlowp::meta::MultiThreadGemm< - Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, - 1, 8, 8>(context, params); + gemmlowp::meta::MultiThreadGemm< + Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, 1, + 8, 8>(context, params); } else { if (params.m >= params.n) { gemmlowp::meta::MultiThreadGemm< diff --git a/tensorflow/core/kernels/mfcc.cc b/tensorflow/core/kernels/mfcc.cc index 2793005aa26..8c755e0df87 100644 --- a/tensorflow/core/kernels/mfcc.cc +++ b/tensorflow/core/kernels/mfcc.cc @@ -27,21 +27,19 @@ const double kFilterbankFloor = 1e-12; const int kDefaultFilterbankChannelCount = 40; const int kDefaultDCTCoefficientCount = 13; -Mfcc::Mfcc() : initialized_(false), - lower_frequency_limit_(kDefaultLowerFrequencyLimit), - upper_frequency_limit_(kDefaultUpperFrequencyLimit), - filterbank_channel_count_(kDefaultFilterbankChannelCount), - dct_coefficient_count_(kDefaultDCTCoefficientCount) { } +Mfcc::Mfcc() + : initialized_(false), + lower_frequency_limit_(kDefaultLowerFrequencyLimit), + upper_frequency_limit_(kDefaultUpperFrequencyLimit), + filterbank_channel_count_(kDefaultFilterbankChannelCount), + dct_coefficient_count_(kDefaultDCTCoefficientCount) {} -bool Mfcc::Initialize(int input_length, - double input_sample_rate) { - bool initialized = mel_filterbank_.Initialize(input_length, - input_sample_rate, - filterbank_channel_count_, - lower_frequency_limit_, - upper_frequency_limit_); - initialized &= dct_.Initialize(filterbank_channel_count_, - dct_coefficient_count_); +bool Mfcc::Initialize(int input_length, double input_sample_rate) { + bool initialized = mel_filterbank_.Initialize( + input_length, input_sample_rate, filterbank_channel_count_, + lower_frequency_limit_, upper_frequency_limit_); + initialized &= + dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_); initialized_ = initialized; return initialized; } diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h index 8268f472034..8eee76f7f0c 100644 --- 
a/tensorflow/core/kernels/mfcc.h +++ b/tensorflow/core/kernels/mfcc.h @@ -20,18 +20,17 @@ limitations under the License. #include +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/mfcc_dct.h" #include "tensorflow/core/kernels/mfcc_mel_filterbank.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { class Mfcc { public: Mfcc(); - bool Initialize(int input_length, - double input_sample_rate); + bool Initialize(int input_length, double input_sample_rate); // Input is a single squared-magnitude spectrogram frame. The input spectrum // is converted to linear magnitude and weighted into bands using a diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc index 630de8a5a33..3db3b51e8b6 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank.cc +++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc @@ -38,13 +38,12 @@ namespace tensorflow { MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {} -bool MfccMelFilterbank::Initialize(int input_length, - double input_sample_rate, - int output_channel_count, - double lower_frequency_limit, - double upper_frequency_limit) { +bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate, + int output_channel_count, + double lower_frequency_limit, + double upper_frequency_limit) { num_channels_ = output_channel_count; - sample_rate_ = input_sample_rate; + sample_rate_ = input_sample_rate; input_length_ = input_length; if (num_channels_ < 1) { @@ -85,10 +84,9 @@ bool MfccMelFilterbank::Initialize(int input_length, } // Always exclude DC; emulate HTK. - const double hz_per_sbin = 0.5 * sample_rate_ / - static_cast(input_length_ - 1); - start_index_ = static_cast(1.5 + (lower_frequency_limit / - hz_per_sbin)); + const double hz_per_sbin = + 0.5 * sample_rate_ / static_cast(input_length_ - 1); + start_index_ = static_cast(1.5 + (lower_frequency_limit / hz_per_sbin)); end_index_ = static_cast(upper_frequency_limit / hz_per_sbin); // Maps the input spectrum bin indices to filter bank channels/indices. For @@ -121,12 +119,12 @@ bool MfccMelFilterbank::Initialize(int input_length, weights_[i] = 0.0; } else { if (channel >= 0) { - weights_[i] = (center_frequencies_[channel + 1] - - FreqToMel(i * hz_per_sbin)) / + weights_[i] = + (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) / (center_frequencies_[channel + 1] - center_frequencies_[channel]); } else { weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) / - (center_frequencies_[0] - mel_low); + (center_frequencies_[0] - mel_low); } } } @@ -152,16 +150,16 @@ bool MfccMelFilterbank::Initialize(int input_length, } } if (!bad_channels.empty()) { - LOG(ERROR) << "Missing " << bad_channels.size() << " bands " << - " starting at " << bad_channels[0] << - " in mel-frequency design. " << - "Perhaps too many channels or " << - "not enough frequency resolution in spectrum. (" << - "input_length: " << input_length << - " input_sample_rate: " << input_sample_rate << - " output_channel_count: " << output_channel_count << - " lower_frequency_limit: " << lower_frequency_limit << - " upper_frequency_limit: " << upper_frequency_limit; + LOG(ERROR) << "Missing " << bad_channels.size() << " bands " + << " starting at " << bad_channels[0] + << " in mel-frequency design. " + << "Perhaps too many channels or " + << "not enough frequency resolution in spectrum. 
(" + << "input_length: " << input_length + << " input_sample_rate: " << input_sample_rate + << " output_channel_count: " << output_channel_count + << " lower_frequency_limit: " << lower_frequency_limit + << " upper_frequency_limit: " << upper_frequency_limit; } initialized_ = true; return true; @@ -171,7 +169,7 @@ bool MfccMelFilterbank::Initialize(int input_length, // square root, then summing FFT magnitudes under triangular integration windows // whose widths increase with frequency. void MfccMelFilterbank::Compute(const std::vector &input, - std::vector *output) const { + std::vector *output) const { if (!initialized_) { LOG(ERROR) << "Mel Filterbank not initialized."; return; diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h index 1bdc2dc93b8..37c3936e80d 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank.h +++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h @@ -27,10 +27,8 @@ class MfccMelFilterbank { public: MfccMelFilterbank(); bool Initialize(int input_length, // Number of unique FFT bins fftsize/2+1. - double input_sample_rate, - int output_channel_count, - double lower_frequency_limit, - double upper_frequency_limit); + double input_sample_rate, int output_channel_count, + double lower_frequency_limit, double upper_frequency_limit); // Takes a squared-magnitude spectrogram slice as input, computes a // triangular-mel-weighted linear-magnitude filterbank, and places the result @@ -56,7 +54,7 @@ class MfccMelFilterbank { // FFT bin i contributes to the upper side of mel channel band_mapper_[i] std::vector band_mapper_; int start_index_; // Lowest FFT bin used to calculate mel spectrum. - int end_index_; // Highest FFT bin used to calculate mel spectrum. + int end_index_; // Highest FFT bin used to calculate mel spectrum. TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank); }; diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc index 602dfeb4e54..54f31e1699e 100644 --- a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc +++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc @@ -34,11 +34,9 @@ TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) { input.push_back(i + 1); } const int kChannelCount = 20; - filterbank.Initialize(input.size(), - 22050 /* sample rate */, - kChannelCount /* channels */, - 20.0 /* lower frequency limit */, - 4000.0 /* upper frequency limit */); + filterbank.Initialize( + input.size(), 22050 /* sample rate */, kChannelCount /* channels */, + 20.0 /* lower frequency limit */, 4000.0 /* upper frequency limit */); std::vector output; filterbank.Compute(input, &output); @@ -65,13 +63,10 @@ TEST(MfccMelFilterbankTest, IgnoresExistingContentOfOutputVector) { std::vector input; std::vector output; - filterbank.Initialize(kSampleCount, - 22050 /* sample rate */, - 20 /* channels */, - 20.0 /* lower frequency limit */, + filterbank.Initialize(kSampleCount, 22050 /* sample rate */, + 20 /* channels */, 20.0 /* lower frequency limit */, 4000.0 /* upper frequency limit */); - // First call with nonzero input value, and an empty output vector, // will resize the output and fill it with the correct, nonzero outputs. 
input.assign(kSampleCount, 1.0); diff --git a/tensorflow/core/kernels/mfcc_test.cc b/tensorflow/core/kernels/mfcc_test.cc index cb32df8811e..72c1d331d6e 100644 --- a/tensorflow/core/kernels/mfcc_test.cc +++ b/tensorflow/core/kernels/mfcc_test.cc @@ -36,11 +36,10 @@ TEST(MfccTest, AgreesWithPythonGoldenValues) { std::vector output; mfcc.Compute(input, &output); - std::vector expected = {29.13970072, -6.41568601, -0.61903012, - -0.96778652, -0.26819878, -0.40907028, - -0.15614748, -0.23203119, -0.10481487, - -0.1543029, -0.0769791, -0.10806114, - -0.06047613}; + std::vector expected = { + 29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878, + -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029, + -0.0769791, -0.10806114, -0.06047613}; ASSERT_EQ(expected.size(), output.size()); for (int i = 0; i < output.size(); ++i) { diff --git a/tensorflow/core/kernels/mirror_pad_op.cc b/tensorflow/core/kernels/mirror_pad_op.cc index fbdeaf43ebb..26e1082989f 100644 --- a/tensorflow/core/kernels/mirror_pad_op.cc +++ b/tensorflow/core/kernels/mirror_pad_op.cc @@ -87,8 +87,8 @@ class MirrorPadOp : public OpKernel { const Tpaddings before = paddings(d, 0); // Pad before existing elements. const Tpaddings after = paddings(d, 1); // Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("paddings must be non-negative: ", - before, " ", after)); + errors::InvalidArgument( + "paddings must be non-negative: ", before, " ", after)); if (offset_ == 0) { // SYMMETRIC mode. OP_REQUIRES(context, before <= in0.dim_size(d) && after <= in0.dim_size(d), @@ -296,8 +296,8 @@ class MirrorPadGradOp : public OpKernel { const Tpaddings before = paddings(d, 0); // Pad before existing elements. const Tpaddings after = paddings(d, 1); // Pad after existing elements. OP_REQUIRES(context, before >= 0 && after >= 0, - errors::InvalidArgument("Paddings must be non-negative: ", - before, ", ", after)); + errors::InvalidArgument( + "Paddings must be non-negative: ", before, ", ", after)); const int64 out_size = in0.dim_size(d) - (before + after); if (offset_ == 0) { // SYMMETRIC mode. diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc index 89d37d2f874..ef724f0a296 100644 --- a/tensorflow/core/kernels/mkl_aggregate_ops.cc +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -28,7 +28,7 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; using mkldnn::sum; @@ -37,7 +37,7 @@ using mkldnn::sum; namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklAddNOp : public OpKernel { @@ -285,7 +285,7 @@ class MklAddNOp : public OpKernel { } MklAddNOpContext; }; -#else // INTEL_MKL_DNN +#else // INTEL_MKL_ML template class MklAddNOp : public OpKernel { public: @@ -317,8 +317,11 @@ class MklAddNOp : public OpKernel { : src2_tensor.dims(); // if the shapes of two tensors are not same raise op error TensorShape src1_shape, src2_shape; - src1_shape = src1_tensor.shape(); - src2_shape = src2_tensor.shape(); + src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape() + : src1_tensor.shape(); + src2_shape = input2_in_mkl_format ? 
src2_mkl_shape.GetTfShape() + : src2_tensor.shape(); + if (!src1_shape.IsSameSize(src2_shape)) { ctx->SetStatus(errors::InvalidArgument( "Inputs to operation ", this->name(), " of type ", diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc index d751a70fc86..cff1bd18a74 100644 --- a/tensorflow/core/kernels/mkl_avgpooling_op.cc +++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc @@ -24,24 +24,23 @@ #include "tensorflow/core/kernels/mkl_pooling_ops_common.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::memory; -using mkldnn::error; -using mkldnn::pooling_forward; -using mkldnn::pooling_backward; -using mkldnn::padding_kind; -using mkldnn::engine; -using mkldnn::prop_kind; using mkldnn::algorithm; +using mkldnn::engine; +using mkldnn::error; +using mkldnn::memory; +using mkldnn::padding_kind; +using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::prop_kind; #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklAvgPoolingOp : public OpKernel { @@ -358,10 +357,11 @@ class MklAvgPoolingGradOp : public OpKernel { if (!outbackprop_in_mkl_format) { // For avgpooling, tensor_in_shape should have 1 dimension, and 4 // elements. - OP_REQUIRES(context, tensor_in_shape.dims() == 1 && - tensor_in_shape.NumElements() == 4, - errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + OP_REQUIRES( + context, + tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, + errors::InvalidArgument("original input shape must be " + "1-dimensional and 4 elements")); // For avgpooling, out_backprop should have 4 dimensions. OP_REQUIRES(context, out_backprop.dims() == 4, @@ -429,13 +429,15 @@ class MklAvgPoolingGradOp : public OpKernel { }; // MklAvgPoolingGradOp -#else // INTEL_MKL_DNN is defined + +#else + template class MklAvgPoolingOp : public MklPoolingForwardOpBase { public: explicit MklAvgPoolingOp(OpKernelConstruction* context) - : MklPoolingForwardOpBase(context) { + : MklPoolingForwardOpBase(context) { // Workspace is an MKLDNN construct that is only used in Max Pooling. // So set workspace_enabled_ to false. 
this->workspace_enabled_ = false; @@ -444,8 +446,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); - const Tensor& input_tensor = MklGetInput(context, - this->kInputTensorIndexInput); + const Tensor& input_tensor = + MklGetInput(context, this->kInputTensorIndexInput); MklDnnShape dnn_shape_input; GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input); this->SanityCheckInput(context, input_tensor, dnn_shape_input); @@ -457,9 +459,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; // Get the input tensor and initialize the pooling parameters - this->ConfigureInput(context, dnn_shape_input, - input_tensor, &pool_params, - &dnn_data_input); + this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params, + &dnn_data_input); OP_REQUIRES_OK(context, context->status()); // Declare output tensor @@ -467,59 +468,77 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { memory::dims output_dims_mkl_order; this->GetOutputDims(pool_params, &output_dims_mkl_order); + // If input is an empty tensor, allocate an empty output tensor and return + if (input_tensor.NumElements() == 0) { + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(false); + TensorShape output_tf_shape; + if (pool_params.data_format == TensorFormat::FORMAT_NCHW) { + output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order); + } else { + memory::dims output_dims_NHWC_order; + output_dims_NHWC_order = {pool_params.tensor_in_batch, + static_cast(pool_params.out_height), + static_cast(pool_params.out_width), + pool_params.out_depth}; + output_tf_shape = MklDnnDimsToTFShape(output_dims_NHWC_order); + } + const int kOutputIndex = 0; + AllocateOutputSetMklShape(context, kOutputIndex, &output_tensor, + output_tf_shape, output_mkl_shape); + CHECK_NOTNULL(output_tensor); + return; + } + // If input is in Mkl layout, then just get the memory format from it // directly, instead of using input data_format to AvgPool. if (dnn_shape_input.IsMklTensor()) { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - static_cast(dnn_data_input.GetUsrMemDesc() - .data.format)); + dnn_data_output.SetUsrMem( + output_dims_mkl_order, + static_cast( + dnn_data_input.GetUsrMemDesc().data.format)); } else { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - this->data_format_mkldnn_); + dnn_data_output.SetUsrMem(output_dims_mkl_order, + this->data_format_mkldnn_); } - // describe the memory layout + // describe the memory layout dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any); // 3. 
create a pooling primitive descriptor - auto pool_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_avg_exclude_padding, - dnn_data_input.GetUsrMemDesc(), - dnn_data_output.GetUsrMemDesc(), - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc, - cpu_engine); + auto pool_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_avg_exclude_padding, + dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(), + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_prim_desc = + pooling_forward::primitive_desc(pool_desc, cpu_engine); this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order, - this->data_format_mkldnn_, &output_tensor); + this->data_format_mkldnn_, &output_tensor); CHECK_NOTNULL(output_tensor); OP_REQUIRES_OK(context, context->status()); dnn_data_output.SetUsrMemDataHandle(output_tensor); - this->PrepareAndExecuteNet(pool_prim_desc, - &dnn_data_input, - &dnn_data_output); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + this->PrepareAndExecuteNet(pool_prim_desc, &dnn_data_input, + &dnn_data_output); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } // Compute -}; // MklAvgPoolingOp +}; // MklAvgPoolingOp //----------------------------------------------------------------------------- @@ -527,27 +546,23 @@ template class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { public: explicit MklAvgPoolingGradOp(OpKernelConstruction* context) - : MklPoolingBackwardOpBase(context) { - } + : MklPoolingBackwardOpBase(context) {} void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape; - const Tensor& tensor_in_shape = MklGetInput(context, - kInputTensorIndexInputShape); - const Tensor& input_gradient_tensor = MklGetInput(context, - kInputTensorIndexInputGradient); + const Tensor& tensor_in_shape = + MklGetInput(context, kInputTensorIndexInputShape); + const Tensor& input_gradient_tensor = + MklGetInput(context, kInputTensorIndexInputGradient); GetMklShape(context, kInputTensorIndexInputShape, - &original_input_mkl_shape); + &original_input_mkl_shape); GetMklShape(context, kInputTensorIndexInputGradient, - &input_gradient_mkl_shape); + &input_gradient_mkl_shape); - - SanityCheckInputs(context, tensor_in_shape, - input_gradient_tensor, - 
original_input_mkl_shape, - input_gradient_mkl_shape); + SanityCheckInputs(context, tensor_in_shape, input_gradient_tensor, + original_input_mkl_shape, input_gradient_mkl_shape); if (!context->status().ok()) return; // Used to allocate output_diff_src/diff_src @@ -562,90 +577,70 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { MklPoolParameters pool_params; memory::dims output_dims_mkl_order, original_input_dims_nchw; // Configure the original input memory descriptor - memory::desc original_input_md = ConfigureOriginalInput(context, - tensor_in_shape, - original_input_mkl_shape, - &original_input_dims_nchw, - &pool_params, - &original_input_shape); + memory::desc original_input_md = ConfigureOriginalInput( + context, tensor_in_shape, original_input_mkl_shape, + &original_input_dims_nchw, &pool_params, &original_input_shape); // configure the original output memory descriptor // by definition, the shape of the original output is the same // as the shape of the gradient diff_dst memory::desc original_output_md = this->ConfigureOriginalOutput( - pool_params, input_gradient_mkl_shape, output_dims_mkl_order); + pool_params, input_gradient_mkl_shape, output_dims_mkl_order); memory::desc target_diff_dst_md = this->ConfigureInputGradient( - input_gradient_mkl_shape, - input_gradient_tensor, - &input_gradient_diff_dst, - original_output_md); + input_gradient_mkl_shape, input_gradient_tensor, + &input_gradient_diff_dst, original_output_md); // The shape of the output diff src needs to be the same shape as the // original input. But we will set its format to be same as the format of // input gradient. We won't use format of original input since it will // always be in Tensorflow layout (given that AvgPoolGrad gets shape of // the input rather than actual input). 
- output_diff_src.SetUsrMem(original_input_dims_nchw, - static_cast( - target_diff_dst_md.data.format)); + output_diff_src.SetUsrMem( + original_input_dims_nchw, + static_cast(target_diff_dst_md.data.format)); // Create the forward pooling primitive descriptor so we can reference it // in the backward pooling primitive descriptor - auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_avg_exclude_padding, - original_input_md, - original_output_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_prim_desc - = pooling_forward::primitive_desc(pool_fwd_desc, - cpu_engine); + auto pool_fwd_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_avg_exclude_padding, + original_input_md, original_output_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_prim_desc = + pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine); auto pool_bkwd_desc = pooling_backward::desc( - algorithm::pooling_avg_exclude_padding, - output_diff_src.GetUsrMemDesc(), - target_diff_dst_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_bkwd_prim_desc - = pooling_backward::primitive_desc(pool_bkwd_desc, - cpu_engine, - pool_fwd_prim_desc); - this->AllocateOutputTensor(context, pool_bkwd_prim_desc, - original_input_dims_nchw, - this->data_format_mkldnn_, - &output_tensor_diff_src); + algorithm::pooling_avg_exclude_padding, + output_diff_src.GetUsrMemDesc(), target_diff_dst_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_bkwd_prim_desc = pooling_backward::primitive_desc( + pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc); + this->AllocateOutputTensor( + context, pool_bkwd_prim_desc, original_input_dims_nchw, + this->data_format_mkldnn_, &output_tensor_diff_src); output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src); - this->PrepareAndExecuteNet(pool_bkwd_prim_desc, - &input_gradient_diff_dst, - &output_diff_src, - memory::primitive_desc( - target_diff_dst_md, - cpu_engine)); - } catch (mkldnn::error &e) { + this->PrepareAndExecuteNet( + pool_bkwd_prim_desc, &input_gradient_diff_dst, &output_diff_src, + memory::primitive_desc(target_diff_dst_md, cpu_engine)); + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + 
string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute @@ -655,12 +650,11 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { const int kInputTensorIndexInputShape = 0; const int kInputTensorIndexInputGradient = 1; - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input_shape, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_mkl_order, - MklPoolParameters* pool_params, - TensorShape* input_tensor_shape) { + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input_shape, + const MklDnnShape& original_input_mkl_shape, + memory::dims* original_input_dims_mkl_order, + MklPoolParameters* pool_params, TensorShape* input_tensor_shape) { CHECK_NOTNULL(original_input_dims_mkl_order); CHECK_NOTNULL(pool_params); CHECK_NOTNULL(input_tensor_shape); @@ -672,47 +666,47 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { } return MklPoolingBackwardOpBase::ConfigureOriginalInput( - context, - tensor_original_input_shape, - original_input_mkl_shape, - original_input_dims_mkl_order, - pool_params, - *input_tensor_shape); -} + context, tensor_original_input_shape, original_input_mkl_shape, + original_input_dims_mkl_order, pool_params, *input_tensor_shape); + } void SanityCheckInputs(OpKernelContext* context, - const Tensor& tensor_in_shape, - const Tensor& input_gradient_tensor, - const MklDnnShape& original_input_mkl_shape, - const MklDnnShape& input_gradient_mkl_shape) { + const Tensor& tensor_in_shape, + const Tensor& input_gradient_tensor, + const MklDnnShape& original_input_mkl_shape, + const MklDnnShape& input_gradient_mkl_shape) { if (!original_input_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, tensor_in_shape.dims() == 1 && - tensor_in_shape.NumElements() == 4, + OP_REQUIRES( + context, + tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + "1-dimensional and 4 elements")); } else { - OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 && - original_input_mkl_shape.DimSize(0) == 4, - errors::InvalidArgument("original input shape must be " - "1-dimensional and 4 elements")); + OP_REQUIRES(context, + original_input_mkl_shape.GetDimension() == 1 && + original_input_mkl_shape.DimSize(0) == 4, + errors::InvalidArgument("original input shape must be " + "1-dimensional and 4 elements")); } if (!input_gradient_mkl_shape.IsMklTensor()) { // For avgpooling, input_gradient_diff_dst should have 4 dimensions. 
OP_REQUIRES(context, input_gradient_tensor.dims() == 4, - errors::InvalidArgument("Gradient shape must be " - "4-dimensional")); + errors::InvalidArgument("Gradient shape must be " + "4-dimensional")); } else { OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Gradient shape must be " - "4-dimensional")); + errors::InvalidArgument("Gradient shape must be " + "4-dimensional")); } } }; // MklAvgPoolingGradOp -#endif // INTEL_MKL_DNN + +#endif // INTEL_MKL_ML + REGISTER_KERNEL_BUILDER(Name("_MklAvgPool") .Device(DEVICE_CPU) @@ -728,4 +722,3 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad") } // namespace tensorflow #endif // INTEL_MKL - diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc index 9fee94f9465..d9713075be6 100644 --- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc @@ -40,7 +40,6 @@ limitations under the License. #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #define MKL_Complex8 tensorflow::complex64 #define MKL_Complex16 tensorflow::complex128 diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index d109bb6bcfe..f1f267e849a 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -30,11 +30,11 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::concat; +using mkldnn::stream; #endif namespace tensorflow { @@ -45,7 +45,6 @@ typedef std::vector TensorShapeList; enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM }; - // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable // reference inputs. // -------------------------------------------------------------------------- @@ -63,7 +62,7 @@ class EigenConcatBaseOp : public OpKernel { // we need to have empty Compute because Compute is pure virtual function. void Compute(OpKernelContext* c) {} -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Compute(OpKernelContext* c, const std::vector& values) { const Tensor* concat_dim_tensor; @@ -152,8 +151,8 @@ class EigenConcatBaseOp : public OpKernel { #else // MKL_DNN -void Compute(OpKernelContext* c, const std::vector& values, - const TensorShapeList& input_shapes) { + void Compute(OpKernelContext* c, const std::vector& values, + const TensorShapeList& input_shapes) { const Tensor* concat_dim_tensor; const char* axis_attribute_name = AxisArgName == NAME_IS_AXIS @@ -197,7 +196,8 @@ void Compute(OpKernelContext* c, const std::vector& values, const auto in = values[i]; const bool in_is_scalar = IsLegacyScalar(input_shapes[i]); OP_REQUIRES( - c, (input_shapes[i].dims() == input_dims) || + c, + (input_shapes[i].dims() == input_dims) || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", @@ -208,8 +208,8 @@ void Compute(OpKernelContext* c, const std::vector& values, inputs_flat.emplace_back(new typename TTypes::ConstMatrix( in.shaped({inputs_flat_dim0, inputs_flat_dim1}))); } - output_concat_dim += input_shapes[i].dims() > 0 ? - input_shapes[i].dim_size(axis) : 1; + output_concat_dim += + input_shapes[i].dims() > 0 ? 
input_shapes[i].dim_size(axis) : 1; } TensorShape output_shape(input_shape); @@ -230,7 +230,7 @@ void Compute(OpKernelContext* c, const std::vector& values, #endif }; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // -------------------------------------------------------------------------- // Mkl Concat Op @@ -418,7 +418,6 @@ class MklConcatOp : public OpKernel { OP_REQUIRES_OK(context, context->status()); } - private: typedef struct { TensorFormat data_format; @@ -590,39 +589,45 @@ class MklConcatOp : public OpKernel { GetMklShapeList(context, "values", &input_shapes); const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM) - ? MklGetInput(context, 0) : MklGetInput(context, N); + ? MklGetInput(context, 0) + : MklGetInput(context, N); // Sanity checks - OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()), - errors::InvalidArgument( - "Concat dim tensor should be a scalar integer, but got shape ", - concat_dim_tensor.shape().DebugString())); - int32 concat_dim = internal::SubtleMustCopy( - concat_dim_tensor.scalar()()); + OP_REQUIRES( + context, IsLegacyScalar(concat_dim_tensor.shape()), + errors::InvalidArgument( + "Concat dim tensor should be a scalar integer, but got shape ", + concat_dim_tensor.shape().DebugString())); + int32 concat_dim = + internal::SubtleMustCopy(concat_dim_tensor.scalar()()); // check that ranks of all tensors match // and that their shapes match except for concat_dim. int i = 0; bool invoke_eigen = false; bool are_all_mkl_inputs = true, are_all_tf_inputs = true; - const TensorShape expected_shape = input_shapes[0].IsMklTensor() ? - input_shapes[0].GetTfShape() : - input_tensors[0].shape(); + const TensorShape expected_shape = input_shapes[0].IsMklTensor() + ? input_shapes[0].GetTfShape() + : input_tensors[0].shape(); size_t expected_dims = expected_shape.dims(); if (concat_dim < 0) concat_dim = expected_dims + concat_dim; for (auto& s : input_shapes) { - if (s == expected_shape) {++i; continue;} + if (s == expected_shape) { + ++i; + continue; + } - TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() : - input_tensors[i].shape(); + TensorShape s_shape = + s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); size_t s_dims = s_shape.dims(); - OP_REQUIRES(context, s_dims == expected_dims, - errors::InvalidArgument( - "_MklConcatOp : Ranks of all input tensors should match:" - " input dimensions = ", - s_dims, " vs. expected rank = ", expected_dims)); + OP_REQUIRES( + context, s_dims == expected_dims, + errors::InvalidArgument( + "_MklConcatOp : Ranks of all input tensors should match:" + " input dimensions = ", + s_dims, " vs. expected rank = ", expected_dims)); for (int d = 0; d < expected_dims; ++d) { if (d == concat_dim) continue; @@ -630,10 +635,11 @@ class MklConcatOp : public OpKernel { size_t expected_size = expected_shape.dim_size(d); size_t s_size = s_shape.dim_size(d); OP_REQUIRES( - context, expected_size == s_size, - errors::InvalidArgument("_MklConcatOp : Dimensions of inputs " - "should match: shape[0][", d, "]= ", expected_size, - " vs. shape[", i, "][", d, "] = ", s_size)); + context, expected_size == s_size, + errors::InvalidArgument("_MklConcatOp : Dimensions of inputs " + "should match: shape[0][", + d, "]= ", expected_size, " vs. shape[", i, + "][", d, "] = ", s_size)); } if (s.IsMklTensor()) @@ -657,8 +663,8 @@ class MklConcatOp : public OpKernel { TensorShapeList tf_input_shapes; i = 0; for (auto& s : input_shapes) { - TensorShape s_shape = s.IsMklTensor() ? 
s.GetTfShape() : - input_tensors[i].shape(); + TensorShape s_shape = + s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape(); tf_input_shapes.push_back(s_shape); ++i; } @@ -678,21 +684,22 @@ class MklConcatOp : public OpKernel { std::vector srcs_pd; std::vector> srcs(N, MklDnnData(&cpu_engine)); int64 dst_concat_dim_size = 0; - for (int k =0; k < N; k++) { + for (int k = 0; k < N; k++) { bool is_mkl_tensor = input_shapes[k].IsMklTensor(); memory::dims src_dims; // Same comment as dst_dims for src_dims. - src_dims = (is_mkl_tensor) ? - TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) : - TFShapeToMklDnnDims(input_tensors[k].shape()); + src_dims = (is_mkl_tensor) + ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) + : TFShapeToMklDnnDims(input_tensors[k].shape()); dst_concat_dim_size += src_dims[concat_dim]; - auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() : - // It does not matter what data format we use here (NHWC or NCHW). - // We just need to ensure that output of Concat uses same data format - // as input. - memory::desc(src_dims, MklDnnType(), memory::format::nchw); + auto src_md = + is_mkl_tensor ? input_shapes[k].GetMklLayout() : + // It does not matter what data format we use here + // (NHWC or NCHW). We just need to ensure that output + // of Concat uses same data format as input. + memory::desc(src_dims, MklDnnType(), memory::format::nchw); srcs[k].SetUsrMem(src_md, &input_tensors[k]); auto src_mpd = srcs[k].GetUsrMemPrimDesc(); @@ -707,14 +714,15 @@ class MklConcatOp : public OpKernel { // Since we are passing a specific format for destination, // we need to have dst_dims in MklDnn order (NCHW). auto orig_tf_format = input_shapes[0].GetTfDataFormat(); - dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims, - MklDnnDataFormatToTFDataFormat(orig_tf_format)); + dst_dims_in_nchw = MklDnnDimsInNCHW( + dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); // We will set the output in the same format as input to avoid layout // conversions. // Currently we are setting dst format same as input format. // See if we can make this choice in a better way. - dst_md = memory::desc(dst_dims_in_nchw, MklDnnType(), - (memory::format) input_shapes[0].GetMklLayout().data.format); + dst_md = memory::desc( + dst_dims_in_nchw, MklDnnType(), + (memory::format)input_shapes[0].GetMklLayout().data.format); } else { // Again, format does not matter here. We just need to make it same as // input format. @@ -722,7 +730,7 @@ class MklConcatOp : public OpKernel { } std::vector inputs; - for (int k=0; k < input_tensors.size(); k++) + for (int k = 0; k < input_tensors.size(); k++) inputs.push_back(srcs[k].GetOpMem()); // If all inputs are in MKL format, then meaning of concat_dim needs to @@ -732,8 +740,7 @@ class MklConcatOp : public OpKernel { // But ifinput tensors are in NHWC order, then semantics need to change. // E.g., if we are concatinating over Channel (dimension 3 for NHWC), // then since MklDnn order is NCHW, concat_dim needs to be 1. 
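// Illustrative aside (not part of this patch): what the TfDimIdx() remap on
// the next line amounts to when the Tensorflow shapes are NHWC but the
// MKL-DNN dims are kept in NCHW order. The helper below is hypothetical and
// only spells out the index mapping described in the comment above.
inline int NhwcAxisToNchwAxis(int nhwc_axis) {
  static const int kMap[4] = {0 /*N*/, 2 /*H*/, 3 /*W*/, 1 /*C*/};
  return kMap[nhwc_axis];  // e.g. channel axis 3 (NHWC) maps to axis 1 (NCHW)
}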
- if (are_all_mkl_inputs) - concat_dim = input_shapes[0].TfDimIdx(concat_dim); + if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim); auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd); @@ -752,24 +759,25 @@ class MklConcatOp : public OpKernel { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = MklDnnDimsToTFShape(dst_dims); } - AllocateOutputSetMklShape(context, 0, &dst_tensor, - tf_shape_dst, dnn_shape_dst); + AllocateOutputSetMklShape(context, 0, &dst_tensor, tf_shape_dst, + dnn_shape_dst); CHECK_NOTNULL(dst_tensor); - dst_md = dnn_shape_dst.IsMklTensor() ? - dnn_shape_dst.GetMklLayout() : dst_md; + dst_md = + dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md; dst.SetUsrMem(dst_md, dst_tensor); auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); std::vector net; net.push_back(concat_op); stream(stream::kind::eager).submit(net).wait(); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); - OP_REQUIRES_OK(context, errors::Aborted( - "Operation received an exception:", error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -790,11 +798,9 @@ class MklConcatOp : public OpKernel { dnn_shape_output.SetDimensions(4); Tensor* output_tensor = nullptr; TensorShape tf_shape_output; - tf_shape_output.AddDim( - dnn_shape_output.GetSerializeBufferSize()); - context->allocate_output( - GetTensorMetaDataIndex(0, context->num_outputs()), - tf_shape_output, &output_tensor); + tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize()); + context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()), + tf_shape_output, &output_tensor); dnn_shape_output.SerializeMklDnnShape( output_tensor->flat().data(), output_tensor->flat().size() * sizeof(uint8)); diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc index 0f1a218fe62..25c25737412 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc @@ -38,9 +38,9 @@ limitations under the License. #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index 54d4916d494..1401bc65a45 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -38,24 +38,24 @@ limitations under the License. 
#include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::convolution_backward_weights; using mkldnn::memory; +using mkldnn::prop_kind; +using mkldnn::stream; #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklConv2DCustomBackpropFilterOp : public OpKernel { @@ -360,8 +360,8 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input; const Tensor& out_backprop = MklGetInput(context, 2); - void* mkl_buf_out_backprop = const_cast(static_cast( - out_backprop.flat().data())); + void* mkl_buf_out_backprop = const_cast( + static_cast(out_backprop.flat().data())); CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop, prim_conv_bwdfilter, @@ -371,10 +371,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop); if (mkl_convert_out_backprop) { CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop, - lt_out_backprop, mkl_lt_internal_out_backprop), + lt_out_backprop, + mkl_lt_internal_out_backprop), E_SUCCESS); AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor, - lt_out_backprop, &mkl_buf_convert_out_backprop); + lt_out_backprop, &mkl_buf_convert_out_backprop); CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop, mkl_buf_out_backprop, mkl_buf_convert_out_backprop), @@ -428,18 +429,18 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp); + MklConv2DCustomBackpropFilterOp); TF_CALL_float(REGISTER_MKL_FILTER_KERNELS); #undef REGISTER_MKL_FILTER_KERNELS #else template -class MklConv2DCustomBackpropFilterOp : - public MklConv2DBackpropCommonOp { +class MklConv2DCustomBackpropFilterOp + : public MklConv2DBackpropCommonOp { public: explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context) - : MklConv2DBackpropCommonOp(context) { } + : MklConv2DBackpropCommonOp(context) {} ~MklConv2DCustomBackpropFilterOp() {} private: @@ -447,7 +448,7 @@ class MklConv2DCustomBackpropFilterOp : const MklDnnShape& filter_mkl_shape, const MklDnnShape& obp_mkl_shape) { CHECK(!filter_mkl_shape.IsMklTensor()) - << "Conv2DBackpropFilter: filter should not be in MKL Layout"; + << "Conv2DBackpropFilter: filter should not be in MKL Layout"; } size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ } @@ -462,8 +463,10 @@ class MklConv2DCustomBackpropFilterOp : const Tensor& filter_tensor) { TensorShape filter_tf_shape; CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true); - CHECK_EQ(TensorShapeUtils::MakeShape( - filter_tensor.vec(), &filter_tf_shape).ok(), true); + CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec(), + &filter_tf_shape) + .ok(), + true); return filter_tf_shape; } @@ -485,16 +488,13 @@ class MklConv2DCustomBackpropFilterOp : return memory::format::hwio; } - void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, + void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine, const convolution_forward::primitive_desc& conv_fwd_pd, MklDnnData* input, MklDnnData* 
filter, MklDnnData* outbackprop, MklDnnData* output, - Tensor** output_tensor, - const memory::dims& strides, + Tensor** output_tensor, const memory::dims& strides, const memory::dims& padding_l, - const memory::dims& padding_r, - padding_kind padding, + const memory::dims& padding_r, padding_kind padding, const memory::dims& bwd_output_dims, memory::format bwd_output_format) { CHECK_NOTNULL(context); @@ -508,34 +508,35 @@ class MklConv2DCustomBackpropFilterOp : int depth = 0; if (biasEnabled) { // Data structure for bias_grad - bias_grad = new MklDnnData (&cpu_engine); + bias_grad = new MklDnnData(&cpu_engine); TensorShape obp_tf_shape = GetTfShape(context, 2); - depth = (MklConv2DBackpropCommonOp::GetTFDataFormat() - == FORMAT_NCHW) ? - obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3); + depth = (MklConv2DBackpropCommonOp::GetTFDataFormat() == + FORMAT_NCHW) + ? obp_tf_shape.dim_size(1) + : obp_tf_shape.dim_size(3); memory::dims bias_grad_dims = {depth}; bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x); } // Create convolution backward weights primitive. - auto bwd_desc = (biasEnabled && (bias_grad != nullptr))? - convolution_backward_weights::desc(convolution_direct, - input->GetOpMemDesc(), output->GetOpMemDesc(), - bias_grad->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding) : - convolution_backward_weights::desc(convolution_direct, - input->GetOpMemDesc(), output->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding); + auto bwd_desc = + (biasEnabled && (bias_grad != nullptr)) + ? convolution_backward_weights::desc( + convolution_direct, input->GetOpMemDesc(), + output->GetOpMemDesc(), bias_grad->GetOpMemDesc(), + outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, + padding) + : convolution_backward_weights::desc( + convolution_direct, input->GetOpMemDesc(), + output->GetOpMemDesc(), outbackprop->GetOpMemDesc(), strides, + padding_l, padding_r, padding); - auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc, - cpu_engine, - conv_fwd_pd); + auto bwd_pd = convolution_backward_weights::primitive_desc( + bwd_desc, cpu_engine, conv_fwd_pd); // Allocate output tensor. - AllocateOutputTensor(context, bwd_pd, bwd_output_dims, - bwd_output_format, output_tensor); + AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format, + output_tensor); CHECK_NOTNULL(*output_tensor); // Set buffer handle using allocated output tensor. @@ -548,8 +549,8 @@ class MklConv2DCustomBackpropFilterOp : AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor); memory::dims bias_grad_dims = {depth}; // Since Bias is 1D, we use format::x from MKLDNN to represent it. - auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType(), - memory::format::x); + auto bias_grad_md = + memory::desc({bias_grad_dims}, MklDnnType(), memory::format::x); bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor); bias_grad->SetUsrMemDataHandle(bias_grad_tensor); } @@ -562,28 +563,29 @@ class MklConv2DCustomBackpropFilterOp : } // Allocate output tensor. 
- void AllocateOutputTensor(OpKernelContext* context, - const convolution_backward_weights::primitive_desc& conv_pd, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); + void AllocateOutputTensor( + OpKernelContext* context, + const convolution_backward_weights::primitive_desc& conv_pd, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); - // For BackpropFilter, we convert the output tensor back in Tensorflow - // layout. Because typically, BackpropFilter is the last operator in the - // graph that emit filter gradient that is provided to ApplyGradient - // method to update the filter. But it may be possible to eliminate this - // by forwarding filter in MKL layout if we support ApplyGradient method - // for MKL layout propagation. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(false); - // output_dims_mkl_order is in OIHW format. - // Allocate shape of TF tensor in HWIO format. - TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H], - output_dims_mkl_order[MklDnnDims::Dim_W], - output_dims_mkl_order[MklDnnDims::Dim_I], - output_dims_mkl_order[MklDnnDims::Dim_O]}); - AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, - output_mkl_shape); + // For BackpropFilter, we convert the output tensor back in Tensorflow + // layout. Because typically, BackpropFilter is the last operator in the + // graph that emit filter gradient that is provided to ApplyGradient + // method to update the filter. But it may be possible to eliminate this + // by forwarding filter in MKL layout if we support ApplyGradient method + // for MKL layout propagation. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(false); + // output_dims_mkl_order is in OIHW format. + // Allocate shape of TF tensor in HWIO format. + TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H], + output_dims_mkl_order[MklDnnDims::Dim_W], + output_dims_mkl_order[MklDnnDims::Dim_I], + output_dims_mkl_order[MklDnnDims::Dim_O]}); + AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, + output_mkl_shape); } // Allocate tensor for bias grad @@ -600,9 +602,9 @@ class MklConv2DCustomBackpropFilterOp : // Prepare and execute net - checks for input and output reorders. void PrepareAndExecutePrimitive( - const convolution_backward_weights::primitive_desc& conv_pd, - MklDnnData* input, MklDnnData* obp, - MklDnnData* output, MklDnnData* bias_grad = nullptr) { + const convolution_backward_weights::primitive_desc& conv_pd, + MklDnnData* input, MklDnnData* obp, MklDnnData* output, + MklDnnData* bias_grad = nullptr) { // Create reorders between user layout and MKL layout if it is needed and // add it to the net before convolution. std::vector net; @@ -612,15 +614,15 @@ class MklConv2DCustomBackpropFilterOp : // For BackpropFilter, we convert the output tensor back in Tensorflow // layout. 
bool output_reorder_required = output->PrepareReorderToUserMemIfReq( - conv_pd.diff_weights_primitive_desc()); + conv_pd.diff_weights_primitive_desc()); if (biasEnabled && (bias_grad != nullptr)) { - net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(), - obp->GetOpMem(), output->GetOpMem(), - bias_grad->GetOpMem())); + net.push_back(convolution_backward_weights( + conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem(), + bias_grad->GetOpMem())); } else { - net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(), - obp->GetOpMem(), output->GetOpMem())); + net.push_back(convolution_backward_weights( + conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem())); } if (output_reorder_required) { @@ -631,27 +633,29 @@ class MklConv2DCustomBackpropFilterOp : } }; -#define REGISTER_MKL_FILTER_KERNELS(T) \ - REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp);\ - REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklConv2DCustomBackpropFilterOp); \ - REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklOpLabel), \ - MklDummyOp); +#define REGISTER_MKL_FILTER_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropFilterWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklConv2DCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklOpLabel), \ + MklDummyOp); TF_CALL_float(REGISTER_MKL_FILTER_KERNELS); #undef REGISTER_MKL_FILTER_KERNELS -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index ef6db58d31f..eeed0095310 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -23,6 +23,8 @@ limitations under the License. #define EIGEN_USE_THREADS #include #include +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -41,22 +43,20 @@ limitations under the License. 
#include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::convolution_backward_data; +using mkldnn::prop_kind; +using mkldnn::stream; #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklConv2DCustomBackpropInputOp : public OpKernel { @@ -359,16 +359,15 @@ class MklConv2DCustomBackpropInputOp : public OpKernel { #else template -class MklConv2DCustomBackpropInputOp : - public MklConv2DBackpropCommonOp { +class MklConv2DCustomBackpropInputOp + : public MklConv2DBackpropCommonOp { public: explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context) - : MklConv2DBackpropCommonOp(context) { } + : MklConv2DBackpropCommonOp(context) {} ~MklConv2DCustomBackpropInputOp() {} private: - const int kInputIndex_Filter = 1, - kInputIndex_InputSizes = 0, + const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0, kInputIndex_OutBackProp = 2; void ValidateMklShapes(const MklDnnShape& input_mkl_shape, const MklDnnShape& filter_mkl_shape, @@ -377,7 +376,7 @@ class MklConv2DCustomBackpropInputOp : // of the Tensor and never an actual tensor. So it will never be in MKL // layout. CHECK(!input_mkl_shape.IsMklTensor()) - << "Conv2DBackpropInput: input should not be in MKL Layout"; + << "Conv2DBackpropInput: input should not be in MKL Layout"; } size_t GetInputTensorIndexWithSizes() { return kInputIndex_InputSizes; } @@ -386,8 +385,10 @@ class MklConv2DCustomBackpropInputOp : const Tensor& input_tensor) { TensorShape input_tf_shape; CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true); - CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec(), - &input_tf_shape).ok(), true); + CHECK_EQ( + TensorShapeUtils::MakeShape(input_tensor.vec(), &input_tf_shape) + .ok(), + true); return input_tf_shape; } @@ -414,16 +415,13 @@ class MklConv2DCustomBackpropInputOp : return data_format; } - void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, + void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine, const convolution_forward::primitive_desc& conv_fwd_pd, MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, MklDnnData* output, - Tensor** output_tensor, - const memory::dims& strides, + Tensor** output_tensor, const memory::dims& strides, const memory::dims& padding_l, - const memory::dims& padding_r, - padding_kind padding, + const memory::dims& padding_r, padding_kind padding, const memory::dims& bwd_output_dims, memory::format bwd_output_format) { CHECK_NOTNULL(context); @@ -434,19 +432,16 @@ class MklConv2DCustomBackpropInputOp : CHECK_NOTNULL(output_tensor); // Create convolution backward data primitive. 
- auto bwd_desc = convolution_backward_data::desc(convolution_direct, - output->GetOpMemDesc(), filter->GetOpMemDesc(), - outbackprop->GetOpMemDesc(), strides, padding_l, - padding_r, padding); - - auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc, - cpu_engine, - conv_fwd_pd); + auto bwd_desc = convolution_backward_data::desc( + convolution_direct, output->GetOpMemDesc(), filter->GetOpMemDesc(), + outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, padding); + auto bwd_pd = convolution_backward_data::primitive_desc( + bwd_desc, cpu_engine, conv_fwd_pd); // Allocate output tensor in TensorFlow and MKL layout. - AllocateOutputTensor(context, bwd_pd, bwd_output_dims, - bwd_output_format, output_tensor); + AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format, + output_tensor); CHECK_NOTNULL(*output_tensor); // Set buffer handle using allocated output tensor. output->SetUsrMemDataHandle(*output_tensor); @@ -455,50 +450,50 @@ class MklConv2DCustomBackpropInputOp : } // Allocate output tensor. - void AllocateOutputTensor(OpKernelContext* context, - const convolution_backward_data::primitive_desc& conv_pd, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); + void AllocateOutputTensor( + OpKernelContext* context, + const convolution_backward_data::primitive_desc& conv_pd, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); - // Output primitive descriptor for backward data is diff_src. - auto dst_pd = conv_pd.diff_src_primitive_desc(); + // Output primitive descriptor for backward data is diff_src. + auto dst_pd = conv_pd.diff_src_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); - output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); + // Allocate shape of Mkl tensor. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SetMklLayout(&dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); - // Allocate shape of TF tensor. - TensorShape output_tf_shape; - output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T)); + // Allocate shape of TF tensor. + TensorShape output_tf_shape; + output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T)); - AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, - output_mkl_shape); + AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape, + output_mkl_shape); } // Prepare and execute net - checks for input and output reorders. void PrepareAndExecutePrimitive( - const convolution_backward_data::primitive_desc& conv_pd, - MklDnnData* filter, MklDnnData* obp, - MklDnnData* output) { + const convolution_backward_data::primitive_desc& conv_pd, + MklDnnData* filter, MklDnnData* obp, MklDnnData* output) { // Create reorders between user layout and MKL layout if it is needed and // add it to the net before convolution. 
std::vector net; filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net); obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net); - net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(), - filter->GetOpMem(), output->GetOpMem())); + net.push_back(convolution_backward_data( + conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem())); stream(stream::kind::eager).submit(net).wait(); } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #define REGISTER_MKL_CPU_KERNELS(T) \ REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput") \ diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 0e77b45993c..cbda12689f8 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -18,8 +18,8 @@ limitations under the License. #include #include -#include #include +#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -42,14 +42,16 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN + +#ifndef INTEL_MKL_ML + #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::prop_kind; +using mkldnn::stream; -using mkldnn::convolution_forward; using mkldnn::convolution_direct; +using mkldnn::convolution_forward; #else #include "mkl_dnn.h" #include "mkl_dnn_types.h" @@ -59,8 +61,8 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. -#ifndef INTEL_MKL_DNN +// MKL-DNN is now default. MKL-ML must be specified explicitly. +#ifdef INTEL_MKL_ML template class MklConv2DOp : public OpKernel { @@ -116,18 +118,19 @@ class MklConv2DOp : public OpKernel { filter.shape().DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES( + context, + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } const int64 input_depth = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C') : GetTensorDim(input, data_format_, 'C'); - OP_REQUIRES( - context, input_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - input_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, input_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", input_depth, + " vs ", filter.dim_size(2))); // The last dimension for filter is out_depth. const int out_depth = static_cast(filter.dim_size(3)); @@ -136,9 +139,10 @@ class MklConv2DOp : public OpKernel { const int64 input_rows_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H') : GetTensorDim(input, data_format_, 'H'); - OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input rows too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); const int input_rows = static_cast(input_rows_raw); const int filter_rows = static_cast(filter.dim_size(0)); @@ -147,9 +151,10 @@ class MklConv2DOp : public OpKernel { const int64 input_cols_raw = input_in_mkl_format ? 
GetMklTensorDim(mkl_context.input_shape, 'W') : GetTensorDim(input, data_format_, 'W'); - OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, - std::numeric_limits::max()), - errors::InvalidArgument("Input cols too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); const int input_cols = static_cast(input_cols_raw); const int filter_cols = static_cast(filter.dim_size(1)); @@ -157,9 +162,10 @@ class MklConv2DOp : public OpKernel { const int64 input_batch_raw = input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N') : GetTensorDim(input, data_format_, 'N'); - OP_REQUIRES(context, FastBoundsCheck(input_batch_raw, - std::numeric_limits::max()), - errors::InvalidArgument("batch is too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input_batch_raw, std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); const int batch = static_cast(input_batch_raw); // For now we take the stride from the second and third dimensions only (we @@ -313,8 +319,7 @@ class MklConv2DOp : public OpKernel { // Temp tensor used to allocate tmp buffers Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor, mkl_tmp_bias_buf_tensor; - mkl_context.MklPrepareConvolutionInputs(context, - &mkl_tmp_input_buf_tensor, + mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor, &mkl_tmp_filter_buf_tensor, &mkl_tmp_bias_buf_tensor); @@ -398,8 +403,9 @@ class MklConv2DOp : public OpKernel { mkl_convert_input = !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input); if (mkl_convert_input) { - CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, - lt_input, mkl_lt_internal_input), E_SUCCESS); + CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input, + mkl_lt_internal_input), + E_SUCCESS); AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input, &mkl_buf_convert_input); CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input, @@ -517,8 +523,8 @@ class MklConv2DOp : public OpKernel { GetMklShape(context, kInputIndex_Src, &src_mkl_shape); GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape); OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false, - errors::InvalidArgument("Filter should not be in " - "Mkl Layout")); + errors::InvalidArgument("Filter should not be in " + "Mkl Layout")); MklDnnData src(&cpu_engine); MklDnnData filter(&cpu_engine); @@ -531,11 +537,10 @@ class MklConv2DOp : public OpKernel { MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); auto src_tf_shape = GetTfShape(context, kInputIndex_Src); auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter); - conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape, - &src_dims, &filter_dims, &strides, - &output_dims_tf_order, - &output_dims_mkl_order, &padding_l, - &padding_r); + conv_utl.GetConvFwdSizesInMklOrder( + src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides, + &output_dims_tf_order, &output_dims_mkl_order, &padding_l, + &padding_r); if (!context->status().ok()) return; // Check for corner case - if there is nothing to compute, return. @@ -543,21 +548,20 @@ class MklConv2DOp : public OpKernel { // Corner cases: output with 0 elements and 0 batch size. 
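// Illustrative aside (not part of this patch): the sizes computed by
// GetConvFwdSizesInMklOrder come down to per-dimension arithmetic of the
// general shape below; a zero-element output (handled just after this)
// arises when this arithmetic yields 0 for a spatial dimension or the batch
// size is 0. These helpers are a sketch of the usual VALID/SAME formulas,
// not code from this file.
inline int64 ConvOutSizeValid(int64 in, int64 window, int64 stride) {
  // valid for window <= in
  return (in - window + stride) / stride;  // == ceil((in - window + 1) / stride)
}
inline int64 ConvOutSizeSame(int64 in, int64 stride) {
  return (in + stride - 1) / stride;  // == ceil(in / stride)
}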
Tensor* output_tensor = nullptr; - if (output_tf_shape.num_elements() == 0 || - output_dims_tf_order[0] == 0) { + if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) { // TODO(jbobba): Verify correctness here // Need semantics for Null MKL tensor MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(false); AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor, - src_tf_shape, output_mkl_shape); + src_tf_shape, output_mkl_shape); // MklConv2D also outputs converted filter as 2nd output of Conv2D. filter_mkl_shape.SetMklTensor(false); Tensor* output_filter_tensor = nullptr; AllocateOutputSetMklShape(context, kOutputIndex_Filter, - &output_filter_tensor, - filter_tf_shape, filter_mkl_shape); + &output_filter_tensor, filter_tf_shape, + filter_mkl_shape); return; } @@ -570,14 +574,15 @@ class MklConv2DOp : public OpKernel { // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's // layout (NHWC or NCHW depending on data format). auto src_md = src_mkl_shape.IsMklTensor() - ? src_mkl_shape.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), tf_fmt); + ? src_mkl_shape.GetMklLayout() + : memory::desc(src_dims, MklDnnType(), tf_fmt); src.SetUsrMem(src_md, &src_tensor); // Although filter shape (filter_dims) required is in MKL-DNN order, // the layout is Tensorflow's layout (HWIO). auto filter_md = filter_mkl_shape.IsMklTensor() // Should NEVER be true - ? filter_mkl_shape.GetMklLayout() - : memory::desc(filter_dims, MklDnnType(), memory::format::hwio); + ? filter_mkl_shape.GetMklLayout() + : memory::desc(filter_dims, MklDnnType(), + memory::format::hwio); filter.SetUsrMem(filter_md, &filter_tensor); // Set output shape (output_dims) required in MKL-DNN order. @@ -601,34 +606,13 @@ class MklConv2DOp : public OpKernel { bias.SetOpMemDesc(bias_size, memory::format::any); // Create convolution primitive with Bias. - auto conv_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(), - bias.GetOpMemDesc(), output.GetOpMemDesc(), strides, - padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); + auto conv_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, src.GetOpMemDesc(), + filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(), + strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, - cpu_engine); - AllocateOutputTensor(context, conv_prim_desc, - output_dims_mkl_order, tf_fmt, &output_tensor); - // Set data handle for output. - output.SetUsrMemDataHandle(output_tensor); - - Tensor* filter_out_tensor = nullptr; - AllocateFilterOutputTensor(context, conv_prim_desc, - TFShapeToMklDnnDims(filter_tf_shape), - &filter_out_tensor); - - PrepareAndExecuteNet(conv_prim_desc, &src, &filter, - &bias, &output, filter_out_tensor); - } else { - // Create convolution primitive without Bias. - auto conv_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(), - output.GetOpMemDesc(), strides, padding_l, padding_r, - TFPaddingToMklDnnPadding(padding_)); - - auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc, - cpu_engine); + auto conv_prim_desc = + convolution_forward::primitive_desc(conv_desc, cpu_engine); AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order, tf_fmt, &output_tensor); // Set data handle for output. 
@@ -636,18 +620,39 @@ class MklConv2DOp : public OpKernel { Tensor* filter_out_tensor = nullptr; AllocateFilterOutputTensor(context, conv_prim_desc, - TFShapeToMklDnnDims(filter_tf_shape), - &filter_out_tensor); - PrepareAndExecuteNet(conv_prim_desc, &src, &filter, - nullptr, &output, filter_out_tensor); + TFShapeToMklDnnDims(filter_tf_shape), + &filter_out_tensor); + + PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output, + filter_out_tensor); + } else { + // Create convolution primitive without Bias. + auto conv_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, src.GetOpMemDesc(), + filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l, + padding_r, TFPaddingToMklDnnPadding(padding_)); + + auto conv_prim_desc = + convolution_forward::primitive_desc(conv_desc, cpu_engine); + AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order, + tf_fmt, &output_tensor); + // Set data handle for output. + output.SetUsrMemDataHandle(output_tensor); + + Tensor* filter_out_tensor = nullptr; + AllocateFilterOutputTensor(context, conv_prim_desc, + TFShapeToMklDnnDims(filter_tf_shape), + &filter_out_tensor); + PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output, + filter_out_tensor); } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + std::string(e.message) + - ", in file " + std::string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", error_msg)); + ", message: " + std::string(e.message) + ", in file " + + std::string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -655,71 +660,67 @@ class MklConv2DOp : public OpKernel { std::vector strides_; Padding padding_; TensorFormat data_format_; - const int kInputIndex_Src = 0, - kInputIndex_Filter = 1, - kInputIndex_Bias = 2; + const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2; const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1; // Allocate output tensor. void AllocateOutputTensor( - OpKernelContext* context, - const convolution_forward::primitive_desc& conv_prim_desc, - const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); - auto dst_pd = conv_prim_desc.dst_primitive_desc(); + OpKernelContext* context, + const convolution_forward::primitive_desc& conv_prim_desc, + const memory::dims& output_dims_mkl_order, + memory::format output_tf_format, Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); + auto dst_pd = conv_prim_desc.dst_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); - output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); + // Allocate shape of Mkl tensor. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SetMklLayout(&dst_pd); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); - // Allocate shape of TF tensor. - TensorShape output_tf_shape; - output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T))); + // Allocate shape of TF tensor. 
+ TensorShape output_tf_shape; + output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T))); - AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, + output_tf_shape, output_mkl_shape); } // Allocate output tensor. void AllocateFilterOutputTensor( - OpKernelContext* context, - const convolution_forward::primitive_desc& conv_prim_desc, - const memory::dims& filter_dims_tf_order, - Tensor** filter_tensor) { - CHECK_NOTNULL(filter_tensor); - auto filter_pd = conv_prim_desc.weights_primitive_desc(); + OpKernelContext* context, + const convolution_forward::primitive_desc& conv_prim_desc, + const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) { + CHECK_NOTNULL(filter_tensor); + auto filter_pd = conv_prim_desc.weights_primitive_desc(); - // Allocate shape of Mkl tensor. - MklDnnShape filter_mkl_shape; - filter_mkl_shape.SetMklTensor(true); - filter_mkl_shape.SetMklLayout(&filter_pd); - filter_mkl_shape.SetElemType(MklDnnType()); + // Allocate shape of Mkl tensor. + MklDnnShape filter_mkl_shape; + filter_mkl_shape.SetMklTensor(true); + filter_mkl_shape.SetMklLayout(&filter_pd); + filter_mkl_shape.SetElemType(MklDnnType()); - // The format of the filter is actually OIhw8i8o, but TF doesn't support - // this format. Just use format::blocked for now because the layout - // is stored in the MKL data. - filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(), - filter_dims_tf_order, memory::format::blocked); + // The format of the filter is actually OIhw8i8o, but TF doesn't support + // this format. Just use format::blocked for now because the layout + // is stored in the MKL data. + filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(), + filter_dims_tf_order, memory::format::blocked); - // Allocate the data space for the filter to propagate as TF tensor. - TensorShape filter_tf_shape; - filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T))); + // Allocate the data space for the filter to propagate as TF tensor. + TensorShape filter_tf_shape; + filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T))); - AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor, - filter_tf_shape, filter_mkl_shape); + AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor, + filter_tf_shape, filter_mkl_shape); } // Prepare and execute net - checks for input and output reorders. void PrepareAndExecuteNet( - const convolution_forward::primitive_desc& conv_prim_desc, - MklDnnData* src, MklDnnData* filter, - MklDnnData* bias, MklDnnData* output, - Tensor* filter_out_tensor) { + const convolution_forward::primitive_desc& conv_prim_desc, + MklDnnData* src, MklDnnData* filter, MklDnnData* bias, + MklDnnData* output, Tensor* filter_out_tensor) { CHECK_NOTNULL(filter_out_tensor); // Create reorders between user layout and MKL layout if it is needed and @@ -731,18 +732,20 @@ class MklConv2DOp : public OpKernel { // rather than re-order to a temp buffer, reorder directly to the // filter output tensor filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(), - filter->GetTensorBuffer(filter_out_tensor), &net); + filter->GetTensorBuffer(filter_out_tensor), + &net); // Create convolution primitive and add it to net. 
if (bias) { CHECK_EQ(biasEnabled, true); net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(), - filter->GetOpMem(), bias->GetOpMem(), - output->GetOpMem())); + filter->GetOpMem(), bias->GetOpMem(), + output->GetOpMem())); } else { CHECK_EQ(biasEnabled, false); net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(), - filter->GetOpMem(), output->GetOpMem())); + filter->GetOpMem(), + output->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index c6456bd5c33..9dd88221a84 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_ -#include #include #include +#include #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -27,8 +27,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/bounds_check.h" -#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/kernels/conv_grad_ops.h" +#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -40,19 +40,19 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; using mkldnn::prop_kind; +using mkldnn::stream; -using mkldnn::convolution_forward; using mkldnn::convolution_direct; +using mkldnn::convolution_forward; #endif namespace tensorflow { -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML class MklDnnConvUtil { protected: @@ -63,13 +63,13 @@ class MklDnnConvUtil { public: MklDnnConvUtil(OpKernelContext* context, const std::vector& strides, - Padding pad, TensorFormat fm) : context_(context), - strides_(strides), padding_(pad), data_format_(fm) {} + Padding pad, TensorFormat fm) + : context_(context), strides_(strides), padding_(pad), data_format_(fm) {} virtual ~MklDnnConvUtil() { context_ = nullptr; } // Calculate Convolution strides - virtual inline void GetStridesInMklOrder(memory::dims *strides) { + virtual inline void GetStridesInMklOrder(memory::dims* strides) { // For now we take the stride from the second and third dimensions only // (we do not support striding on the batch or depth dimension). CHECK_NOTNULL(strides); @@ -82,14 +82,14 @@ class MklDnnConvUtil { // requires input in NCHW format. Function does not return anything. // But errors arising from sanity checks are returned in context's // status. 
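// For example (illustrative shapes only): a TF NHWC input of
// [8, 224, 224, 3] comes back from this helper in MKL-DNN's NCHW order as
// input_dims = {8, 3, 224, 224}. Batch and spatial extents are also
// range-checked via the CHECK_BOUNDS macro defined below before the cast
// to int.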
- virtual inline void - GetInputSizeInMklOrder(const TensorShape& input_shape, - memory::dims *input_dims) { - #define CHECK_BOUNDS(val, err_msg) do { \ - OP_REQUIRES(context_, FastBoundsCheck(val, \ - std::numeric_limits::max()), \ - errors::InvalidArgument(err_msg)); \ - }while(0) + virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape, + memory::dims* input_dims) { +#define CHECK_BOUNDS(val, err_msg) \ + do { \ + OP_REQUIRES(context_, \ + FastBoundsCheck(val, std::numeric_limits::max()), \ + errors::InvalidArgument(err_msg)); \ + } while (0) CHECK_NOTNULL(input_dims); @@ -112,7 +112,7 @@ class MklDnnConvUtil { CHECK_BOUNDS(input_batch_raw, "Input batch too large"); int input_batch = static_cast(input_batch_raw); - #undef CHECK_BOUNDS +#undef CHECK_BOUNDS // MKL-DNN always requires input in NCHW format. std::vector mkldnn_sizes(4, -1); @@ -138,10 +138,9 @@ class MklDnnConvUtil { // forward gets actual tensor as input). // // TODO(nhasabni): Add similar function for input and filter in MklShape. - virtual inline void - GetFilterSizeInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - memory::dims *filter_dims) { + virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape, + const TensorShape& filter_shape, + memory::dims* filter_dims) { CHECK_NOTNULL(filter_dims); OP_REQUIRES(context_, filter_shape.dims() == 4, @@ -149,17 +148,18 @@ class MklDnnConvUtil { filter_shape.DebugString())); for (int i = 0; i < 3; i++) { - OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i), - std::numeric_limits::max()), - errors::InvalidArgument("filter too large")); + OP_REQUIRES(context_, + FastBoundsCheck(filter_shape.dim_size(i), + std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); } int input_depth = GetTensorDim(input_shape, data_format_, 'C'); - OP_REQUIRES( - context_, input_depth == filter_shape.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - input_depth, " vs ", filter_shape.dim_size(2))); + OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", input_depth, + " vs ", filter_shape.dim_size(2))); // TF filter is always in (rows, cols, in_depth, out_depth) order. int filter_rows = static_cast(filter_shape.dim_size(0)); @@ -182,25 +182,24 @@ class MklDnnConvUtil { // requires filter in OIHW format. Function does not return anything. // But errors arising from sanity checks are returned in context's // status. - virtual inline void - GetFilterSizeInMklOrder(size_t src_index, size_t filter_index, - memory::dims *filter_dims) { + virtual inline void GetFilterSizeInMklOrder(size_t src_index, + size_t filter_index, + memory::dims* filter_dims) { CHECK_NOTNULL(filter_dims); GetFilterSizeInMklOrder(GetTfShape(context_, src_index), - GetTfShape(context_, filter_index), - filter_dims); + GetTfShape(context_, filter_index), filter_dims); } // Calculate Bias size for 2D Convolution. Function does not return // anything, but sets error in context status. 
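// The bias is expected to be 1-D with one value per output channel, so for
// a convolution producing 64 output channels (illustrative value) this
// simply yields *bias_dims = {64}.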
- virtual inline void - GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) { + virtual inline void GetBiasSizeInMklOrder(size_t bias_index, + memory::dims* bias_dims) { const Tensor& bias = MklGetInput(context_, bias_index); OP_REQUIRES(context_, bias.dims() == 1, errors::InvalidArgument("bias must be 1-dimensional: ", bias.shape().DebugString())); - *bias_dims = { static_cast(bias.dim_size(0)) }; + *bias_dims = {static_cast(bias.dim_size(0))}; } // Function to calculate output and padding size for 2D convolution. @@ -212,13 +211,11 @@ class MklDnnConvUtil { // status is returned via context status. // // TODO(nhasabni): Add similar function for input and filter in MklShape. - virtual inline void - GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - const memory::dims& strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, memory::dims *pad_r) { + virtual inline void GetOutputAndPadSizeInMklOrder( + const TensorShape& input_shape, const TensorShape& filter_shape, + const memory::dims& strides, memory::dims* output_dims_tf_order, + memory::dims* output_dims_mkl_order, memory::dims* pad_l, + memory::dims* pad_r) { CHECK_NOTNULL(output_dims_tf_order); CHECK_NOTNULL(output_dims_mkl_order); CHECK_NOTNULL(pad_l); @@ -244,16 +241,16 @@ class MklDnnConvUtil { int64 out_rows = 0, out_cols = 0; int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right; - OP_REQUIRES_OK(context_, - GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows, - padding_, &out_rows, &pad_top, &pad_bottom)); - OP_REQUIRES_OK(context_, - GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols, - padding_, &out_cols, &pad_left, &pad_right)); + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_rows, filter_rows, stride_rows, padding_, + &out_rows, &pad_top, &pad_bottom)); + OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose( + input_cols, filter_cols, stride_cols, padding_, + &out_cols, &pad_left, &pad_right)); // Tensorflow output is in data_format order. (NHWC or NCHW) - TensorShape out_shape = ShapeFromFormat(data_format_, out_batch, - out_rows, out_cols, out_depth); + TensorShape out_shape = + ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth); *output_dims_tf_order = TFShapeToMklDnnDims(out_shape); // MKL-DNN always needs output in NCHW format. @@ -273,12 +270,10 @@ class MklDnnConvUtil { // See comment on GetConvOutputAndPadSizeInMklOrder for parameters. // // Function does not return anything, but sets error in context status. 
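// As a reminder of what GetWindowedOutputSizeVerbose computes (standard
// TensorFlow sizing, restated here only for context):
//   VALID : out = ceil((in - filter + 1) / stride), no padding;
//   SAME  : out = ceil(in / stride), with the required padding split
//           between pad_top/pad_bottom and pad_left/pad_right.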
- inline void - GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index, - const memory::dims& strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, memory::dims *pad_r) { + inline void GetOutputAndPadSizeInMklOrder( + size_t src_index, size_t filter_index, const memory::dims& strides, + memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order, + memory::dims* pad_l, memory::dims* pad_r) { CHECK_NOTNULL(output_dims_tf_order); CHECK_NOTNULL(output_dims_mkl_order); CHECK_NOTNULL(pad_l); @@ -289,11 +284,11 @@ class MklDnnConvUtil { OP_REQUIRES(context_, input_tf_shape.dims() == 4, errors::InvalidArgument("input must be 4-dimensional", - input_tf_shape.DebugString())); + input_tf_shape.DebugString())); - GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, - strides, output_dims_tf_order, - output_dims_mkl_order, pad_l, pad_r); + GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides, + output_dims_tf_order, output_dims_mkl_order, + pad_l, pad_r); } // Wrapper function to calculate input, filter, and output sizes of @@ -302,15 +297,12 @@ class MklDnnConvUtil { // also calculates strides and paddings for 2D Convolution. // // Function does not return anything, but sets error in context status. - inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape, - const TensorShape& filter_shape, - memory::dims *input_dims, - memory::dims *filter_dims, - memory::dims *strides, - memory::dims *output_dims_tf_order, - memory::dims *output_dims_mkl_order, - memory::dims *pad_l, - memory::dims *pad_r) { + inline void GetConvFwdSizesInMklOrder( + const TensorShape& input_shape, const TensorShape& filter_shape, + memory::dims* input_dims, memory::dims* filter_dims, + memory::dims* strides, memory::dims* output_dims_tf_order, + memory::dims* output_dims_mkl_order, memory::dims* pad_l, + memory::dims* pad_r) { CHECK_NOTNULL(input_dims); CHECK_NOTNULL(filter_dims); CHECK_NOTNULL(strides); @@ -325,8 +317,7 @@ class MklDnnConvUtil { if (!context_->status().ok()) return; GetStridesInMklOrder(strides); GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides, - output_dims_tf_order, - output_dims_mkl_order, + output_dims_tf_order, output_dims_mkl_order, pad_l, pad_r); if (!context_->status().ok()) return; } @@ -337,7 +328,7 @@ class MklDnnConvUtil { ///////////////////////////////////////////////////////////////////// template -class MklConv2DBackpropCommonOp : public OpKernel { +class MklConv2DBackpropCommonOp : public OpKernel { public: ~MklConv2DBackpropCommonOp() {} explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context) @@ -397,12 +388,11 @@ class MklConv2DBackpropCommonOp : public OpKernel { outbprop_tf_shape.num_elements() == 0) { MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(false); - TensorShape output_tf_shape = GetOutputTfShape(input_tf_shape, - filter_tf_shape, - outbprop_tf_shape); + TensorShape output_tf_shape = GetOutputTfShape( + input_tf_shape, filter_tf_shape, outbprop_tf_shape); const int kOutputIdx = 0; AllocateOutputSetMklShape(context, kOutputIdx, &output_tensor, - output_tf_shape, output_mkl_shape); + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(output_tensor); // if output tensor has more than 0 elements, we need to 0 them out. @@ -421,12 +411,10 @@ class MklConv2DBackpropCommonOp : public OpKernel { // Get forward convolution parameters. 
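// conv_utl below computes, in one call, the MKL-order input and filter
// dims, the strides, the output dims in both TF and MKL order, and the
// left/right padding. The backward ops need all of these because, as noted
// further down, MKL-DNN's backward primitives are constructed from the
// corresponding forward descriptor.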
MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_); - conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape, - &fwd_input_dims, &fwd_filter_dims, - &strides, - &fwd_output_dims_tf_order, - &fwd_output_dims, - &padding_l, &padding_r); + conv_utl.GetConvFwdSizesInMklOrder( + input_tf_shape, filter_tf_shape, &fwd_input_dims, &fwd_filter_dims, + &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l, + &padding_r); if (!context->status().ok()) return; // Create Convolution forward descriptor since Convolution backward @@ -437,20 +425,22 @@ class MklConv2DBackpropCommonOp : public OpKernel { // construct input TF layout. For TF layout, although input shape // required is in MKL-DNN order, the layout is Tensorflow's layout // (NHWC or NCHW depending on data format). - auto fwd_input_md = input_mkl_shape.IsMklTensor() ? - input_mkl_shape.GetMklLayout() : - memory::desc(fwd_input_dims, MklDnnType(), tf_fmt); + auto fwd_input_md = + input_mkl_shape.IsMklTensor() + ? input_mkl_shape.GetMklLayout() + : memory::desc(fwd_input_dims, MklDnnType(), tf_fmt); // If filter is in MKL layout, then simply grab filter layout; otherwise // construct filter in TF layout. For TF layout, filter is in HWIO format. - auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ? - filter_mkl_shape.GetMklLayout() : - memory::desc(fwd_filter_dims, MklDnnType(), - memory::format::hwio); + auto fwd_filter_md = filter_mkl_shape.IsMklTensor() + ? filter_mkl_shape.GetMklLayout() + : memory::desc(fwd_filter_dims, MklDnnType(), + memory::format::hwio); // Tensorflow Output of Conv2D is in data_format order. auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType(), tf_fmt); - auto fwd_desc = convolution_forward::desc(prop_kind::forward, - convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md, - strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_)); + auto fwd_desc = convolution_forward::desc( + prop_kind::forward, convolution_direct, fwd_input_md, fwd_filter_md, + fwd_out_md, strides, padding_l, padding_r, + TFPaddingToMklDnnPadding(padding_)); auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine); // Create memory for user data. Describe how the inputs and outputs of @@ -495,17 +485,16 @@ class MklConv2DBackpropCommonOp : public OpKernel { // Operator-specific call to create and execute primitive. CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter, - &outbackprop, &output, &output_tensor, - strides, padding_l, padding_r, - TFPaddingToMklDnnPadding(padding_), + &outbackprop, &output, &output_tensor, strides, padding_l, + padding_r, TFPaddingToMklDnnPadding(padding_), bwd_output_dims, bwd_output_format); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:", - error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -523,11 +512,11 @@ class MklConv2DBackpropCommonOp : public OpKernel { /// Get TensorFlow shape of input tensor. 
virtual TensorShape MakeInputTfShape(OpKernelContext* context, - const Tensor& input_tensor) = 0; + const Tensor& input_tensor) = 0; /// Get TensorFlow shape of filter tensor. virtual TensorShape MakeFilterTfShape(OpKernelContext* context, - const Tensor& filter_tensor) = 0; + const Tensor& filter_tensor) = 0; /// Get the TensorFlow shape of output tensor. virtual TensorShape GetOutputTfShape(const TensorShape& input_shape, @@ -536,9 +525,9 @@ class MklConv2DBackpropCommonOp : public OpKernel { /// Get shape of output in MKL-DNN order. Computes shape of output from /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims). - virtual - const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims, - const memory::dims& fwd_filter_dims) = 0; + virtual const memory::dims& GetOutputDims( + const memory::dims& fwd_input_dims, + const memory::dims& fwd_filter_dims) = 0; /// Get data_format of output in MKL-DNN order. If output data format is /// same as input data format, then it simply returns value of data_format @@ -546,24 +535,25 @@ class MklConv2DBackpropCommonOp : public OpKernel { virtual memory::format GetOutputFormat(const memory::format data_format) = 0; /// Create and execute the primitive storing output in the output_tensor. - virtual void CreatePrimitive(OpKernelContext* context, - const engine& cpu_engine, - const convolution_forward::primitive_desc& conv_fwd_pd, - MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, - MklDnnData* output, Tensor** output_tensor, const memory::dims& strides, - const memory::dims& padding_l, const memory::dims& padding_r, - padding_kind padding, const memory::dims& bwd_output_dims, - memory::format bwd_output_format) = 0; + virtual void CreatePrimitive( + OpKernelContext* context, const engine& cpu_engine, + const convolution_forward::primitive_desc& conv_fwd_pd, + MklDnnData* input, MklDnnData* filter, MklDnnData* outbackprop, + MklDnnData* output, Tensor** output_tensor, + const memory::dims& strides, const memory::dims& padding_l, + const memory::dims& padding_r, padding_kind padding, + const memory::dims& bwd_output_dims, + memory::format bwd_output_format) = 0; // Get the data_format {NCHW, NHWC} - TensorFormat GetTFDataFormat () { return data_format_; } + TensorFormat GetTFDataFormat() { return data_format_; } private: std::vector strides_; Padding padding_; TensorFormat data_format_; }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML ///////////////////////////////////////////////////////////////////// /// Dummy Mkl op that is just used for operators that are intermediate @@ -575,12 +565,12 @@ class MklDummyOp : public OpKernel { public: ~MklDummyOp() {} - explicit MklDummyOp(OpKernelConstruction* context) : - OpKernel(context) {} + explicit MklDummyOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { - TF_CHECK_OK(errors::Unimplemented("This is a dummy op." - "It should not have been invoked.")); + TF_CHECK_OK( + errors::Unimplemented("This is a dummy op." + "It should not have been invoked.")); } }; diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc index c065724e0db..58f0c30f32b 100644 --- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc +++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0(the "License"); you may not use this file except in compliance with the License. diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index 8340a91d059..8313224d7fe 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -25,15 +25,15 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; -using mkldnn::use_scale_shift; -using mkldnn::use_global_stats; -using mkldnn::batch_normalization_forward; using mkldnn::batch_normalization_backward; +using mkldnn::batch_normalization_forward; +using mkldnn::prop_kind; +using mkldnn::stream; +using mkldnn::use_global_stats; +using mkldnn::use_scale_shift; #endif // TODO(inteltf) Address comments from PR 8968. @@ -41,7 +41,7 @@ using mkldnn::batch_normalization_backward; namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklFusedBatchNormOp : public OpKernel { @@ -601,7 +601,7 @@ class MklFusedBatchNormGradOp : public OpKernel { mkl_res_batchnorm_bwd[dnnResourceSrc] = (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input; - bool mkl_convert_out_backprop; + bool mkl_convert_out_backprop; dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr; dnnLayout_t mkl_lt_internal_out_backprop = nullptr; void* mkl_buf_converted_out_backprop = nullptr; @@ -683,7 +683,7 @@ class MklFusedBatchNormGradOp : public OpKernel { }; #endif -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template class MklFusedBatchNormOp : public OpKernel { @@ -709,12 +709,11 @@ class MklFusedBatchNormOp : public OpKernel { const size_t kMeanIndex = 3; // index of est_mean tensor const size_t kVarianceIndex = 4; // index of est_variance tensor - const Tensor& src_tensor = MklGetInput(context, kSrcIndex); - const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); - const Tensor& shift_tensor = MklGetInput(context, kShiftIndex); - const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex); - const Tensor& est_variance_tensor = MklGetInput(context, - kVarianceIndex); + const Tensor& src_tensor = MklGetInput(context, kSrcIndex); + const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); + const Tensor& shift_tensor = MklGetInput(context, kShiftIndex); + const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex); + const Tensor& est_variance_tensor = MklGetInput(context, kVarianceIndex); TensorShape tf_shape_src; MklDnnShape dnn_shape_src; @@ -723,37 +722,34 @@ class MklFusedBatchNormOp : public OpKernel { if (dnn_shape_src.IsMklTensor()) { tf_shape_src = dnn_shape_src.GetTfShape(); OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } else { tf_shape_src = src_tensor.shape(); OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } OP_REQUIRES(context, scale_tensor.dims() == 1, - errors::InvalidArgument( - "scale must be 1-dimensional", - scale_tensor.shape().DebugString())); + 
errors::InvalidArgument("scale must be 1-dimensional", + scale_tensor.shape().DebugString())); OP_REQUIRES(context, shift_tensor.dims() == 1, errors::InvalidArgument("offset must be 1-dimensional", - shift_tensor.shape().DebugString())); - OP_REQUIRES(context, est_mean_tensor.dims() == 1, - errors::InvalidArgument( - "estimated_mean must be 1-dimensional", - est_mean_tensor.shape().DebugString())); - OP_REQUIRES(context, est_variance_tensor.dims() == 1, - errors::InvalidArgument( - "estimated_variance must be 1-dimensional", - est_variance_tensor.shape().DebugString())); + shift_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_mean_tensor.dims() == 1, + errors::InvalidArgument("estimated_mean must be 1-dimensional", + est_mean_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_variance_tensor.dims() == 1, + errors::InvalidArgument("estimated_variance must be 1-dimensional", + est_variance_tensor.shape().DebugString())); if (is_training_) { - OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0, - errors::InvalidArgument( - "estimated_mean must be empty for training", - est_mean_tensor.shape().DebugString())); + OP_REQUIRES( + context, est_mean_tensor.dim_size(0) == 0, + errors::InvalidArgument("estimated_mean must be empty for training", + est_mean_tensor.shape().DebugString())); OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0, errors::InvalidArgument( "estimated_variance must be empty for training", @@ -763,11 +759,9 @@ class MklFusedBatchNormOp : public OpKernel { // special case: input with 0 element and 0 batch size Tensor* dst_tensor = nullptr; if (tf_shape_src.num_elements() == 0) { - HandleEmptyInput(context, - tf_shape_src, - scale_tensor.shape(), - &dst_tensor); - return; + HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(), + &dst_tensor); + return; } if (dnn_shape_src.IsMklTensor()) @@ -783,11 +777,8 @@ class MklFusedBatchNormOp : public OpKernel { Tensor* batch_variance_tensor = nullptr; Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; - AllocateTFOutputs(context, - scale_tensor.shape(), - &batch_mean_tensor, - &batch_variance_tensor, - &saved_mean_tensor, + AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor, + &batch_variance_tensor, &saved_mean_tensor, &saved_variance_tensor); if (is_training_) @@ -815,69 +806,63 @@ class MklFusedBatchNormOp : public OpKernel { src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(), tensor_format_); } else { - src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), - tensor_format_); + src_dims = + TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_); } auto src_md = dnn_shape_src.IsMklTensor() - ? dnn_shape_src.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), format_m); + ? dnn_shape_src.GetMklLayout() + : memory::desc(src_dims, MklDnnType(), format_m); src.SetUsrMem(src_md, &src_tensor); // set weights primitive // MKL-DNN packs scale & shift as "weights": // ...... 
- auto weights_desc = memory::desc({2, depth_}, - MklDnnType(), - memory::format::nc); + auto weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine); auto weights_m = memory(weights_pd); - T* weights_data = reinterpret_cast( - weights_m.get_data_handle()); - T* scale_tf = reinterpret_cast( - const_cast(scale_tensor.flat().data())); - T* shift_tf = reinterpret_cast( - const_cast(shift_tensor.flat().data())); + T* weights_data = reinterpret_cast(weights_m.get_data_handle()); + T* scale_tf = + reinterpret_cast(const_cast(scale_tensor.flat().data())); + T* shift_tf = + reinterpret_cast(const_cast(shift_tensor.flat().data())); - for (int k=0; k < depth_; k++) { + for (int k = 0; k < depth_; k++) { weights_data[k] = scale_tf[k]; weights_data[k + depth_] = shift_tf[k]; } // set mean primitive - auto mean_desc = memory::desc({1, depth_}, - MklDnnType(), - memory::format::nc); + auto mean_desc = + memory::desc({1, depth_}, MklDnnType(), memory::format::nc); auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine); - char* saved_mean_data_tf = reinterpret_cast - (saved_mean_tensor->flat().data()); - std::memcpy(saved_mean_data_tf, - reinterpret_cast(mean_values_), - depth_*sizeof(T)); - auto mean_m = memory(mean_pd, - reinterpret_cast(saved_mean_data_tf)); + char* saved_mean_data_tf = + reinterpret_cast(saved_mean_tensor->flat().data()); + std::memcpy(saved_mean_data_tf, reinterpret_cast(mean_values_), + depth_ * sizeof(T)); + auto mean_m = + memory(mean_pd, reinterpret_cast(saved_mean_data_tf)); // set variance primitive - auto variance_desc = memory::desc({1, depth_}, - MklDnnType(), - memory::format::nc); + auto variance_desc = + memory::desc({1, depth_}, MklDnnType(), memory::format::nc); auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine); - char* saved_variance_data_tf = reinterpret_cast - (saved_variance_tensor->flat().data()); + char* saved_variance_data_tf = + reinterpret_cast(saved_variance_tensor->flat().data()); std::memcpy(saved_variance_data_tf, reinterpret_cast(variance_values_), - depth_*sizeof(T)); + depth_ * sizeof(T)); auto variance_m = memory(variance_pd, saved_variance_data_tf); - prop_kind pk = (is_training_) ? - prop_kind::forward_training : - prop_kind::forward_scoring; + prop_kind pk = (is_training_) ? prop_kind::forward_training + : prop_kind::forward_scoring; auto bnrm_fwd_desc = batch_normalization_forward::desc( - pk, src.GetUsrMemDesc(), epsilon_, - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + pk, src.GetUsrMemDesc(), epsilon_, + is_training_ ? 
use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc( - bnrm_fwd_desc, cpu_engine); + bnrm_fwd_desc, cpu_engine); // allocate dst tensor MklDnnShape dnn_shape_dst; @@ -887,47 +872,39 @@ class MklFusedBatchNormOp : public OpKernel { auto dst_pd = bnrm_fwd_pd.dst_primitive_desc(); dnn_shape_dst.SetMklLayout(&dst_pd); dnn_shape_dst.SetElemType(MklDnnType()); - dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), - src_dims, format_m); - tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T)); + dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), src_dims, + format_m); + tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T)); } else { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); } - AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, - tf_shape_dst, dnn_shape_dst); + AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, tf_shape_dst, + dnn_shape_dst); // Output of batchnorm has same shape as input. dst.SetUsrMem(src_md, dst_tensor); primitive bnrm_fwd_op; if (is_training_) { - bnrm_fwd_op = batch_normalization_forward( - bnrm_fwd_pd, - src.GetOpMem(), - weights_m, - dst.GetOpMem(), - mean_m, - variance_m); + bnrm_fwd_op = + batch_normalization_forward(bnrm_fwd_pd, src.GetOpMem(), weights_m, + dst.GetOpMem(), mean_m, variance_m); } else { bnrm_fwd_op = batch_normalization_forward( - bnrm_fwd_pd, - src.GetOpMem(), - mean_m, - variance_m, - (const primitive::at) weights_m, - dst.GetOpMem()); + bnrm_fwd_pd, src.GetOpMem(), mean_m, variance_m, + (const primitive::at)weights_m, dst.GetOpMem()); } std::vector net; net.push_back(bnrm_fwd_op); stream(stream::kind::eager).submit(net).wait(); // copy batch_mean data - T* batch_mean_data_tf = reinterpret_cast( - batch_mean_tensor->flat().data()); + T* batch_mean_data_tf = + reinterpret_cast(batch_mean_tensor->flat().data()); std::memcpy(reinterpret_cast(batch_mean_data_tf), reinterpret_cast(mean_m.get_data_handle()), - depth_*sizeof(T)); + depth_ * sizeof(T)); // copy batch_variance data with Bessel's correction // if training mode is on @@ -937,18 +914,17 @@ class MklFusedBatchNormOp : public OpKernel { size_t adjust_size = orig_size - 1; adjust_factor = (static_cast(orig_size)) / adjust_size; } - for (int k=0; k < depth_; k++) + for (int k = 0; k < depth_; k++) batch_variance_tensor->flat().data()[k] = - (reinterpret_cast(variance_m.get_data_handle()))[k] - * adjust_factor; - } catch (mkldnn::error &e) { + (reinterpret_cast(variance_m.get_data_handle()))[k] * + adjust_factor; + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } @@ -958,7 +934,7 @@ class MklFusedBatchNormOp : public OpKernel { bool is_training_; T* mean_values_; T* variance_values_; - size_t depth_; // batch normalization is done for per channel. + size_t depth_; // batch normalization is done for per channel. 
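// Note on the variance adjustment in Compute() above: batch_variance is
// rescaled by adjust_factor = N / (N - 1) (Bessel's correction), where N is
// the per-channel element count. E.g. with N = 18 the factor is
// 18 / 17 ~= 1.059, turning the biased batch variance into the unbiased
// estimate reported as batch_variance.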
void ExtractParams(OpKernelContext* context) { const Tensor& input = MklGetInput(context, 0); @@ -966,23 +942,20 @@ class MklFusedBatchNormOp : public OpKernel { } void SetMeanVariance(const Tensor& mean, const Tensor& variance) { - mean_values_ = reinterpret_cast( - const_cast(mean.flat().data())); - variance_values_ = reinterpret_cast( - const_cast(variance.flat().data())); + mean_values_ = reinterpret_cast(const_cast(mean.flat().data())); + variance_values_ = + reinterpret_cast(const_cast(variance.flat().data())); } - void HandleEmptyInput(OpKernelContext* context, - TensorShape tf_shape_src, - TensorShape tf_shape_scale, - Tensor** dst_tensor) { + void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src, + TensorShape tf_shape_scale, Tensor** dst_tensor) { CHECK_NOTNULL(dst_tensor); const size_t kDstIndex = 0; MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); - AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, - tf_shape_src, dnn_shape_dst); + AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, tf_shape_src, + dnn_shape_dst); CHECK_NOTNULL(*dst_tensor); memset(const_cast((*dst_tensor)->tensor_data().data()), 0, (*dst_tensor)->tensor_data().size()); @@ -991,15 +964,12 @@ class MklFusedBatchNormOp : public OpKernel { Tensor* batch_variance_tensor = nullptr; Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; - AllocateTFOutputs(context, tf_shape_scale, - &batch_mean_tensor, - &batch_variance_tensor, - &saved_mean_tensor, + AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor, + &batch_variance_tensor, &saved_mean_tensor, &saved_variance_tensor); } - void AllocateTFOutputs(OpKernelContext* context, - TensorShape tf_shape_scale, + void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale, Tensor** batch_mean_tensor, Tensor** batch_variance_tensor, Tensor** saved_mean_tensor, @@ -1017,51 +987,43 @@ class MklFusedBatchNormOp : public OpKernel { // allocate batch mean output tensor MklDnnShape mkl_shape_batch_mean; mkl_shape_batch_mean.SetMklTensor(false); - AllocateOutputSetMklShape(context, - kBatchMeanIndex, - batch_mean_tensor, - tf_shape_scale, - mkl_shape_batch_mean); + AllocateOutputSetMklShape(context, kBatchMeanIndex, batch_mean_tensor, + tf_shape_scale, mkl_shape_batch_mean); CHECK_NOTNULL(*batch_mean_tensor); // set NAN mean value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*batch_mean_tensor)->flat().data()[k] = NAN; // allocate batch variance output tensor MklDnnShape mkl_shape_batch_variance; mkl_shape_batch_variance.SetMklTensor(false); - AllocateOutputSetMklShape(context, - kBatchVarianceIndex, - batch_variance_tensor, - tf_shape_scale, + AllocateOutputSetMklShape(context, kBatchVarianceIndex, + batch_variance_tensor, tf_shape_scale, mkl_shape_batch_variance); CHECK_NOTNULL(*batch_variance_tensor); // set NAN variance value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*batch_variance_tensor)->flat().data()[k] = NAN; // Mean and variance (without Bessel's correction) saved for backward // computation to serve as pre-computed mean and variance. 
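// The op therefore exposes two pairs of statistics: batch_mean and
// batch_variance (the latter Bessel-corrected, typically used to update the
// moving averages) versus saved_mean and saved_variance (raw batch
// statistics, consumed again by the gradient kernel further below so they
// need not be recomputed).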
MklDnnShape mkl_shape_saved_mean; mkl_shape_saved_mean.SetMklTensor(false); - AllocateOutputSetMklShape(context, kSavedMeanIndex, - saved_mean_tensor, - tf_shape_scale, - mkl_shape_saved_mean); + AllocateOutputSetMklShape(context, kSavedMeanIndex, saved_mean_tensor, + tf_shape_scale, mkl_shape_saved_mean); CHECK_NOTNULL(*saved_mean_tensor); // set NAN mean value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*saved_mean_tensor)->flat().data()[k] = NAN; MklDnnShape mkl_shape_saved_variance; mkl_shape_saved_variance.SetMklTensor(false); AllocateOutputSetMklShape(context, kSavedVarianceIndex, - saved_variance_tensor, - tf_shape_scale, + saved_variance_tensor, tf_shape_scale, mkl_shape_saved_variance); CHECK_NOTNULL(*saved_variance_tensor); // set NAN variance value in case of empty input tensor - for (int k=0; k < tf_shape_scale.num_elements(); k++) + for (int k = 0; k < tf_shape_scale.num_elements(); k++) (*saved_variance_tensor)->flat().data()[k] = NAN; } }; @@ -1093,8 +1055,8 @@ class MklFusedBatchNormGradOp : public OpKernel { const Tensor& src_tensor = MklGetInput(context, kSrcIndex); const Tensor& scale_tensor = MklGetInput(context, kScaleIndex); const Tensor& saved_mean_tensor = MklGetInput(context, kMeanIndex); - const Tensor& saved_variance_tensor = MklGetInput(context, - kVarianceIndex); + const Tensor& saved_variance_tensor = + MklGetInput(context, kVarianceIndex); MklDnnShape dnn_shape_src, dnn_shape_diff_dst; GetMklShape(context, kSrcIndex, &dnn_shape_src); @@ -1103,53 +1065,49 @@ class MklFusedBatchNormGradOp : public OpKernel { if (dnn_shape_diff_dst.IsMklTensor()) { tf_shape_diff_dst = dnn_shape_diff_dst.GetTfShape(); - OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - diff_dst_tensor.shape().DebugString())); + OP_REQUIRES( + context, dnn_shape_diff_dst.GetDimension() == 4, + errors::InvalidArgument("input must be 4-dimensional", + diff_dst_tensor.shape().DebugString())); } else { tf_shape_diff_dst = diff_dst_tensor.shape(); - OP_REQUIRES(context, diff_dst_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - diff_dst_tensor.shape().DebugString())); + OP_REQUIRES( + context, diff_dst_tensor.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + diff_dst_tensor.shape().DebugString())); } if (dnn_shape_src.IsMklTensor()) { tf_shape_src = dnn_shape_src.GetTfShape(); OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } else { tf_shape_src = src_tensor.shape(); OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument( - "input must be 4-dimensional", - src_tensor.shape().DebugString())); + errors::InvalidArgument("input must be 4-dimensional", + src_tensor.shape().DebugString())); } OP_REQUIRES(context, scale_tensor.dims() == 1, - errors::InvalidArgument( - "scale must be 1-dimensional", - scale_tensor.shape().DebugString())); - OP_REQUIRES(context, saved_mean_tensor.dims() == 1, - errors::InvalidArgument( - "saved mean must be 1-dimensional", - saved_mean_tensor.shape().DebugString())); + errors::InvalidArgument("scale must be 1-dimensional", + scale_tensor.shape().DebugString())); + OP_REQUIRES( + context, saved_mean_tensor.dims() == 1, + 
errors::InvalidArgument("saved mean must be 1-dimensional", + saved_mean_tensor.shape().DebugString())); - OP_REQUIRES(context, saved_variance_tensor.dims() == 1, - errors::InvalidArgument( - "saved variance must be 1-dimensional", - saved_variance_tensor.shape().DebugString())); + OP_REQUIRES( + context, saved_variance_tensor.dims() == 1, + errors::InvalidArgument("saved variance must be 1-dimensional", + saved_variance_tensor.shape().DebugString())); Tensor* diff_src_tensor = nullptr; if (tf_shape_src.num_elements() == 0 || tf_shape_diff_dst.num_elements() == 0) { - HandleEmptyInput(context, tf_shape_src, - scale_tensor.shape(), - &diff_src_tensor); - return; + HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(), + &diff_src_tensor); + return; } if (dnn_shape_src.IsMklTensor()) @@ -1175,20 +1133,18 @@ class MklFusedBatchNormGradOp : public OpKernel { memory::dims src_dims, diff_dst_dims; if (dnn_shape_src.IsMklTensor()) - src_dims = TFShapeToMklDnnDimsInNCHW( - dnn_shape_src.GetTfShape(), tensor_format_); + src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(), + tensor_format_); else - src_dims = TFShapeToMklDnnDimsInNCHW( - src_tensor.shape(), tensor_format_); + src_dims = + TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_); if (dnn_shape_diff_dst.IsMklTensor()) diff_dst_dims = TFShapeToMklDnnDimsInNCHW( - dnn_shape_diff_dst.GetTfShape(), - tensor_format_); + dnn_shape_diff_dst.GetTfShape(), tensor_format_); else - diff_dst_dims = TFShapeToMklDnnDimsInNCHW( - diff_dst_tensor.shape(), - tensor_format_); + diff_dst_dims = + TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_); // set src and diff_dst primitives memory::desc src_md({}, memory::data_undef, memory::format_undef); @@ -1202,7 +1158,7 @@ class MklFusedBatchNormGradOp : public OpKernel { src_md = diff_dst_md; } } else { - src_md = memory::desc(src_dims, MklDnnType(), format_m); + src_md = memory::desc(src_dims, MklDnnType(), format_m); diff_dst_md = src_md; } src.SetUsrMem(src_md, &src_tensor); @@ -1210,55 +1166,47 @@ class MklFusedBatchNormGradOp : public OpKernel { // weights -- DNN packs scales/shifts as weights in order of // scale, ..., scale, shift, ..., shift - auto weights_desc = memory::desc({2, depth_}, - MklDnnType(), - memory::format::nc); + auto weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine); auto weights_m = memory(weights_pd); T* weights_data = reinterpret_cast(weights_m.get_data_handle()); - T* scale_tf = reinterpret_cast(const_cast - (scale_tensor.flat().data())); - for (int k=0; k < depth_; k++) { + T* scale_tf = + reinterpret_cast(const_cast(scale_tensor.flat().data())); + for (int k = 0; k < depth_; k++) { weights_data[k] = scale_tf[k]; weights_data[k + depth_] = 0; } // set mean primitive memory::dims mv_dims = GetMeanVarianceDims(); - mean.SetUsrMem(mv_dims, - memory::format::nc, - const_cast(static_cast - (saved_mean_tensor.flat().data()))); + mean.SetUsrMem(mv_dims, memory::format::nc, + const_cast(static_cast( + saved_mean_tensor.flat().data()))); mean.SetOpMemDesc(mv_dims, memory::format::nc); // set variance primitive - variance.SetUsrMem(mv_dims, memory::format::nc, - const_cast(static_cast - (saved_variance_tensor.flat().data()))); + variance.SetUsrMem(mv_dims, memory::format::nc, + const_cast(static_cast( + saved_variance_tensor.flat().data()))); variance.SetOpMemDesc(mv_dims, memory::format::nc); // set diff_weight primitive - auto diff_weights_desc = 
memory::desc( - {2, depth_}, - MklDnnType(), - memory::format::nc); - auto diff_weights_pd = memory::primitive_desc( - diff_weights_desc, - cpu_engine); + auto diff_weights_desc = + memory::desc({2, depth_}, MklDnnType(), memory::format::nc); + auto diff_weights_pd = + memory::primitive_desc(diff_weights_desc, cpu_engine); auto diff_weights_m = memory(diff_weights_pd); auto bnrm_fwd_desc = batch_normalization_forward::desc( - prop_kind::forward_training, - src.GetUsrMemDesc(), - epsilon_, - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + prop_kind::forward_training, src.GetUsrMemDesc(), epsilon_, + is_training_ ? use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc( - bnrm_fwd_desc, - cpu_engine); + bnrm_fwd_desc, cpu_engine); // Indices of output tensors - const size_t kDiffSrcIndex = 0; // index of diff_src tensor + const size_t kDiffSrcIndex = 0; // index of diff_src tensor // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; @@ -1268,14 +1216,11 @@ class MklFusedBatchNormGradOp : public OpKernel { auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc(); dnn_shape_diff_src.SetMklLayout(&diff_src_pd); dnn_shape_diff_src.SetElemType(MklDnnType()); - dnn_shape_diff_src.SetTfLayout( - dnn_shape_src.GetDimension(), - src_dims, - format_m); - dnn_shape_diff_src.SetTfDimOrder( - dnn_shape_src.GetDimension(), - tensor_format_); - tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T)); + dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims, + format_m); + dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(), + tensor_format_); + tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); } else { dnn_shape_diff_src.SetMklTensor(false); tf_shape_diff_src = src_tensor.shape(); @@ -1287,33 +1232,22 @@ class MklFusedBatchNormGradOp : public OpKernel { prop_kind pk = prop_kind::backward; auto bnrm_bwd_desc = batch_normalization_backward::desc( - pk, - diff_src.GetUsrMemDesc(), - src.GetUsrMemDesc(), - epsilon_, - /* for inference, specify use_global_stats - 1. on fwd prop, use mean and variance - provided as inputs - 2. on bwd prop, mean and variance are - considered as constants. Thus, - reduce the amout of MKL computations - */ - is_training_ ? use_scale_shift : - (use_scale_shift | use_global_stats)); + pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_, + /* for inference, specify use_global_stats + 1. on fwd prop, use mean and variance + provided as inputs + 2. on bwd prop, mean and variance are + considered as constants. Thus, + reduce the amout of MKL computations + */ + is_training_ ? 
use_scale_shift + : (use_scale_shift | use_global_stats)); auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc( - bnrm_bwd_desc, - cpu_engine, - bnrm_fwd_pd); + bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd); auto bnrm_bwd_op = batch_normalization_backward( - bnrm_bwd_pd, - src.GetOpMem(), - mean.GetOpMem(), - variance.GetOpMem(), - diff_dst.GetOpMem(), - weights_m, - diff_src.GetOpMem(), - diff_weights_m); + bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(), + diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m); std::vector net; net.push_back(bnrm_bwd_op); @@ -1322,43 +1256,39 @@ class MklFusedBatchNormGradOp : public OpKernel { // allocate 4 output TF tensors Tensor* diff_scale_tensor = nullptr; Tensor* diff_shift_tensor = nullptr; - AllocateTFOutputs(context, scale_tensor.shape(), - &diff_scale_tensor, + AllocateTFOutputs(context, scale_tensor.shape(), &diff_scale_tensor, &diff_shift_tensor); // copy data: diff_scale and diff_shift - T* diff_weights_data_dnn = reinterpret_cast - (diff_weights_m.get_data_handle()); + T* diff_weights_data_dnn = + reinterpret_cast(diff_weights_m.get_data_handle()); for (int i = 0; i < depth_; i++) { - diff_scale_tensor->flat().data()[i] = - diff_weights_data_dnn[i]; + diff_scale_tensor->flat().data()[i] = diff_weights_data_dnn[i]; diff_shift_tensor->flat().data()[i] = - diff_weights_data_dnn[i + depth_]; + diff_weights_data_dnn[i + depth_]; } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } private: T epsilon_; TensorFormat tensor_format_; - int depth_; // batch normalization is done for per channel. + int depth_; // batch normalization is done for per channel. 
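// diff_weights_m above uses the same 2 x depth_ packing as the forward
// pass: the first depth_ entries hold d(scale) and the next depth_ entries
// hold d(shift), which is exactly how the copy loop splits them into
// diff_scale_tensor and diff_shift_tensor.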
bool is_training_; void ExtractParams(OpKernelContext* context) { - const Tensor& input = MklGetInput(context, 0); - depth_ = static_cast(GetTensorDim(input, tensor_format_, 'C')); + const Tensor& input = MklGetInput(context, 0); + depth_ = static_cast(GetTensorDim(input, tensor_format_, 'C')); } - void HandleEmptyInput(OpKernelContext* context, - TensorShape tf_shape_src, + void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src, TensorShape tf_shape_scale_shift, Tensor** diff_src_tensor) { const size_t kDiffSrcIndex = 0; @@ -1366,22 +1296,20 @@ class MklFusedBatchNormGradOp : public OpKernel { MklDnnShape dnn_shape_diff_src; dnn_shape_diff_src.SetMklTensor(false); AllocateOutputSetMklShape(context, kDiffSrcIndex, diff_src_tensor, - tf_shape_src, dnn_shape_diff_src); - for (size_t i=0; i < (*diff_src_tensor)->shape().num_elements(); i++) - (*diff_src_tensor)->flat().data()[i] = 0; + tf_shape_src, dnn_shape_diff_src); + for (size_t i = 0; i < (*diff_src_tensor)->shape().num_elements(); i++) + (*diff_src_tensor)->flat().data()[i] = 0; Tensor* diff_scale_tensor = nullptr; Tensor* diff_shift_tensor = nullptr; - AllocateTFOutputs(context, - tf_shape_scale_shift, - &diff_scale_tensor, + AllocateTFOutputs(context, tf_shape_scale_shift, &diff_scale_tensor, &diff_shift_tensor); } void AllocateTFOutputs(OpKernelContext* context, - TensorShape tf_shape_scale_shift, - Tensor** diff_scale_tensor, - Tensor** diff_shift_tensor) { + TensorShape tf_shape_scale_shift, + Tensor** diff_scale_tensor, + Tensor** diff_shift_tensor) { CHECK_NOTNULL(diff_scale_tensor); CHECK_NOTNULL(diff_shift_tensor); @@ -1396,31 +1324,29 @@ class MklFusedBatchNormGradOp : public OpKernel { AllocateOutputSetMklShape(context, kDiffScaleIndex, diff_scale_tensor, tf_shape_scale_shift, mkl_shape_diff_scale); CHECK_NOTNULL(*diff_scale_tensor); - for (size_t i=0; i < (*diff_scale_tensor)->shape().num_elements(); i++) - (*diff_scale_tensor)->flat().data()[i] = 0; + for (size_t i = 0; i < (*diff_scale_tensor)->shape().num_elements(); i++) + (*diff_scale_tensor)->flat().data()[i] = 0; MklDnnShape mkl_shape_diff_shift; mkl_shape_diff_shift.SetMklTensor(false); AllocateOutputSetMklShape(context, kDiffShiftIndex, diff_shift_tensor, tf_shape_scale_shift, mkl_shape_diff_shift); CHECK_NOTNULL(*diff_shift_tensor); - for (size_t i=0; i < (*diff_shift_tensor)->shape().num_elements(); i++) - (*diff_shift_tensor)->flat().data()[i] = 0; + for (size_t i = 0; i < (*diff_shift_tensor)->shape().num_elements(); i++) + (*diff_shift_tensor)->flat().data()[i] = 0; // Placeholders for estimated_mean and estimated_variance, which are // used for inference and thus not needed here for gradient computation. 
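// They are still allocated below, as dummy scalar-shaped tensors, purely so
// the op's output arity matches what callers expect; nothing on the
// gradient path reads them.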
- Tensor* p1_tensor = nullptr, *p2_tensor = nullptr; + Tensor *p1_tensor = nullptr, *p2_tensor = nullptr; MklDnnShape mkl_shape_p; mkl_shape_p.SetMklTensor(false); - AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, - TensorShape({}), mkl_shape_p); - AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, - TensorShape({}), mkl_shape_p); + AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, TensorShape({}), + mkl_shape_p); + AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, TensorShape({}), + mkl_shape_p); } - memory::dims GetMeanVarianceDims() { - return memory::dims({1, depth_}); - } + memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); } }; #endif diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc index 9ee27ee21c8..6c027f8e728 100644 --- a/tensorflow/core/kernels/mkl_identity_op.cc +++ b/tensorflow/core/kernels/mkl_identity_op.cc @@ -28,14 +28,14 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklIdentityOp : public OpKernel { diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc index 4b5f7b83100..acb0db57b38 100644 --- a/tensorflow/core/kernels/mkl_input_conversion_op.cc +++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc @@ -31,7 +31,7 @@ limitations under the License. #include "tensorflow/core/kernels/mkl_tfconv_op.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; @@ -59,7 +59,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; // convert the TF format input to MKL format /////////////////////////////////////////////////////////// -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklInputConversionOp : public OpKernel { public: @@ -271,8 +271,8 @@ class MklInputConversionOp : public OpKernel { MklDnnShape input_shape_1; GetMklShape(context, 1, &input_shape_1); - bool tf_shapes_are_same = context->input(0).shape() == - context->input(1).shape(); + bool tf_shapes_are_same = + context->input(0).shape() == context->input(1).shape(); VLOG(1) << "MklInputConversionOp: Input shapes are " << (tf_shapes_are_same ? 
"*same*" : "*different*") << ": " @@ -293,14 +293,56 @@ class MklInputConversionOp : public OpKernel { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // If both inputs are in MKL format if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) { - // If both have the same shape, pass them through if (tf_shapes_are_same) { - VLOG(1) << "MklInputConversionOp: No conversion needed, " - << "copying MKL inputs with identical shapes to output"; + auto input0_md = input_shape_0.GetMklLayout(); + auto input1_md = input_shape_1.GetMklLayout(); + + // If both have the same shape and same format, pass them through + if ( input0_md.data.format == input1_md.data.format) { + VLOG(1) << "MklInputConversionOp: No conversion needed, " + << "copying MKL inputs with identical shapes to output"; - ForwardMklTensorInToOut(context, 0, 0); - ForwardMklTensorInToOut(context, 1, 1); - return; + ForwardMklTensorInToOut(context, 0, 0); + ForwardMklTensorInToOut(context, 1, 1); + return; + } else { + VLOG(1) << "MklInputConversionOp: Shape is same, but format is different, " + << "need to convert to same format"; + + // Convert input0, and keep input1 unchanged + // Create MklDnnShape for output mkl tensor based on input0 + Tensor* tensor_out; + MklDnnShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(true); + mkl_output_mkl_shape.SetElemType(MklDnnType()); + mkl_output_mkl_shape.SetTfLayout(input_shape_0.GetDimension(), + input_shape_0.GetSizesAsMklDnnDims(), + input_shape_0.GetTfDataFormat()); + + // Get MKL layout from input1 as destination layout + mkl_output_mkl_shape.SetMklLayout(&input1_md); + + // Create output Mkl tensor for index 0 + AllocateOutputSetMklShape(context, 0, &tensor_out, + input_tensor_0.shape(), mkl_output_mkl_shape); + + // Create MklDnnData object for input0 tesnsor + auto cpu_engine = engine(engine::cpu, 0); + MklDnnData input(&cpu_engine); + input.SetUsrMem(input0_md, &input_tensor_0); + + // Create reorder from input0's layout to input1's layout + std::vector net; + CHECK_EQ(input.CheckReorderToOpMem(memory::primitive_desc( + input1_md, cpu_engine), + tensor_out, &net), + true); + stream(stream::kind::eager).submit(net).wait(); + + // Input1 will be passed through + ForwardMklTensorInToOut(context, 1, 1); + return; + } } // Sanity check @@ -400,9 +442,9 @@ class MklInputConversionOp : public OpKernel { // Create reorder between tensorflow layout and Mkl layout. std::vector net; - CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc( - output_mkl_md, cpu_engine), - tensor_out, &net), + CHECK_EQ(tf_input.CheckReorderToOpMem( + memory::primitive_desc(output_mkl_md, cpu_engine), + tensor_out, &net), true); stream(stream::kind::eager).submit(net).wait(); diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc index 95e0404ba8a..5f0a12a1fb9 100644 --- a/tensorflow/core/kernels/mkl_lrn_op.cc +++ b/tensorflow/core/kernels/mkl_lrn_op.cc @@ -22,6 +22,9 @@ limitations under the License. #define EIGEN_USE_THREADS #include +#include "mkl_dnn.h" +#include "mkl_dnn_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -30,20 +33,17 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/util/work_sharder.h" #endif -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::lrn_forward; -using mkldnn::lrn_backward; -using mkldnn::prop_kind; using mkldnn::lrn_across_channels; +using mkldnn::lrn_backward; +using mkldnn::lrn_forward; +using mkldnn::prop_kind; using mkldnn::stream; #endif @@ -67,7 +67,7 @@ void GetBandMatrix(int depth, int depth_radius, } // namespace -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklLRNOp : public OpKernel { @@ -77,10 +77,11 @@ class MklLRNOp : public OpKernel { explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); @@ -103,9 +104,10 @@ class MklLRNOp : public OpKernel { : input.dims(); OP_REQUIRES(context, mkl_context.in_dims == 4, errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); if (!input_in_mkl_format) { mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_, @@ -339,17 +341,17 @@ class MklLRNOp : public OpKernel { float beta_; }; - template class MklLRNGradOp : public OpKernel { public: explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); @@ -740,10 +742,11 @@ class MklLRNOp : public OpKernel { explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", 
&bias_)); @@ -773,10 +776,10 @@ class MklLRNOp : public OpKernel { if (!src_dnn_shape.IsMklTensor()) { MklDefaultToEigen(context, src_tensor); return; - } else if (!src_dnn_shape.IsMklChannelDim( - src_dnn_shape.GetDimension() - 1) ) { + } else if (!src_dnn_shape.IsMklChannelDim(src_dnn_shape.GetDimension() - + 1)) { Tensor converted_tensor = - ConvertMklToTF(context, src_tensor, src_dnn_shape); + ConvertMklToTF(context, src_tensor, src_dnn_shape); MklDefaultToEigen(context, converted_tensor); return; } @@ -807,18 +810,16 @@ class MklLRNOp : public OpKernel { // Create LRN primitive descriptor. // Tensorflow's normalization semantics is across channels. // MKL-DNN also supports normalization within channel. - auto lrn_desc = lrn_forward::desc(prop_kind::forward, - lrn_across_channels, + auto lrn_desc = lrn_forward::desc(prop_kind::forward, lrn_across_channels, src_dnn_data.GetUsrMemDesc(), - kernel_size, - new_alpha, beta_, bias_); + kernel_size, new_alpha, beta_, bias_); auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine); // Allocate output_dnn_data tensor. Tensor* output_tensor = nullptr; memory::format input_format = src_dnn_shape.GetTfDataFormat(); - AllocateOutputTensor(context, lrn_prim_desc, input_dims, - input_format, &output_tensor); + AllocateOutputTensor(context, lrn_prim_desc, input_dims, input_format, + &output_tensor); OP_REQUIRES_OK(context, context->status()); CHECK_NOTNULL(output_tensor); dst_dnn_data.SetUsrMemDataHandle(output_tensor); @@ -827,25 +828,23 @@ class MklLRNOp : public OpKernel { AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data); OP_REQUIRES_OK(context, context->status()); - PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, - &dst_dnn_data, &workspace_dnn_data); - } catch (mkldnn::error &e) { + PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, &dst_dnn_data, + &workspace_dnn_data); + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } private: - void PrepareAndExecuteNet( - const lrn_forward::primitive_desc& lrn_fwd_desc, - MklDnnData* src_dnn_data, - MklDnnData* dst_dnn_data, - MklDnnData* wksp_dnn_data = nullptr) { + void PrepareAndExecuteNet(const lrn_forward::primitive_desc& lrn_fwd_desc, + MklDnnData* src_dnn_data, + MklDnnData* dst_dnn_data, + MklDnnData* wksp_dnn_data = nullptr) { std::vector net; // Check for input reorder @@ -853,23 +852,21 @@ class MklLRNOp : public OpKernel { // Create pooling primitive and add it to net if (wksp_dnn_data != nullptr) { - net.push_back(lrn_forward(lrn_fwd_desc, - src_dnn_data->GetOpMem(), - wksp_dnn_data->GetOpMem(), - dst_dnn_data->GetOpMem())); + net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(), + wksp_dnn_data->GetOpMem(), + dst_dnn_data->GetOpMem())); } else { - net.push_back(lrn_forward(lrn_fwd_desc, - src_dnn_data->GetOpMem(), - dst_dnn_data->GetOpMem())); + net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(), + dst_dnn_data->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } - void AllocateOutputTensor(OpKernelContext* context, - const lrn_forward::primitive_desc& lrn_fwd_prim_desc, - const 
memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const lrn_forward::primitive_desc& lrn_fwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); memory::primitive_desc dst_pd = lrn_fwd_prim_desc.dst_primitive_desc(); @@ -880,111 +877,106 @@ class MklLRNOp : public OpKernel { output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; // only allocate enough space for the elements we need. size_t num_bytes = dst_pd.get_size(); CHECK_EQ(num_bytes % sizeof(T), 0); output_tf_shape.AddDim(num_bytes / sizeof(T)); - AllocateOutputSetMklShape(context, kIdxOutput, - output_tensor, - output_tf_shape, output_mkl_shape); - } + AllocateOutputSetMklShape(context, kIdxOutput, output_tensor, + output_tf_shape, output_mkl_shape); + } - // Fallback implementation - Taken from lrn_op.cc - // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a - // copy. - void MklDefaultToEigen(OpKernelContext* context, - const Tensor& input) { - const int batch = static_cast(input.dim_size(0)); - const int rows = static_cast(input.dim_size(1)); - const int cols = static_cast(input.dim_size(2)); - const int depth = static_cast(input.dim_size(3)); - const int nodes = cols * rows; + // Fallback implementation - Taken from lrn_op.cc + // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a + // copy. + void MklDefaultToEigen(OpKernelContext* context, const Tensor& input) { + const int batch = static_cast(input.dim_size(0)); + const int rows = static_cast(input.dim_size(1)); + const int cols = static_cast(input.dim_size(2)); + const int depth = static_cast(input.dim_size(3)); + const int nodes = cols * rows; - auto in_shaped = input.shaped({nodes * batch, depth}); - // Multiplying the input with the band matrix has the effect of reducing - // the - // correct patch along the depth. - Eigen::Tensor multiplier(depth, depth); - GetBandMatrix(depth, depth_radius_, &multiplier); + auto in_shaped = input.shaped({nodes * batch, depth}); + // Multiplying the input with the band matrix has the effect of reducing + // the + // correct patch along the depth. 
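// The Eigen fallback above implements the standard across-channel LRN,
// out(i, j) = in(i, j) / (bias + alpha * sum_k in(i, k)^2)^beta, where k ranges over a
// depth_radius window around channel j. A scalar sketch of the same formula, assuming
// float data and the attribute names used in this file (not code from the patch):

#include <algorithm>
#include <cmath>

float LrnAcrossChannels(const float* in, int depth, int j, int depth_radius,
                        float bias, float alpha, float beta) {
  int lo = std::max(0, j - depth_radius);
  int hi = std::min(depth, j + depth_radius + 1);
  float sum_sq = 0.0f;
  for (int k = lo; k < hi; ++k) sum_sq += in[k] * in[k];  // squared sum over the window
  return in[j] / std::pow(bias + alpha * sum_sq, beta);   // normalize the center element
}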
+ Eigen::Tensor multiplier(depth, depth); + GetBandMatrix(depth, depth_radius_, &multiplier); - Tensor *output_dnn_data = nullptr; - MklDnnShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(false); - mkl_output_mkl_shape.SetDimensions(4); - AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, - input.shape(), mkl_output_mkl_shape); - CHECK_NOTNULL(output_dnn_data); + Tensor* output_dnn_data = nullptr; + MklDnnShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + mkl_output_mkl_shape.SetDimensions(4); + AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, + input.shape(), mkl_output_mkl_shape); + CHECK_NOTNULL(output_dnn_data); - Tensor* workspace_tensor = nullptr; - MklDnnShape workspace_mkl_shape; - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(0); - AllocateOutputSetMklShape(context, kIdxWorkspace, - &workspace_tensor, + Tensor* workspace_tensor = nullptr; + MklDnnShape workspace_mkl_shape; + workspace_mkl_shape.SetMklTensor(false); + TensorShape workspace_tf_shape; + workspace_tf_shape.AddDim(0); + AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor, workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); + CHECK_NOTNULL(workspace_tensor); - auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); - Eigen::array dims = {{DimPair(1, 0)}}; - auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_; - if (beta_ == T(1)) { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * tmp.inverse(); - } else if (beta_ == T(0.5)) { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * tmp.rsqrt(); - } else { - out_shaped.device(context->eigen_cpu_device()) = - in_shaped * (tmp.log() * -beta_).exp(); - } + auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); + Eigen::array dims = {{DimPair(1, 0)}}; + auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_; + if (beta_ == T(1)) { + out_shaped.device(context->eigen_cpu_device()) = + in_shaped * tmp.inverse(); + } else if (beta_ == T(0.5)) { + out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt(); + } else { + out_shaped.device(context->eigen_cpu_device()) = + in_shaped * (tmp.log() * -beta_).exp(); } + } - void AllocateWorkspaceTensor(OpKernelContext* context, - const lrn_forward::primitive_desc& lrn_fwd_prim_desc, - MklDnnData* dnn_data_wksp) { - CHECK_NOTNULL(dnn_data_wksp); - Tensor* workspace_tensor = nullptr; - memory::primitive_desc workspace_pd - = lrn_fwd_prim_desc.workspace_primitive_desc(); - size_t workspace_bytes = workspace_pd.get_size(); - MklDnnShape workspace_mkl_shape; - // the workspace tensor is a uint8 tensor that has - // exactly the number of bytes necessary - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(workspace_bytes); - AllocateOutputSetMklShape(context, kIdxWorkspace, - &workspace_tensor, + void AllocateWorkspaceTensor( + OpKernelContext* context, + const lrn_forward::primitive_desc& lrn_fwd_prim_desc, + MklDnnData* dnn_data_wksp) { + CHECK_NOTNULL(dnn_data_wksp); + Tensor* workspace_tensor = nullptr; + memory::primitive_desc workspace_pd = + lrn_fwd_prim_desc.workspace_primitive_desc(); + size_t workspace_bytes = workspace_pd.get_size(); + MklDnnShape workspace_mkl_shape; + // the workspace tensor is a uint8 tensor that has + // exactly the number of bytes necessary + workspace_mkl_shape.SetMklTensor(false); + TensorShape 
workspace_tf_shape; + workspace_tf_shape.AddDim(workspace_bytes); + AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor, workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); - dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); - } + CHECK_NOTNULL(workspace_tensor); + dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); + } void SanityCheckInputs(OpKernelContext* context) { const Tensor& src_tensor = MklGetInput(context, kIdxInput); MklDnnShape src_dnn_shape; GetMklShape(context, kIdxInput, &src_dnn_shape); if (src_dnn_shape.IsMklTensor()) { - OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4, + errors::InvalidArgument("input must be 4-dimensional")); + OP_REQUIRES(context, + FastBoundsCheck(src_tensor.NumElements(), + std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); } else { - OP_REQUIRES(context, src_tensor.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("argument to LRN too large")); + OP_REQUIRES(context, src_tensor.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional")); + OP_REQUIRES(context, + FastBoundsCheck(src_tensor.NumElements(), + std::numeric_limits::max()), + errors::InvalidArgument("argument to LRN too large")); } } - const int kIdxInput = 0, - kIdxOutput = 0, - kIdxWorkspace = 1; + const int kIdxInput = 0, kIdxOutput = 0, kIdxWorkspace = 1; typedef typename Eigen::Tensor::DimensionPair DimPair; bool workspace_enabled_; @@ -994,17 +986,17 @@ class MklLRNOp : public OpKernel { float beta_; }; - template class MklLRNGradOp : public OpKernel { public: explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES(context, FastBoundsCheck(depth_radius64, - std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES( + context, + FastBoundsCheck(depth_radius64, std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); @@ -1025,7 +1017,7 @@ class MklLRNGradOp : public OpKernel { MklDnnData output_dnn_data(&cpu_engine); MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, - orig_output_dnn_shape; + orig_output_dnn_shape; GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); @@ -1037,16 +1029,16 @@ class MklLRNGradOp : public OpKernel { orig_input_dnn_shape.IsMklTensor() && orig_output_dnn_shape.IsMklTensor() && input_grad_dnn_shape.IsMklChannelDim( - input_grad_dnn_shape.GetDimension() - 1) && + input_grad_dnn_shape.GetDimension() - 1) && orig_input_dnn_shape.IsMklChannelDim( - orig_input_dnn_shape.GetDimension() - 1) && + orig_input_dnn_shape.GetDimension() - 1) && orig_output_dnn_shape.IsMklChannelDim( - 
orig_output_dnn_shape.GetDimension() - 1); + orig_output_dnn_shape.GetDimension() - 1); if (!can_use_mkldnn) { - // Fallback to eigen - MklDefaultToEigen(context); - return; + // Fallback to eigen + MklDefaultToEigen(context); + return; } // At this point, we have the all clear to use MklDnn constructs // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor. @@ -1059,13 +1051,11 @@ class MklLRNGradOp : public OpKernel { // NHWC format. memory::desc original_output_md = orig_output_dnn_shape.GetCurLayout(); memory::desc target_diff_dst_md = ConfigureInputGradient( - input_grad_tensor, - input_grad_dnn_shape, - &input_grad_dnn_data); + input_grad_tensor, input_grad_dnn_shape, &input_grad_dnn_data); memory::desc orig_input_md = orig_input_dnn_shape.GetCurLayout(); memory::dims orig_input_dims = - orig_input_dnn_shape.GetSizesAsMklDnnDims(); + orig_input_dnn_shape.GetSizesAsMklDnnDims(); orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor); orig_input_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc); @@ -1079,27 +1069,21 @@ class MklLRNGradOp : public OpKernel { // Create LRN backward primitive descriptor. It requires LRN forward // primitive descriptor also. - auto lrn_fwd_desc = lrn_forward::desc(prop_kind::forward, - lrn_across_channels, - orig_input_md, - kernel_size, - new_alpha, beta_, bias_); - auto lrn_fwd_prim_desc = lrn_forward::primitive_desc(lrn_fwd_desc, - cpu_engine); - auto lrn_bwd_desc = lrn_backward::desc(lrn_across_channels, - original_output_md, - target_diff_dst_md, - kernel_size, - new_alpha, beta_, bias_); - auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(lrn_bwd_desc, - cpu_engine, - lrn_fwd_prim_desc); + auto lrn_fwd_desc = lrn_forward::desc( + prop_kind::forward, lrn_across_channels, orig_input_md, kernel_size, + new_alpha, beta_, bias_); + auto lrn_fwd_prim_desc = + lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine); + auto lrn_bwd_desc = lrn_backward::desc( + lrn_across_channels, original_output_md, target_diff_dst_md, + kernel_size, new_alpha, beta_, bias_); + auto lrn_bwd_prim_desc = lrn_backward::primitive_desc( + lrn_bwd_desc, cpu_engine, lrn_fwd_prim_desc); Tensor* output_tensor = nullptr; - memory::format orig_input_format - = orig_input_dnn_shape.GetTfDataFormat(); - AllocateOutputTensor(context, lrn_bwd_prim_desc, - orig_input_dims, orig_input_format, &output_tensor); + memory::format orig_input_format = orig_input_dnn_shape.GetTfDataFormat(); + AllocateOutputTensor(context, lrn_bwd_prim_desc, orig_input_dims, + orig_input_format, &output_tensor); OP_REQUIRES_OK(context, context->status()); CHECK_NOTNULL(output_tensor); output_dnn_data.SetUsrMemDataHandle(output_tensor); @@ -1110,35 +1094,32 @@ class MklLRNGradOp : public OpKernel { const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace); MklDnnData workspace_dnn_data(&cpu_engine); ConfigureWorkspace(workspace_tensor, - lrn_fwd_prim_desc.workspace_primitive_desc(), - &workspace_dnn_data); + lrn_fwd_prim_desc.workspace_primitive_desc(), + &workspace_dnn_data); - PrepareAndExecuteNet(lrn_bwd_prim_desc, - lrn_fwd_prim_desc, - &orig_input_dnn_data, - &input_grad_dnn_data, - &output_dnn_data, - memory::primitive_desc(target_diff_dst_md, cpu_engine), - &workspace_dnn_data); - } catch (mkldnn::error &e) { + PrepareAndExecuteNet( + lrn_bwd_prim_desc, lrn_fwd_prim_desc, &orig_input_dnn_data, + &input_grad_dnn_data, &output_dnn_data, + memory::primitive_desc(target_diff_dst_md, cpu_engine), + &workspace_dnn_data); + } catch (mkldnn::error& e) { string 
error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } - void AllocateOutputTensor(OpKernelContext* context, - const lrn_backward::primitive_desc& lrn_bkwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const lrn_backward::primitive_desc& lrn_bkwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); - memory::primitive_desc dst_pd - = lrn_bkwd_prim_desc.diff_src_primitive_desc(); + memory::primitive_desc dst_pd = + lrn_bkwd_prim_desc.diff_src_primitive_desc(); MklDnnShape output_mkl_shape; // We assume that all outputs at this point are MKL Tensors @@ -1146,170 +1127,153 @@ class MklLRNGradOp : public OpKernel { output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; size_t num_bytes = dst_pd.get_size(); CHECK_EQ(num_bytes % sizeof(T), 0); output_tf_shape.AddDim(num_bytes / sizeof(T)); - AllocateOutputSetMklShape(context, kIdxOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kIdxOutput, output_tensor, + output_tf_shape, output_mkl_shape); } memory::desc ConfigureInputGradient(const Tensor& input_grad_tensor, - const MklDnnShape& input_grad_dnn_shape, - MklDnnData *input_grad_dnn_data) { + const MklDnnShape& input_grad_dnn_shape, + MklDnnData* input_grad_dnn_data) { CHECK_NOTNULL(input_grad_dnn_data); // This shouldn't be necessary at this point, but just in case CHECK_EQ(input_grad_dnn_shape.IsMklTensor(), true); memory::desc input_grad_md = input_grad_dnn_shape.GetCurLayout(); - memory::dims orig_input_dims = - input_grad_dnn_shape.GetSizesAsMklDnnDims(); + memory::dims orig_input_dims = input_grad_dnn_shape.GetSizesAsMklDnnDims(); input_grad_dnn_data->SetUsrMem(input_grad_md, &input_grad_tensor); input_grad_dnn_data->SetOpMemDesc(orig_input_dims, memory::format::nhwc); return input_grad_md; } void PrepareAndExecuteNet( - const lrn_backward::primitive_desc& lrn_bkwd_desc, - const lrn_forward::primitive_desc& lrn_fwd_desc, - MklDnnData* src_dnn_data, - MklDnnData* input_gradient_diff_dst, - MklDnnData* output_diff_src, - const memory::primitive_desc& target_diff_dst_pd, - const MklDnnData* workspace_dnn_data = nullptr) { + const lrn_backward::primitive_desc& lrn_bkwd_desc, + const lrn_forward::primitive_desc& lrn_fwd_desc, + MklDnnData* src_dnn_data, MklDnnData* input_gradient_diff_dst, + MklDnnData* output_diff_src, + const memory::primitive_desc& target_diff_dst_pd, + const MklDnnData* workspace_dnn_data = nullptr) { std::vector net; // Check for input reordering on the diff dst input input_gradient_diff_dst->CheckReorderToOpMem( - lrn_bkwd_desc.diff_dst_primitive_desc(), &net); + lrn_bkwd_desc.diff_dst_primitive_desc(), &net); // Check for input reordering on the original input - 
src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), - &net); + src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), &net); // Create pooling primitive and add it to net if (nullptr == workspace_dnn_data) { - net.push_back(lrn_backward(lrn_bkwd_desc, - src_dnn_data->GetOpMem(), - input_gradient_diff_dst->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(), + input_gradient_diff_dst->GetOpMem(), + output_diff_src->GetOpMem())); } else { - net.push_back(lrn_backward(lrn_bkwd_desc, - src_dnn_data->GetOpMem(), - input_gradient_diff_dst->GetOpMem(), - workspace_dnn_data->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(), + input_gradient_diff_dst->GetOpMem(), + workspace_dnn_data->GetOpMem(), + output_diff_src->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } void ConfigureWorkspace(const Tensor& workspace_tensor, - memory::primitive_desc workspace_pd, - MklDnnData *workspace_dnn_data) { + memory::primitive_desc workspace_pd, + MklDnnData* workspace_dnn_data) { CHECK_NOTNULL(workspace_dnn_data); workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); } - // Fallback implementation - Taken from lrn_op.cc - // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a - // copy. - void MklDefaultToEigen(OpKernelContext* context) { - Tensor input_gradient_tensor; - Tensor orig_input_tensor; - Tensor orig_output_tensor; + // Fallback implementation - Taken from lrn_op.cc + // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a + // copy. + void MklDefaultToEigen(OpKernelContext* context) { + Tensor input_gradient_tensor; + Tensor orig_input_tensor; + Tensor orig_output_tensor; - MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, - orig_output_dnn_shape; - GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); - GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); - GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); + MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, + orig_output_dnn_shape; + GetMklShape(context, kIdxGradient, &input_grad_dnn_shape); + GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape); + GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape); - if (input_grad_dnn_shape.IsMklTensor()) { - input_gradient_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxGradient), - input_grad_dnn_shape); - } else { - input_gradient_tensor = MklGetInput(context, kIdxGradient); - } + if (input_grad_dnn_shape.IsMklTensor()) { + input_gradient_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxGradient), input_grad_dnn_shape); + } else { + input_gradient_tensor = MklGetInput(context, kIdxGradient); + } - if (orig_input_dnn_shape.IsMklTensor()) { - orig_input_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxOrigInput), - orig_input_dnn_shape); - } else { - orig_input_tensor = MklGetInput(context, kIdxOrigInput); - } + if (orig_input_dnn_shape.IsMklTensor()) { + orig_input_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxOrigInput), orig_input_dnn_shape); + } else { + orig_input_tensor = MklGetInput(context, kIdxOrigInput); + } - if (orig_output_dnn_shape.IsMklTensor()) { - orig_output_tensor = - ConvertMklToTF(context, - MklGetInput(context, kIdxOrigOutput), - orig_output_dnn_shape); - } else { - orig_output_tensor = MklGetInput(context, kIdxOrigOutput); - } + if 
(orig_output_dnn_shape.IsMklTensor()) { + orig_output_tensor = ConvertMklToTF( + context, MklGetInput(context, kIdxOrigOutput), orig_output_dnn_shape); + } else { + orig_output_tensor = MklGetInput(context, kIdxOrigOutput); + } - const int64 batch = static_cast(input_gradient_tensor.dim_size(0)); - const int64 rows = static_cast(input_gradient_tensor.dim_size(1)); - const int64 cols = static_cast(input_gradient_tensor.dim_size(2)); - const int64 depth = static_cast(input_gradient_tensor.dim_size(3)); - const auto nodes = cols * rows; + const int64 batch = static_cast(input_gradient_tensor.dim_size(0)); + const int64 rows = static_cast(input_gradient_tensor.dim_size(1)); + const int64 cols = static_cast(input_gradient_tensor.dim_size(2)); + const int64 depth = static_cast(input_gradient_tensor.dim_size(3)); + const auto nodes = cols * rows; - auto grads_shaped = - input_gradient_tensor.shaped({nodes * batch, depth}); + auto grads_shaped = + input_gradient_tensor.shaped({nodes * batch, depth}); - auto in_shaped = orig_input_tensor.shaped({nodes * batch, depth}); - auto activations = - orig_output_tensor.shaped({nodes * batch, depth}); + auto in_shaped = orig_input_tensor.shaped({nodes * batch, depth}); + auto activations = orig_output_tensor.shaped({nodes * batch, depth}); - Tensor* output_dnn_data; - MklShape mkl_output_mkl_shape; - mkl_output_mkl_shape.SetMklTensor(false); - mkl_output_mkl_shape.SetDimensions(4); - AllocateOutputSetMklShape(context, kIdxOutput, - &output_dnn_data, - input_gradient_tensor.shape(), - mkl_output_mkl_shape); + Tensor* output_dnn_data; + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + mkl_output_mkl_shape.SetDimensions(4); + AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, + input_gradient_tensor.shape(), + mkl_output_mkl_shape); - auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); - out_shaped.setZero(); - auto shard = [this, activations, in_shaped, grads_shaped, out_shaped, - depth](int64 begin, int64 end) { - for (int64 i = begin; i < end; ++i) { - for (int64 j = 0; j < depth; ++j) { - int64 depth_begin = std::max(0, j - depth_radius_); - int64 depth_end = std::min(depth, j + depth_radius_ + 1); + auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); + out_shaped.setZero(); + auto shard = [this, activations, in_shaped, grads_shaped, out_shaped, + depth](int64 begin, int64 end) { + for (int64 i = begin; i < end; ++i) { + for (int64 j = 0; j < depth; ++j) { + int64 depth_begin = std::max(0, j - depth_radius_); + int64 depth_end = std::min(depth, j + depth_radius_ + 1); - T norm(0); - for (int64 k = depth_begin; k < depth_end; ++k) { - norm += in_shaped(i, k) * in_shaped(i, k); - } - norm = alpha_ * norm + bias_; - DCHECK_GT(norm, T(1e-6)); - for (int64 k = depth_begin; k < depth_end; ++k) { - T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) * - activations(i, j) / norm; - if (k == j) { - dyi += Eigen::numext::pow(norm, -beta_); - } - dyi *= grads_shaped(i, j); - const_cast::Tensor&>(out_shaped)(i, k) += - dyi; + T norm(0); + for (int64 k = depth_begin; k < depth_end; ++k) { + norm += in_shaped(i, k) * in_shaped(i, k); + } + norm = alpha_ * norm + bias_; + DCHECK_GT(norm, T(1e-6)); + for (int64 k = depth_begin; k < depth_end; ++k) { + T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) * + activations(i, j) / norm; + if (k == j) { + dyi += Eigen::numext::pow(norm, -beta_); } + dyi *= grads_shaped(i, j); + const_cast::Tensor&>(out_shaped)(i, k) += dyi; } } - }; - auto worker_threads = - 
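// The gradient lambda above is handed to Shard() from work_sharder.h just below, which
// splits [begin, end) ranges across the CPU thread pool. The general shape of that
// dispatch inside a TF kernel, with `total` and `cost_per_item` as hypothetical
// placeholders for the work size and per-item cycle estimate:
//
//   auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
//   auto shard = [&](int64 begin, int64 end) {
//     for (int64 i = begin; i < end; ++i) {
//       // process work item i
//     }
//   };
//   Shard(worker_threads.num_threads, worker_threads.workers,
//         total, cost_per_item, shard);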
*(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch, - depth * depth, shard); - } + } + }; + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch, + depth * depth, shard); + } void SanityCheckInputs(OpKernelContext* context) { const Tensor& input_gradient_tensor = MklGetInput(context, kIdxGradient); @@ -1317,59 +1281,59 @@ class MklLRNGradOp : public OpKernel { const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput); const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace); MklDnnShape in_grads_dnn_shape, in_image_dnn_shape, out_image_dnn_shape, - workspace_dnn_shape; + workspace_dnn_shape; GetMklShape(context, kIdxGradient, &in_grads_dnn_shape); GetMklShape(context, kIdxOrigInput, &in_image_dnn_shape); GetMklShape(context, kIdxOrigOutput, &out_image_dnn_shape); GetMklShape(context, kIdxWorkspace, &workspace_dnn_shape); if (in_grads_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, in_grads_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("Input gradient must be " - "4-dimensional")); + errors::InvalidArgument("Input gradient must be " + "4-dimensional")); } else { - OP_REQUIRES(context, input_gradient_tensor.dims() == 4, - errors::InvalidArgument("input gradient must be 4-dimensional")); + OP_REQUIRES( + context, input_gradient_tensor.dims() == 4, + errors::InvalidArgument("input gradient must be 4-dimensional")); } if (in_image_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, in_image_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("input images must be " - "4-dimensional")); + errors::InvalidArgument("input images must be " + "4-dimensional")); } else { OP_REQUIRES(context, orig_input_tensor.dims() == 4, errors::InvalidArgument("input images must be " - "4-dimensional")); + "4-dimensional")); } if (out_image_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, out_image_dnn_shape.GetDimension() == 4, - errors::InvalidArgument("Output image must be " - "4-dimensional")); + errors::InvalidArgument("Output image must be " + "4-dimensional")); } else { - OP_REQUIRES(context, orig_output_tensor.dims() == 4, - errors::InvalidArgument("Output image must be 4-dimensional")); + OP_REQUIRES( + context, orig_output_tensor.dims() == 4, + errors::InvalidArgument("Output image must be 4-dimensional")); } if (workspace_enabled_) { if (workspace_dnn_shape.IsMklTensor()) { - OP_REQUIRES(context, workspace_dnn_shape.IsMklTensor() == false, - errors::InvalidArgument("Workspace should not be MKL Tensor.")); + OP_REQUIRES( + context, workspace_dnn_shape.IsMklTensor() == false, + errors::InvalidArgument("Workspace should not be MKL Tensor.")); } else { OP_REQUIRES(context, workspace_tensor.dims() == 1, - errors::InvalidArgument("Workspace must be 1-dimensional")); + errors::InvalidArgument("Workspace must be 1-dimensional")); } } } -// Input("input_grads: T") -// Input("input_image: T") -// Input("output_image: T") -// Input("workspace: uint8") - const int kIdxGradient = 0, - kIdxOrigInput = 1, - kIdxOrigOutput = 2, - kIdxWorkspace = 3, - kIdxOutput = 0; + // Input("input_grads: T") + // Input("input_image: T") + // Input("output_image: T") + // Input("workspace: uint8") + const int kIdxGradient = 0, kIdxOrigInput = 1, kIdxOrigOutput = 2, + kIdxWorkspace = 3, kIdxOutput = 0; typedef typename Eigen::Tensor::DimensionPair DimPair; bool workspace_enabled_; @@ -1379,7 +1343,7 @@ class MklLRNGradOp : 
public OpKernel { float beta_; }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #define REGISTER_MKL_LRN_CPU(T) \ REGISTER_KERNEL_BUILDER(Name("_MklLRN") \ @@ -1393,7 +1357,6 @@ class MklLRNGradOp : public OpKernel { .Label(mkl_op_registry::kMklOpLabel), \ MklLRNGradOp); - TF_CALL_float(REGISTER_MKL_LRN_CPU); } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc index 82c5229bab0..14607f26e0c 100644 --- a/tensorflow/core/kernels/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc @@ -22,25 +22,25 @@ limitations under the License. #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include #include "mkldnn.hpp" -using mkldnn::memory; -using mkldnn::error; -using mkldnn::pooling_forward; -using mkldnn::pooling_backward; -using mkldnn::padding_kind; -using mkldnn::engine; -using mkldnn::prop_kind; using mkldnn::algorithm; +using mkldnn::engine; +using mkldnn::error; +using mkldnn::memory; +using mkldnn::padding_kind; +using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::prop_kind; #endif namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// For now, MKL-ML is default. So making MKL-DNN not a default choice. -#ifndef INTEL_MKL_DNN +// MKL-DNN is now default. MKL-ML must be specified explicitly. +#ifdef INTEL_MKL_ML // An implementation of MaxPooling (forward). template @@ -397,18 +397,19 @@ class MklMaxPoolingGradOp : public OpKernel { if (workspace_enabled == false) { if (convert_input != nullptr) { if (input_in_mkl_format == false) { - CHECK_EQ( - dnnConversionExecute_F32( - convert_input, const_cast(static_cast( - tensor_in.flat().data())), - input_buf), - E_SUCCESS); + CHECK_EQ(dnnConversionExecute_F32( + convert_input, + const_cast(static_cast( + tensor_in.flat().data())), + input_buf), + E_SUCCESS); CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS); convert_input = nullptr; } else { input_shape.GetConvertedFlatData( - lt_input_prim, const_cast(static_cast( - tensor_in.flat().data())), + lt_input_prim, + const_cast( + static_cast(tensor_in.flat().data())), input_buf); } pooling_resfwd[dnnResourceSrc] = input_buf; @@ -453,8 +454,9 @@ class MklMaxPoolingGradOp : public OpKernel { CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS); } else { output_backprop_shape.GetConvertedFlatData( - lt_outbackprop_prim, const_cast(static_cast( - out_backprop.flat().data())), + lt_outbackprop_prim, + const_cast( + static_cast(out_backprop.flat().data())), outbackprop_buf); } pooling_res[dnnResourceDiffDst] = outbackprop_buf; @@ -492,14 +494,14 @@ class MklMaxPoolingGradOp : public OpKernel { bool workspace_enabled_; }; // MklMaxPoolingGradOp -#else // INTEL_MKL_DNN is defined +#else // An implementation of MaxPooling (forward). template class MklMaxPoolingOp : public MklPoolingForwardOpBase { public: explicit MklMaxPoolingOp(OpKernelConstruction* context) - : MklPoolingForwardOpBase(context) { + : MklPoolingForwardOpBase(context) { // In Max Pooling, MKLDNN does not allow passing workspace as NULL. // So we set workspace_enabled_ to true. 
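// The forward pooling descriptor assembled just below takes strides, kernel, and both
// paddings as memory::dims in NCHW order. Stripped of the TF plumbing, and assuming
// hypothetical src_md/dst_md memory descriptors, a cpu_engine, and 2x2/stride-2 max
// pooling with no padding (none of these names come from the patch), it reduces to:
//
//   auto pool_desc = pooling_forward::desc(
//       prop_kind::forward, algorithm::pooling_max, src_md, dst_md,
//       memory::dims({2, 2}),   // strides
//       memory::dims({2, 2}),   // kernel
//       memory::dims({0, 0}),   // padding (top, left)
//       memory::dims({0, 0}),   // padding (bottom, right)
//       padding_kind::zero);
//   auto pool_fwd_pd = pooling_forward::primitive_desc(pool_desc, cpu_engine);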
this->workspace_enabled_ = true; @@ -508,8 +510,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { void Compute(OpKernelContext* context) override { try { auto cpu_engine = engine(engine::cpu, 0); - const Tensor& input_tensor = MklGetInput(context, - this->kInputTensorIndexInput); + const Tensor& input_tensor = + MklGetInput(context, this->kInputTensorIndexInput); MklDnnShape dnn_shape_input; GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input); this->SanityCheckInput(context, input_tensor, dnn_shape_input); @@ -522,9 +524,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // initialize variables for the pooling op MklPoolParameters pool_params; // Get the input tensor and initialize the pooling parameters - this->ConfigureInput(context, dnn_shape_input, - input_tensor, &pool_params, - &dnn_data_input); + this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params, + &dnn_data_input); OP_REQUIRES_OK(context, context->status()); // Declare output tensor @@ -535,9 +536,10 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // If input is in Mkl layout, then just get the memory format from it // directly, instead of using input data_format to MaxPool. if (dnn_shape_input.IsMklTensor()) { - dnn_data_output.SetUsrMem(output_dims_mkl_order, - static_cast( - dnn_data_input.GetUsrMemDesc().data.format)); + dnn_data_output.SetUsrMem( + output_dims_mkl_order, + static_cast( + dnn_data_input.GetUsrMemDesc().data.format)); } else { dnn_data_output.SetUsrMem(output_dims_mkl_order, this->data_format_mkldnn_); @@ -546,24 +548,21 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // describe the memory layout; let mkl-dnn choose the best for the op dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any); - auto pool_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_max, - dnn_data_input.GetUsrMemDesc(), - dnn_data_output.GetUsrMemDesc(), - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc, - cpu_engine); + auto pool_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_max, + dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(), + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_desc = + pooling_forward::primitive_desc(pool_desc, cpu_engine); this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order, - this->data_format_mkldnn_, &output_tensor); + this->data_format_mkldnn_, &output_tensor); OP_REQUIRES_OK(context, context->status()); dnn_data_output.SetUsrMemDataHandle(output_tensor); @@ -571,39 +570,38 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { OP_REQUIRES_OK(context, context->status()); this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input, - &dnn_data_output, &dnn_data_wksp); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + 
std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + &dnn_data_output, &dnn_data_wksp); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute private: - const int kOutputTensorIndexWorkspace = 1; + const int kOutputTensorIndexWorkspace = 1; - void AllocateWorkspaceTensor(OpKernelContext* context, - const pooling_forward::primitive_desc& pool_fwd_prim_desc, - MklDnnData* dnn_data_wksp) { - CHECK_NOTNULL(dnn_data_wksp); - Tensor* workspace_tensor = nullptr; - memory::primitive_desc workspace_pd - = pool_fwd_prim_desc.workspace_primitive_desc(); - size_t workspace_bytes = workspace_pd.get_size(); - MklDnnShape workspace_mkl_shape; - workspace_mkl_shape.SetMklTensor(false); - TensorShape workspace_tf_shape; - workspace_tf_shape.AddDim(workspace_bytes); - AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace, - &workspace_tensor, - workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); - dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); - } + void AllocateWorkspaceTensor( + OpKernelContext* context, + const pooling_forward::primitive_desc& pool_fwd_prim_desc, + MklDnnData* dnn_data_wksp) { + CHECK_NOTNULL(dnn_data_wksp); + Tensor* workspace_tensor = nullptr; + memory::primitive_desc workspace_pd = + pool_fwd_prim_desc.workspace_primitive_desc(); + size_t workspace_bytes = workspace_pd.get_size(); + MklDnnShape workspace_mkl_shape; + workspace_mkl_shape.SetMklTensor(false); + TensorShape workspace_tf_shape; + workspace_tf_shape.AddDim(workspace_bytes); + AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace, + &workspace_tensor, workspace_tf_shape, + workspace_mkl_shape); + CHECK_NOTNULL(workspace_tensor); + dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); + } }; // The operation to compute MaxPool gradients. 
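// The gradient op that follows needs two extra ingredients on top of the forward case:
// the forward primitive_desc, which MKL-DNN requires as a hint when building the backward
// primitive_desc, and the workspace tensor produced by the forward pass. A sketch with the
// same hypothetical geometry as above, assuming diff_src_md/diff_dst_md descriptors and a
// pool_fwd_pd built as in the forward sketch (names not from the patch):
//
//   auto pool_bwd_desc = pooling_backward::desc(
//       algorithm::pooling_max, diff_src_md, diff_dst_md,
//       memory::dims({2, 2}), memory::dims({2, 2}),
//       memory::dims({0, 0}), memory::dims({0, 0}), padding_kind::zero);
//   auto pool_bwd_pd =
//       pooling_backward::primitive_desc(pool_bwd_desc, cpu_engine, pool_fwd_pd);
//
//   // The backward primitive built from pool_bwd_pd then consumes the forward workspace
//   // in addition to the incoming gradient, which is why MaxPoolGrad reads a fourth input.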
@@ -616,221 +614,186 @@ template class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { public: explicit MklMaxPoolingGradOp(OpKernelConstruction* context) - : MklPoolingBackwardOpBase(context) { - } + : MklPoolingBackwardOpBase(context) {} void Compute(OpKernelContext* context) override { try { - auto cpu_engine = engine(engine::cpu, 0); - const Tensor& orig_input_tensor = MklGetInput(context, - kInputTensorIndexOrigInput); - const Tensor& orig_output_tensor = MklGetInput(context, - kInputTensorIndexOrigOutput); - const Tensor& grad_tensor = MklGetInput(context, - kInputTensorIndexGradient); - const Tensor& workspace_tensor = MklGetInput(context, - kInputTensorIndexWorkspace); - MklDnnShape orig_input_mkl_shape, - orig_output_mkl_shape, - grad_mkl_shape, - workspace_mkl_shape; - GetMklShape(context, kInputTensorIndexOrigInput, - &orig_input_mkl_shape); - GetMklShape(context, kInputTensorIndexOrigOutput, - &orig_output_mkl_shape); - GetMklShape(context, kInputTensorIndexGradient, - &grad_mkl_shape); - GetMklShape(context, kInputTensorIndexWorkspace, - &workspace_mkl_shape); + auto cpu_engine = engine(engine::cpu, 0); + const Tensor& orig_input_tensor = + MklGetInput(context, kInputTensorIndexOrigInput); + const Tensor& orig_output_tensor = + MklGetInput(context, kInputTensorIndexOrigOutput); + const Tensor& grad_tensor = + MklGetInput(context, kInputTensorIndexGradient); + const Tensor& workspace_tensor = + MklGetInput(context, kInputTensorIndexWorkspace); + MklDnnShape orig_input_mkl_shape, orig_output_mkl_shape, grad_mkl_shape, + workspace_mkl_shape; + GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape); + GetMklShape(context, kInputTensorIndexOrigOutput, &orig_output_mkl_shape); + GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape); + GetMklShape(context, kInputTensorIndexWorkspace, &workspace_mkl_shape); - SanityCheckInputs(context, - orig_input_tensor, orig_output_tensor, - grad_tensor, workspace_tensor, - orig_input_mkl_shape, orig_output_mkl_shape, - grad_mkl_shape, workspace_mkl_shape); - if (!context->status().ok()) return; + SanityCheckInputs(context, orig_input_tensor, orig_output_tensor, + grad_tensor, workspace_tensor, orig_input_mkl_shape, + orig_output_mkl_shape, grad_mkl_shape, + workspace_mkl_shape); + if (!context->status().ok()) return; - MklDnnData grad_dnn_data(&cpu_engine); - MklDnnData workspace_dnn_data(&cpu_engine); - MklDnnData output_dnn_data(&cpu_engine); - Tensor* output_tensor = nullptr; - MklPoolParameters pool_params; - TensorShape orig_input_shape; - memory::dims output_dims_mkl_order, orig_input_dims_mkl_order; - memory::desc original_input_md = ConfigureOriginalInput(context, - orig_input_tensor, - orig_input_mkl_shape, - &orig_input_dims_mkl_order, - &pool_params, - &orig_input_shape); + MklDnnData grad_dnn_data(&cpu_engine); + MklDnnData workspace_dnn_data(&cpu_engine); + MklDnnData output_dnn_data(&cpu_engine); + Tensor* output_tensor = nullptr; + MklPoolParameters pool_params; + TensorShape orig_input_shape; + memory::dims output_dims_mkl_order, orig_input_dims_mkl_order; + memory::desc original_input_md = ConfigureOriginalInput( + context, orig_input_tensor, orig_input_mkl_shape, + &orig_input_dims_mkl_order, &pool_params, &orig_input_shape); - memory::desc original_output_md = this->ConfigureOriginalOutput( - pool_params, - orig_output_mkl_shape, - output_dims_mkl_order); + memory::desc original_output_md = this->ConfigureOriginalOutput( + pool_params, orig_output_mkl_shape, output_dims_mkl_order); - 
memory::desc target_diff_dst_md = this->ConfigureInputGradient( - grad_mkl_shape, - grad_tensor, - &grad_dnn_data, - original_output_md); + memory::desc target_diff_dst_md = this->ConfigureInputGradient( + grad_mkl_shape, grad_tensor, &grad_dnn_data, original_output_md); - output_dnn_data.SetUsrMem(original_input_md); + output_dnn_data.SetUsrMem(original_input_md); - // Create the forward pooling primitive descriptor so we can - // pass it as a hint to the backward pooling primitive descriptor - auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward, - algorithm::pooling_max, - original_input_md, - original_output_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_fwd_prim_desc - = pooling_forward::primitive_desc(pool_fwd_desc, - cpu_engine); + // Create the forward pooling primitive descriptor so we can + // pass it as a hint to the backward pooling primitive descriptor + auto pool_fwd_desc = pooling_forward::desc( + prop_kind::forward, algorithm::pooling_max, original_input_md, + original_output_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_fwd_prim_desc = + pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine); - auto pool_bkwd_desc = pooling_backward::desc( - algorithm::pooling_max, - output_dnn_data.GetUsrMemDesc(), - target_diff_dst_md, - memory::dims({ pool_params.row_stride, - pool_params.col_stride}), - memory::dims({ pool_params.window_rows, - pool_params.window_cols}), - memory::dims({ static_cast(pool_params.pad_top), - static_cast(pool_params.pad_left)}), - memory::dims({ static_cast(pool_params.pad_bottom), - static_cast(pool_params.pad_right)}), - TFPaddingToMklDnnPadding(this->padding_)); - auto pool_bkwd_prim_desc - = pooling_backward::primitive_desc(pool_bkwd_desc, - cpu_engine, - pool_fwd_prim_desc); + auto pool_bkwd_desc = pooling_backward::desc( + algorithm::pooling_max, output_dnn_data.GetUsrMemDesc(), + target_diff_dst_md, + memory::dims({pool_params.row_stride, pool_params.col_stride}), + memory::dims({pool_params.window_rows, pool_params.window_cols}), + memory::dims({static_cast(pool_params.pad_top), + static_cast(pool_params.pad_left)}), + memory::dims({static_cast(pool_params.pad_bottom), + static_cast(pool_params.pad_right)}), + TFPaddingToMklDnnPadding(this->padding_)); + auto pool_bkwd_prim_desc = pooling_backward::primitive_desc( + pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc); - this->AllocateOutputTensor(context, pool_bkwd_prim_desc, - orig_input_dims_mkl_order, - this->data_format_mkldnn_, - &output_tensor); - output_dnn_data.SetUsrMemDataHandle(output_tensor); + this->AllocateOutputTensor(context, pool_bkwd_prim_desc, + orig_input_dims_mkl_order, + this->data_format_mkldnn_, &output_tensor); + output_dnn_data.SetUsrMemDataHandle(output_tensor); - ConfigureWorkspace(workspace_tensor, - pool_fwd_prim_desc.workspace_primitive_desc(), - &workspace_dnn_data); - 
this->PrepareAndExecuteNet(pool_bkwd_prim_desc, - &grad_dnn_data, - &output_dnn_data, - memory::primitive_desc( - target_diff_dst_md, - cpu_engine), - &workspace_dnn_data); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Compute received an exception:", - error_msg)); + ConfigureWorkspace(workspace_tensor, + pool_fwd_prim_desc.workspace_primitive_desc(), + &workspace_dnn_data); + this->PrepareAndExecuteNet( + pool_bkwd_prim_desc, &grad_dnn_data, &output_dnn_data, + memory::primitive_desc(target_diff_dst_md, cpu_engine), + &workspace_dnn_data); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", + error_msg)); } } // Compute private: - // .Input("orig_input: T") - // .Input("orig_output: T") - // .Input("grad: T") - // .Input("workspace: T") - const int kInputTensorIndexOrigInput = 0; - const int kInputTensorIndexOrigOutput = 1; - const int kInputTensorIndexGradient = 2; - const int kInputTensorIndexWorkspace = 3; - // Output("output: T") in Base Class + // .Input("orig_input: T") + // .Input("orig_output: T") + // .Input("grad: T") + // .Input("workspace: T") + const int kInputTensorIndexOrigInput = 0; + const int kInputTensorIndexOrigOutput = 1; + const int kInputTensorIndexGradient = 2; + const int kInputTensorIndexWorkspace = 3; + // Output("output: T") in Base Class - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_mkl_order, - MklPoolParameters* pool_params, - TensorShape* input_tensor_shape) { - *input_tensor_shape = tensor_original_input.shape(); - return MklPoolingBackwardOpBase::ConfigureOriginalInput( - context, - tensor_original_input, - original_input_mkl_shape, - original_input_dims_mkl_order, - pool_params, - *input_tensor_shape); + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input, + const MklDnnShape& original_input_mkl_shape, + memory::dims* original_input_dims_mkl_order, + MklPoolParameters* pool_params, TensorShape* input_tensor_shape) { + *input_tensor_shape = tensor_original_input.shape(); + return MklPoolingBackwardOpBase::ConfigureOriginalInput( + context, tensor_original_input, original_input_mkl_shape, + original_input_dims_mkl_order, pool_params, *input_tensor_shape); + } + + void ConfigureWorkspace(const Tensor& workspace_tensor, + memory::primitive_desc workspace_pd, + MklDnnData* workspace_dnn_data) { + CHECK_NOTNULL(workspace_dnn_data); + + workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); + } + + void SanityCheckInputs(OpKernelContext* context, + const Tensor& orig_input_tensor, + const Tensor& orig_output_tensor, + const Tensor& grad_tensor, + const Tensor& workspace_tensor, + const MklDnnShape& orig_input_mkl_shape, + const MklDnnShape& orig_output_mkl_shape, + const MklDnnShape& grad_mkl_shape, + const MklDnnShape& workspace_mkl_shape) { + if (!orig_input_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, orig_input_tensor.dims() == 4, + errors::InvalidArgument("Original input shape must be " + "4-dimensional")); + } else { + OP_REQUIRES(context, 
orig_input_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Original input shape must be " + "4-dimensional")); } - - void ConfigureWorkspace(const Tensor& workspace_tensor, - memory::primitive_desc workspace_pd, - MklDnnData *workspace_dnn_data) { - CHECK_NOTNULL(workspace_dnn_data); - - workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); + if (!orig_output_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, orig_output_tensor.dims() == 4, + errors::InvalidArgument("Original output must be " + "4-dimensional")); + } else { + OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Original output must be " + "4-dimensional")); } - - void SanityCheckInputs(OpKernelContext* context, - const Tensor& orig_input_tensor, - const Tensor& orig_output_tensor, - const Tensor& grad_tensor, - const Tensor& workspace_tensor, - const MklDnnShape& orig_input_mkl_shape, - const MklDnnShape& orig_output_mkl_shape, - const MklDnnShape& grad_mkl_shape, - const MklDnnShape& workspace_mkl_shape) { - if (!orig_input_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, orig_input_tensor.dims() == 4, - errors::InvalidArgument("Original input shape must be " - "4-dimensional")); - } else { - OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Original input shape must be " - "4-dimensional")); - } - if (!orig_output_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, orig_output_tensor.dims() == 4, - errors::InvalidArgument("Original output must be " - "4-dimensional")); - } else { - OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Original output must be " - "4-dimensional")); - } - if (!grad_mkl_shape.IsMklTensor()) { - OP_REQUIRES(context, grad_tensor.dims() == 4, - errors::InvalidArgument("Gradient must be 4-dimensional")); - } else { - OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Gradient must be " - "4-dimensional")); - } - if (this->workspace_enabled_) { - // The workspace should not be an MKL tensor - OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false, - errors::InvalidArgument("Workspace tensor should not" - " be an MKL Tensor.")); - // It should only have one dimension - OP_REQUIRES(context, workspace_tensor.dims() == 1, - errors::InvalidArgument("Workspace tensor must be " - "1-dimensional")); - } else { - OP_REQUIRES(context, this->workspace_enabled_, - errors::Unimplemented("MKL-DNN Max Pooling does not " + if (!grad_mkl_shape.IsMklTensor()) { + OP_REQUIRES(context, grad_tensor.dims() == 4, + errors::InvalidArgument("Gradient must be 4-dimensional")); + } else { + OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Gradient must be " + "4-dimensional")); + } + if (this->workspace_enabled_) { + // The workspace should not be an MKL tensor + OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false, + errors::InvalidArgument("Workspace tensor should not" + " be an MKL Tensor.")); + // It should only have one dimension + OP_REQUIRES(context, workspace_tensor.dims() == 1, + errors::InvalidArgument("Workspace tensor must be " + "1-dimensional")); + } else { + OP_REQUIRES( + context, this->workspace_enabled_, + errors::Unimplemented("MKL-DNN Max Pooling does not " "yet support the use case " "where MaxPoolGrad is called without first" " calling MaxPool.")); - } } + } }; // MklMaxPoolingGradOp -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML REGISTER_KERNEL_BUILDER(Name("_MklMaxPool") 
.Device(DEVICE_CPU) diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index f7cadffd39c..5ef6ce2a578 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -15,9 +15,9 @@ limitations under the License. #ifdef INTEL_MKL -#include -#include #include "tensorflow/core/kernels/mkl_pooling_ops_common.h" +#include +#include #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/kernels/bounds_check.h" @@ -42,7 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context, Init(context, ksize, stride, padding, data_format); } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // Initialization for MKL format void MklPoolParameters::Init(OpKernelContext* context, const std::vector& ksize, @@ -72,7 +72,7 @@ void MklPoolParameters::Init(OpKernelContext* context, Init(context, ksize, stride, padding, data_format); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML // Common Initialization for TensorFlow and MKL formats void MklPoolParameters::Init(OpKernelContext* context, const std::vector& ksize, @@ -107,21 +107,21 @@ void MklPoolParameters::Init(OpKernelContext* context, OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( tensor_in_cols, window_cols, col_stride, padding, &out_width, &pad_left, &pad_right)); -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // TF can work with int64, but mkldnn only supports int32 // Fail if the height or width are greater than MAX_INT - OP_REQUIRES(context, FastBoundsCheck(out_height, - std::numeric_limits::max()), + OP_REQUIRES(context, + FastBoundsCheck(out_height, std::numeric_limits::max()), errors::InvalidArgument("output height is too large")); - OP_REQUIRES(context, FastBoundsCheck(out_width, - std::numeric_limits::max()), + OP_REQUIRES(context, + FastBoundsCheck(out_width, std::numeric_limits::max()), errors::InvalidArgument("output width is too large")); #endif out_depth = depth; // output will have the same depth as the input - } else { // we are pooling in the depth dimension + } else { // we are pooling in the depth dimension // Our current version of depthwise max pooling does not support // any padding, and expects the depth_window to equal the depth // stride (no overlapping). diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index b974b2c59af..279167aba24 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -17,16 +17,16 @@ limitations under the License. 
#define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_ #ifdef INTEL_MKL -#include #include +#include #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/padding.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::memory; -using mkldnn::pooling_forward; using mkldnn::pooling_backward; +using mkldnn::pooling_forward; using mkldnn::stream; #endif @@ -61,19 +61,31 @@ struct MklPoolParameters { TensorFormat data_format; MklPoolParameters() - : depth(0) - , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0) - , window_rows(0), window_cols(0), depth_window(0) - , row_stride(0), col_stride(0), depth_stride(0) - , out_height(0), out_width(0), out_depth(0) - , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0) - , data_format(TensorFormat::FORMAT_NCHW) {} + : depth(0), + tensor_in_cols(0), + tensor_in_rows(0), + tensor_in_batch(0), + window_rows(0), + window_cols(0), + depth_window(0), + row_stride(0), + col_stride(0), + depth_stride(0), + out_height(0), + out_width(0), + out_depth(0), + pad_left(0), + pad_right(0), + pad_top(0), + pad_bottom(0), + pad_depth(0), + data_format(TensorFormat::FORMAT_NCHW) {} // Updates context->status if there is an invalid input. void Init(OpKernelContext* context, const std::vector& ksize, const std::vector& stride, Padding padding, TensorFormat data_format, const TensorShape& tensor_in_shape); -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Init(OpKernelContext* context, const std::vector& ksize, const std::vector& stride, Padding padding, TensorFormat data_format, const MklShape* mkl_in_shape); @@ -90,39 +102,37 @@ struct MklPoolParameters { TensorFormat data_format); }; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template class MklPoolingOpBase : public OpKernel { public: explicit MklPoolingOpBase(OpKernelConstruction* context) - : OpKernel(context) - , workspace_enabled_(false) { - string data_format; - OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); - OP_REQUIRES(context, - FormatFromString(data_format, &this->data_format_tf_), - errors::InvalidArgument("Invalid data format")); - this->data_format_mkldnn_ - = TFDataFormatToMklDnnDataFormat(this->data_format_tf_); - OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_)); - OP_REQUIRES(context, this->ksize_.size() == 4, - errors::InvalidArgument("Sliding window ksize field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_)); - OP_REQUIRES(context, this->stride_.size() == 4, - errors::InvalidArgument("Sliding window strides field must " - "specify 4 dimensions")); - OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_)); - OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, - errors::Unimplemented("Pooling is not yet supported on the " - "batch dimension.")); + : OpKernel(context), workspace_enabled_(false) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_), + errors::InvalidArgument("Invalid data format")); + this->data_format_mkldnn_ = + TFDataFormatToMklDnnDataFormat(this->data_format_tf_); + OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_)); + OP_REQUIRES(context, this->ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_)); + OP_REQUIRES(context, 
this->stride_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_)); + OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1, + errors::Unimplemented("Pooling is not yet supported on the " + "batch dimension.")); - // We may not get this attribute for this node if it does not go through - // graph rewrite pass. So we do not check for error while retrieving this - // attribute value. - context->GetAttr("workspace_enabled", &this->workspace_enabled_); - } + // We may not get this attribute for this node if it does not go through + // graph rewrite pass. So we do not check for error while retrieving this + // attribute value. + context->GetAttr("workspace_enabled", &this->workspace_enabled_); + } void Compute(OpKernelContext* context) override = 0; protected: @@ -132,24 +142,24 @@ class MklPoolingOpBase : public OpKernel { // output height and output width to have already been int32 // bounds-checked void GetOutputDims(const MklPoolParameters& mkl_pool_params, - memory::dims* output_dims_mkl_order) { + memory::dims* output_dims_mkl_order) { // MKL-DNN always needs output in NCHW format. - *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch, + *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch, mkl_pool_params.out_depth, static_cast(mkl_pool_params.out_height), static_cast(mkl_pool_params.out_width)}; } void InitMklPoolParameters(OpKernelContext* context, - MklPoolParameters* pool_params, - const MklDnnShape& original_input_mkl_shape, - const TensorShape& input_tensor_shape) { + MklPoolParameters* pool_params, + const MklDnnShape& original_input_mkl_shape, + const TensorShape& input_tensor_shape) { if (!original_input_mkl_shape.IsMklTensor()) { pool_params->Init(context, this->ksize_, this->stride_, this->padding_, - this->data_format_tf_, input_tensor_shape); + this->data_format_tf_, input_tensor_shape); } else { pool_params->Init(context, this->ksize_, this->stride_, this->padding_, - this->data_format_tf_, &original_input_mkl_shape); + this->data_format_tf_, &original_input_mkl_shape); } } @@ -159,13 +169,12 @@ class MklPoolingOpBase : public OpKernel { size_t GetNumTElements(const memory::primitive_desc& pd) { size_t num_bytes = pd.get_size(); size_t ret_val = num_bytes / sizeof(T); - if ( num_bytes % sizeof(T) != 0 ) { - ret_val++; + if (num_bytes % sizeof(T) != 0) { + ret_val++; } return ret_val; } - std::vector ksize_; std::vector stride_; Padding padding_; @@ -183,30 +192,29 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { protected: void ConfigureInput(OpKernelContext* context, - const MklDnnShape& input_mkl_shape, - const Tensor& input_tensor, - MklPoolParameters* pool_params, - MklDnnData* dnn_data_input) { + const MklDnnShape& input_mkl_shape, + const Tensor& input_tensor, + MklPoolParameters* pool_params, + MklDnnData* dnn_data_input) { CHECK_NOTNULL(pool_params); CHECK_NOTNULL(dnn_data_input); TensorShape input_tensor_shape = input_tensor.shape(); - memory::desc input_md = input_mkl_shape.IsMklTensor() - ? input_mkl_shape.GetMklLayout() - : memory::desc( - TFShapeToMklDnnDimsInNCHW( - input_tensor_shape, this->data_format_tf_), - MklDnnType(), - this->data_format_mkldnn_); + memory::desc input_md = + input_mkl_shape.IsMklTensor() + ? 
input_mkl_shape.GetMklLayout() + : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_); dnn_data_input->SetUsrMem(input_md, &input_tensor); - this->InitMklPoolParameters(context, pool_params, - input_mkl_shape, input_tensor_shape); + this->InitMklPoolParameters(context, pool_params, input_mkl_shape, + input_tensor_shape); } - void AllocateOutputTensor(OpKernelContext* context, - const pooling_forward::primitive_desc& pool_fwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const pooling_forward::primitive_desc& pool_fwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc(); @@ -215,50 +223,42 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; // only allocate enough space for the elements we need. output_tf_shape.AddDim(this->GetNumTElements(dst_pd)); - AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor, + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(*output_tensor); } void PrepareAndExecuteNet( - const pooling_forward::primitive_desc& pool_fwd_desc, - const MklDnnData* src, - MklDnnData* dst, - MklDnnData* wksp = nullptr) { + const pooling_forward::primitive_desc& pool_fwd_desc, + const MklDnnData* src, MklDnnData* dst, + MklDnnData* wksp = nullptr) { std::vector net; // Create pooling primitive and add it to net if (wksp != nullptr) { - net.push_back(pooling_forward(pool_fwd_desc, - src->GetOpMem(), - dst->GetOpMem(), - wksp->GetOpMem())); + net.push_back(pooling_forward(pool_fwd_desc, src->GetOpMem(), + dst->GetOpMem(), wksp->GetOpMem())); } else { - net.push_back(pooling_forward(pool_fwd_desc, - src->GetOpMem(), - dst->GetOpMem())); + net.push_back( + pooling_forward(pool_fwd_desc, src->GetOpMem(), dst->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } - - void SanityCheckInput(OpKernelContext* context, - const Tensor& input_tensor, - const MklDnnShape& input_mkl_shape) { + void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor, + const MklDnnShape& input_mkl_shape) { if (!input_mkl_shape.IsMklTensor()) { OP_REQUIRES(context, input_tensor.dims() == 4, - errors::InvalidArgument("Input must be 4-dimensional")); + errors::InvalidArgument("Input must be 4-dimensional")); } else { - OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4, - errors::InvalidArgument("Input shape must be " - "4-dimensional")); + OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4, + errors::InvalidArgument("Input shape must be " + "4-dimensional")); } } // .Input("value: T") @@ -267,66 +267,58 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { const int kOutputTensorIndexOutput = 0; }; // MklPoolingForwardBaseOp - template class MklPoolingBackwardOpBase : public MklPoolingOpBase { public: explicit MklPoolingBackwardOpBase(OpKernelConstruction* context) - : 
MklPoolingOpBase(context) { } + : MklPoolingOpBase(context) {} void Compute(OpKernelContext* context) override = 0; protected: const int kOutputTensorIndexOutput = 0; - void AllocateOutputTensor(OpKernelContext* context, - const pooling_backward::primitive_desc& pool_bkwd_prim_desc, - const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, - Tensor** output_tensor) { + void AllocateOutputTensor( + OpKernelContext* context, + const pooling_backward::primitive_desc& pool_bkwd_prim_desc, + const memory::dims output_dims_mkl_order, + const memory::format& output_tf_format, Tensor** output_tensor) { CHECK_NOTNULL(output_tensor); - memory::primitive_desc dst_pd - = pool_bkwd_prim_desc.diff_src_primitive_desc(); + memory::primitive_desc dst_pd = + pool_bkwd_prim_desc.diff_src_primitive_desc(); MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(true); output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, - output_tf_format); + output_dims_mkl_order, output_tf_format); TensorShape output_tf_shape; output_tf_shape.AddDim(this->GetNumTElements(dst_pd)); - AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, - output_tensor, - output_tf_shape, output_mkl_shape); + AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor, + output_tf_shape, output_mkl_shape); CHECK_NOTNULL(*output_tensor); } void PrepareAndExecuteNet( - const pooling_backward::primitive_desc& pool_bkwd_desc, - MklDnnData* input_gradient_diff_dst, - MklDnnData* output_diff_src, - const memory::primitive_desc& target_diff_dst_pd, - const MklDnnData* workspace = nullptr) { - + const pooling_backward::primitive_desc& pool_bkwd_desc, + MklDnnData* input_gradient_diff_dst, MklDnnData* output_diff_src, + const memory::primitive_desc& target_diff_dst_pd, + const MklDnnData* workspace = nullptr) { std::vector net; // If the input gradient isn't in the same format as the output // reorder it to the same format as the output - input_gradient_diff_dst->CheckReorderToOpMem( - target_diff_dst_pd, - &net); + input_gradient_diff_dst->CheckReorderToOpMem(target_diff_dst_pd, &net); // Create pooling primitive and add it to net if (nullptr == workspace) { net.push_back(pooling_backward(pool_bkwd_desc, - input_gradient_diff_dst->GetOpMem(), - output_diff_src->GetOpMem())); + input_gradient_diff_dst->GetOpMem(), + output_diff_src->GetOpMem())); } else { - net.push_back(pooling_backward(pool_bkwd_desc, - input_gradient_diff_dst->GetOpMem(), - workspace->GetOpMem(), - output_diff_src->GetOpMem())); + net.push_back( + pooling_backward(pool_bkwd_desc, input_gradient_diff_dst->GetOpMem(), + workspace->GetOpMem(), output_diff_src->GetOpMem())); } stream(stream::kind::eager).submit(net).wait(); } @@ -334,80 +326,76 @@ class MklPoolingBackwardOpBase : public MklPoolingOpBase { // Max Pooling and Avg Pooling have slightly different implementations // Takes the Tensor containing original input data and the original // mkl Dnn Shape and populates other data - memory::desc ConfigureOriginalInput(OpKernelContext* context, - const Tensor& tensor_original_input_shape, - const MklDnnShape& original_input_mkl_shape, - memory::dims* original_input_dims_nchw, - MklPoolParameters* pool_params, - const TensorShape& input_tensor_shape) { + memory::desc ConfigureOriginalInput( + OpKernelContext* context, const Tensor& tensor_original_input_shape, + const MklDnnShape& original_input_mkl_shape, + 
memory::dims* original_input_dims_nchw, MklPoolParameters* pool_params, + const TensorShape& input_tensor_shape) { CHECK_NOTNULL(original_input_dims_nchw); CHECK_NOTNULL(pool_params); - this->InitMklPoolParameters(context, pool_params, - original_input_mkl_shape, - input_tensor_shape); + this->InitMklPoolParameters(context, pool_params, original_input_mkl_shape, + input_tensor_shape); - *original_input_dims_nchw - = original_input_mkl_shape.IsMklTensor() - ? original_input_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(input_tensor_shape, - this->data_format_tf_); + *original_input_dims_nchw = + original_input_mkl_shape.IsMklTensor() + ? original_input_mkl_shape.GetSizesAsMklDnnDims() + : TFShapeToMklDnnDimsInNCHW(input_tensor_shape, + this->data_format_tf_); - return original_input_mkl_shape.IsMklTensor() - ? original_input_mkl_shape.GetMklLayout() - : memory::desc(*original_input_dims_nchw, - MklDnnType(), - this->data_format_mkldnn_); + return original_input_mkl_shape.IsMklTensor() + ? original_input_mkl_shape.GetMklLayout() + : memory::desc(*original_input_dims_nchw, MklDnnType(), + this->data_format_mkldnn_); } - memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params, - const MklDnnShape& original_output_mkl_shape, - memory::dims output_dims_mkl_order) { + memory::desc ConfigureOriginalOutput( + const MklPoolParameters& pool_params, + const MklDnnShape& original_output_mkl_shape, + memory::dims output_dims_mkl_order) { this->GetOutputDims(pool_params, &output_dims_mkl_order); return original_output_mkl_shape.IsMklTensor() - ? original_output_mkl_shape.GetMklLayout() - : memory::desc(output_dims_mkl_order, - MklDnnType(), - this->data_format_mkldnn_); + ? original_output_mkl_shape.GetMklLayout() + : memory::desc(output_dims_mkl_order, MklDnnType(), + this->data_format_mkldnn_); } memory::desc ConfigureInputGradient( - const MklDnnShape& input_gradient_mkl_shape, - const Tensor& input_gradient_tensor, - MklDnnData* input_gradient_dnn_data, - const memory::desc& original_output_md) { + const MklDnnShape& input_gradient_mkl_shape, + const Tensor& input_gradient_tensor, + MklDnnData* input_gradient_dnn_data, + const memory::desc& original_output_md) { // Configure the gradient as is - memory::desc original_input_grad_md - = input_gradient_mkl_shape.IsMklTensor() - ? input_gradient_mkl_shape.GetMklLayout() - : memory::desc(TFShapeToMklDnnDimsInNCHW( - input_gradient_tensor.shape(), - this->data_format_tf_), - MklDnnType(), this->data_format_mkldnn_); + memory::desc original_input_grad_md = + input_gradient_mkl_shape.IsMklTensor() + ? input_gradient_mkl_shape.GetMklLayout() + : memory::desc( + TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), + this->data_format_tf_), + MklDnnType(), this->data_format_mkldnn_); input_gradient_dnn_data->SetUsrMem(original_input_grad_md, - &input_gradient_tensor); + &input_gradient_tensor); // Check to see if input grad diff dst is in the right format // Create a new memory descriptor with the same shape as the // original, but the format of the other tensors. memory::format original_output_format = - static_cast(original_output_md.data.format); - bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded( - original_output_format); - memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor() - ? 
input_gradient_mkl_shape.GetSizesAsMklDnnDims() - : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), - this->data_format_tf_); - memory::desc target_diff_dst_md = memory::desc(diff_dst_dims, - MklDnnType(), original_output_format); + static_cast(original_output_md.data.format); + bool grad_reorder_needed = + input_gradient_dnn_data->IsReorderNeeded(original_output_format); + memory::dims diff_dst_dims = + input_gradient_mkl_shape.IsMklTensor() + ? input_gradient_mkl_shape.GetSizesAsMklDnnDims() + : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(), + this->data_format_tf_); + memory::desc target_diff_dst_md = + memory::desc(diff_dst_dims, MklDnnType(), original_output_format); - return grad_reorder_needed - ? target_diff_dst_md - : original_input_grad_md; + return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md; } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML //------------------------------------------------------------------- // Utility functions diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index dc899d8c7ee..0be8355afa4 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -16,29 +16,29 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. #ifdef INTEL_MKL +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/platform/default/logging.h" -#include "tensorflow/core/util/mkl_util.h" #include "mkl_dnn.h" #include "mkl_dnn_types.h" +#include "tensorflow/core/platform/default/logging.h" +#include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" -using mkldnn::stream; -using mkldnn::prop_kind; using mkldnn::algorithm; -using mkldnn::relu_forward; -using mkldnn::relu_backward; -using mkldnn::eltwise_relu; using mkldnn::eltwise_elu; +using mkldnn::eltwise_relu; using mkldnn::eltwise_tanh; +using mkldnn::prop_kind; +using mkldnn::relu_backward; +using mkldnn::relu_forward; +using mkldnn::stream; #endif namespace tensorflow { @@ -58,7 +58,7 @@ struct MklReluHelpers { } }; -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template class MklReluOp : public OpKernel { @@ -180,7 +180,6 @@ class MklReluOp : public OpKernel { } MklReluOpContext; }; - template class MklReluGradOp : public OpKernel { public: @@ -214,10 +213,11 @@ class MklReluGradOp : public OpKernel { if (!dnnLayoutCompare_F32(lt_input, lt_grad)) { AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad, &mkl_buffer_convert); - CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, - lt_grad), E_SUCCESS); + CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad), + E_SUCCESS); CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input, - mkl_buffer_convert), E_SUCCESS); + mkl_buffer_convert), + E_SUCCESS); relu_res[dnnResourceSrc] = mkl_buffer_convert; dnnDelete_F32(cv_input_to_grad); } else { @@ -325,7 +325,8 @@ void MklReluGradOp::Compute(OpKernelContext* context) { float negative_slope = 0.0; CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL, mkl_context.lt_grad, mkl_context.lt_grad, - negative_slope), E_SUCCESS); + negative_slope), + E_SUCCESS); Tensor 
mkl_tmp_input_buf_tensor; mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor); @@ -348,7 +349,8 @@ void MklReluGradOp::Compute(OpKernelContext* context) { } tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast( - mkl_context.output_shape.GetMklLayout())) / sizeof(T)); + mkl_context.output_shape.GetMklLayout())) / + sizeof(T)); AllocateOutputSetMklShape(context, 0, &output, tf_shape, mkl_context.output_shape); } else { @@ -361,22 +363,22 @@ void MklReluGradOp::Compute(OpKernelContext* context) { mkl_context.relu_res[dnnResourceDiffSrc] = static_cast(output->flat().data()); - CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, - mkl_context.relu_res), - E_SUCCESS); + CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res), + E_SUCCESS); mkl_context.MklCleanup(); } -#else // INTEL_MKL_DNN + +#else // INTEL_MKL_ML + template class MklReluOpBase : public OpKernel { public: ~MklReluOpBase() {} - explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) { - } + explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {} virtual void Compute_Scalar(OpKernelContext* context) = 0; @@ -413,12 +415,12 @@ class MklReluOpBase : public OpKernel { T alpha = 0, beta = 0; std::shared_ptr relu_fwd_pd; - auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, + auto relu_fwd_desc = relu_forward::desc( + prop_kind::forward_training, // Operator memory descriptor is same as user memory descriptor. - alg_kind, src.GetUsrMemDesc(), - alpha, beta); - relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc, - cpu_engine)); + alg_kind, src.GetUsrMemDesc(), alpha, beta); + relu_fwd_pd.reset( + new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); // allocate dst tensor MklDnnShape dnn_shape_dst; @@ -431,7 +433,7 @@ class MklReluOpBase : public OpKernel { dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), dnn_shape_src.GetSizesAsMklDnnDims(), dnn_shape_src.GetTfDataFormat()); - tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T)); + tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T)); } else { dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); @@ -445,34 +447,32 @@ class MklReluOpBase : public OpKernel { // execute net std::vector net; - auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(), - dst.GetOpMem()); + auto relu_fwd = + relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem()); net.push_back(relu_fwd); stream(stream::kind::eager).submit(net).wait(); - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } }; - template class MklReluGradOpBase : public OpKernel { public: ~MklReluGradOpBase() {} - explicit MklReluGradOpBase(OpKernelConstruction* context) : - OpKernel(context) {} + explicit MklReluGradOpBase(OpKernelConstruction* context) + : OpKernel(context) {} virtual void Compute_Scalar(OpKernelContext* context) = 0; - void Compute(OpKernelContext* context) { + void Compute(OpKernelContext* context) { try { auto cpu_engine = engine(engine::cpu, 0); MklDnnData src(&cpu_engine); @@ -483,9 +483,9 @@ class MklReluGradOpBase 
: public OpKernel { const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); - Tensor* diff_src_tensor = nullptr; + Tensor* diff_src_tensor = nullptr; MklDnnShape dnn_shape_src, dnn_shape_diff_dst; GetMklShape(context, src_index, &dnn_shape_src); @@ -526,25 +526,25 @@ class MklReluGradOpBase : public OpKernel { src_md = dnn_shape_src.GetMklLayout(); memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat(); - auto src_tf_data_format = MklDnnDataFormatToTFDataFormat( - src_mkl_data_format); + auto src_tf_data_format = + MklDnnDataFormatToTFDataFormat(src_mkl_data_format); auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), src_tf_data_format); - diff_dst_md = memory::desc(diff_dst_dims, MklDnnType(), - src_mkl_data_format); + diff_dst_md = + memory::desc(diff_dst_dims, MklDnnType(), src_mkl_data_format); } else if (!dnn_shape_src.IsMklTensor() && - dnn_shape_diff_dst.IsMklTensor()) { + dnn_shape_diff_dst.IsMklTensor()) { // Same comment as above. diff_dst_md = dnn_shape_diff_dst.GetMklLayout(); memory::format diff_dst_mkl_data_format = - dnn_shape_diff_dst.GetTfDataFormat(); - auto diff_dst_tf_data_format = MklDnnDataFormatToTFDataFormat( - diff_dst_mkl_data_format); + dnn_shape_diff_dst.GetTfDataFormat(); + auto diff_dst_tf_data_format = + MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format); auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), diff_dst_tf_data_format); - src_md = memory::desc(src_dims, MklDnnType(), - diff_dst_mkl_data_format); + src_md = + memory::desc(src_dims, MklDnnType(), diff_dst_mkl_data_format); } else { // If both the inputs are in MKL format, we use Mkl layout of the input // tensors. @@ -572,12 +572,12 @@ class MklReluGradOpBase : public OpKernel { std::shared_ptr relu_fwd_pd; auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training, alg_kind, src_md, alpha, beta); - relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc, - cpu_engine)); - auto relu_bwd_desc = relu_backward::desc(alg_kind, common_md, common_md, - alpha, beta); - auto relu_bwd_pd = relu_backward::primitive_desc(relu_bwd_desc, - cpu_engine, *relu_fwd_pd); + relu_fwd_pd.reset( + new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine)); + auto relu_bwd_desc = + relu_backward::desc(alg_kind, common_md, common_md, alpha, beta); + auto relu_bwd_pd = relu_backward::primitive_desc( + relu_bwd_desc, cpu_engine, *relu_fwd_pd); // allocate diff_src tensor MklDnnShape dnn_shape_diff_src; @@ -590,33 +590,32 @@ class MklReluGradOpBase : public OpKernel { dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), dnn_shape_src.GetSizesAsMklDnnDims(), dnn_shape_src.GetTfDataFormat()); - tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T)); + tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T)); } else { dnn_shape_diff_src.SetMklTensor(false); tf_shape_diff_src = src_tensor.shape(); } AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor, - tf_shape_diff_src, dnn_shape_diff_src); + tf_shape_diff_src, dnn_shape_diff_src); // diff_src memory descriptor is same as memory descriptor for both // inputs. 
diff_src.SetUsrMem(common_md, diff_src_tensor); PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst); - } catch (mkldnn::error &e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + } catch (mkldnn::error& e) { + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc, - MklDnnData* src, MklDnnData* diff_src, MklDnnData* - diff_dst) { + MklDnnData* src, MklDnnData* diff_src, + MklDnnData* diff_dst) { std::vector net; // Check if we need to reorder original input tensors into common_md layout @@ -632,14 +631,13 @@ class MklReluGradOpBase : public OpKernel { } }; - template class MklReluOp : public MklReluOpBase { public: ~MklReluOp() {} - explicit MklReluOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklReluOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -649,15 +647,15 @@ class MklReluOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, src_tensor.shape(), dnn_shape_dst); void* out_o = static_cast(dst_tensor->flat().data()); (static_cast(out_o))[0] = - std::max((static_cast(user_i))[0], static_cast(0)); + std::max((static_cast(user_i))[0], static_cast(0)); return; } }; @@ -667,14 +665,14 @@ class MklReluGradOp : public MklReluGradOpBase { public: ~MklReluGradOp() {} - explicit MklReluGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklReluGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -687,11 +685,11 @@ class MklReluGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); - (static_cast(out_o))[0] = (static_cast(user_g))[0] * - ((static_cast(user_i))[0] > 0); + static_cast(const_cast(diff_dst_tensor.flat().data())); + (static_cast(out_o))[0] = + (static_cast(user_g))[0] * ((static_cast(user_i))[0] > 0); return; } }; @@ -701,8 +699,8 @@ class MklEluOp : public MklReluOpBase { public: ~MklEluOp() {} - explicit 
MklEluOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklEluOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -712,8 +710,8 @@ class MklEluOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, @@ -734,14 +732,14 @@ class MklEluGradOp : public MklReluGradOpBase { public: ~MklEluGradOp() {} - explicit MklEluGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklEluGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -754,9 +752,9 @@ class MklEluGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); + static_cast(const_cast(diff_dst_tensor.flat().data())); // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise T feature = (static_cast(user_i))[0]; if (feature > 0) { @@ -773,8 +771,8 @@ class MklTanhOp : public MklReluOpBase { public: ~MklTanhOp() {} - explicit MklTanhOp(OpKernelConstruction* context) : - MklReluOpBase(context) {} + explicit MklTanhOp(OpKernelConstruction* context) + : MklReluOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t src_index = 0; // index of src input tensor @@ -784,8 +782,8 @@ class MklTanhOp : public MklReluOpBase { GetMklShape(context, src_index, &dnn_shape_src); Tensor* dst_tensor = nullptr; - void* user_i = static_cast(const_cast( - src_tensor.flat().data())); + void* user_i = + static_cast(const_cast(src_tensor.flat().data())); MklDnnShape dnn_shape_dst; dnn_shape_dst.SetMklTensor(false); AllocateOutputSetMklShape(context, dst_index, &dst_tensor, @@ -795,7 +793,7 @@ class MklTanhOp : public MklReluOpBase { T feature = (static_cast(user_i))[0]; T e1 = std::exp(feature); T e2 = std::exp(-feature); - (static_cast(out_o))[0] = (e1 - e2)/(e1 + e2); + (static_cast(out_o))[0] = (e1 - e2) / (e1 + e2); return; } }; @@ -805,14 +803,14 @@ class MklTanhGradOp : public MklReluGradOpBase { public: ~MklTanhGradOp() {} - explicit MklTanhGradOp(OpKernelConstruction* context) : - MklReluGradOpBase(context) {} + explicit MklTanhGradOp(OpKernelConstruction* context) + : MklReluGradOpBase(context) {} virtual void Compute_Scalar(OpKernelContext* context) { const size_t diff_dst_index = 0; // index of diff_dst input tensor const size_t src_index = 1; // index of src input tensor const size_t diff_src_index = 0; // index of diff_src output tensor - const Tensor& src_tensor = 
MklGetInput(context, src_index); + const Tensor& src_tensor = MklGetInput(context, src_index); const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index); Tensor* diff_src_tensor = nullptr; @@ -825,16 +823,16 @@ class MklTanhGradOp : public MklReluGradOpBase { diff_dst_tensor.shape(), dnn_shape_diff_src); void* out_o = static_cast(diff_src_tensor->flat().data()); void* user_i = - static_cast(const_cast(src_tensor.flat().data())); + static_cast(const_cast(src_tensor.flat().data())); // gradient of tanh(x) = 1 - tanh(x)^2 T feature = (static_cast(user_i))[0]; T e1 = std::exp(feature); T e2 = std::exp(-feature); - T tanh = (e1 - e2)/(e1 + e2); + T tanh = (e1 - e2) / (e1 + e2); void* user_g = - static_cast(const_cast(diff_dst_tensor.flat().data())); - (static_cast(out_o))[0] = (static_cast(user_g))[0] * - (1 - tanh * tanh); + static_cast(const_cast(diff_dst_tensor.flat().data())); + (static_cast(out_o))[0] = + (static_cast(user_g))[0] * (1 - tanh * tanh); } }; @@ -854,16 +852,16 @@ class MklTanhGradOp : public MklReluGradOpBase { MklReluGradOp); TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES); -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // register dnn kernels for supported operations and supported types -#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \ - REGISTER_KERNEL_BUILDER(Name("_MklElu") \ +#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type) \ + REGISTER_KERNEL_BUILDER(Name("_MklElu") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ - MklEluOp); \ - REGISTER_KERNEL_BUILDER(Name("_MklEluGrad") \ + MklEluOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklEluGrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklOpLabel), \ @@ -888,4 +886,3 @@ TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES); } // namespace tensorflow #endif // INTEL_MKL - diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc index b41e529357b..5dbc4a2709e 100644 --- a/tensorflow/core/kernels/mkl_reshape_op.cc +++ b/tensorflow/core/kernels/mkl_reshape_op.cc @@ -28,7 +28,7 @@ limitations under the License. #include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::stream; #endif @@ -40,7 +40,7 @@ class MklReshapeOp : public OpKernel { public: explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {} -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML void Compute(OpKernelContext* context) override { const Tensor& input = MklGetInput(context, 0); const Tensor& sizes = MklGetInput(context, 1); @@ -166,9 +166,9 @@ class MklReshapeOp : public OpKernel { MklDnnShape mkl_shape_input; GetMklShape(context, kInputSlotIdx, &mkl_shape_input); bool input_in_mkl_format = mkl_shape_input.IsMklTensor(); - const int64 nelems = input_in_mkl_format ? - mkl_shape_input.GetTfShape().num_elements() - : input_tensor.NumElements(); + const int64 nelems = input_in_mkl_format + ? mkl_shape_input.GetTfShape().num_elements() + : input_tensor.NumElements(); // Preliminary validation of sizes. 
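The Compute_Scalar fallbacks in the mkl_relu_op.cc hunks above apply the activation gradients one element at a time; restated as plain float functions for illustration (the kernels are templated on T and read and write the raw Tensor buffers):

#include <cmath>

// gradient of relu(x): pass the incoming gradient through only where x > 0.
inline float ReluGradScalar(float x, float grad) {
  return x > 0.0f ? grad : 0.0f;
}

// gradient of elu(x): 1 for x > 0, elu(x) + 1 = exp(x) otherwise.
inline float EluGradScalar(float x, float grad) {
  return x > 0.0f ? grad : grad * std::exp(x);
}

// gradient of tanh(x): 1 - tanh(x)^2, with tanh written as in the kernel.
inline float TanhGradScalar(float x, float grad) {
  const float e1 = std::exp(x);
  const float e2 = std::exp(-x);
  const float t = (e1 - e2) / (e1 + e2);
  return grad * (1.0f - t * t);
}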
OP_REQUIRES(context, IsLegacyVector(sizes.shape()), @@ -210,11 +210,11 @@ class MklReshapeOp : public OpKernel { product)); shape.set_dim(unknown_index, missing); } - OP_REQUIRES(context, shape.num_elements() == nelems, - errors::InvalidArgument("Input to reshape is a tensor with ", - nelems, - " values, but the requested shape has ", - shape.num_elements())); + OP_REQUIRES( + context, shape.num_elements() == nelems, + errors::InvalidArgument("Input to reshape is a tensor with ", nelems, + " values, but the requested shape has ", + shape.num_elements())); if (input_in_mkl_format) { TensorShape& shape_to = shape; @@ -237,38 +237,38 @@ class MklReshapeOp : public OpKernel { // need to update MklDnnShape object associated with the input // tensor to reflect the shape change expected by reshape. if (!SkipReorder(mkl_shape_input, shape_to)) { - // If dimensions that are being expanded or collapsed are not - // maintained contiguously by MKLDNN, then we use reorder. + // If dimensions that are being expanded or collapsed are not + // maintained contiguously by MKLDNN, then we use reorder. - // Get Mkl layout of input tensor. - auto input_mkl_md = mkl_shape_input.GetMklLayout(); - // Set input Mkl layout as the user layout. - dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor); - // Get expected Tensorflow layout of input tensor. - auto output_tf_md = mkl_shape_input.GetTfLayout(); - auto output_tf_pd = memory::primitive_desc(output_tf_md, - cpu_engine); + // Get Mkl layout of input tensor. + auto input_mkl_md = mkl_shape_input.GetMklLayout(); + // Set input Mkl layout as the user layout. + dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor); + // Get expected Tensorflow layout of input tensor. + auto output_tf_md = mkl_shape_input.GetTfLayout(); + auto output_tf_pd = + memory::primitive_desc(output_tf_md, cpu_engine); - Tensor* output_tensor = nullptr; - MklShape mkl_shape_output; - mkl_shape_output.SetMklTensor(false); - // We allocate output tensor in the shape expected by Reshape. - AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor, - shape_to, mkl_shape_output); + Tensor* output_tensor = nullptr; + MklShape mkl_shape_output; + mkl_shape_output.SetMklTensor(false); + // We allocate output tensor in the shape expected by Reshape. + AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor, + shape_to, mkl_shape_output); - // Insert reorder between Mkl layout and TensorFlow layout if - // needed. If reorder is not needed but reshape is needed (since - // shape_from != shape_to), then we just copy input tensor to - // output tensor with target shape (we cannot forward Mkl layout - // in such case because shape has changed.) - std::vector net; - if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, - output_tensor, &net)) { - stream(stream::kind::eager).submit(net).wait(); - } else { - output_tensor->CopyFrom(input_tensor, shape_to); - } - return; + // Insert reorder between Mkl layout and TensorFlow layout if + // needed. If reorder is not needed but reshape is needed (since + // shape_from != shape_to), then we just copy input tensor to + // output tensor with target shape (we cannot forward Mkl layout + // in such case because shape has changed.) 
+ std::vector net; + if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, output_tensor, + &net)) { + stream(stream::kind::eager).submit(net).wait(); + } else { + output_tensor->CopyFrom(input_tensor, shape_to); + } + return; } else { // If dimensions that are being expanded or collapsed are // maintained contiguously by MKLDNN, then we skip reorder, just @@ -276,10 +276,10 @@ class MklReshapeOp : public OpKernel { // Tensorflow tensor as it is to the output. auto output_dims = TFShapeToMklDnnDims(shape_to); auto output_strides = CalculateTFStrides(output_dims); - auto output_tf_md = MklDnnData::CreateBlockedMemDesc(output_dims, - output_strides); - auto output_tf_pd = memory::primitive_desc(output_tf_md, - cpu_engine); + auto output_tf_md = MklDnnData::CreateBlockedMemDesc( + output_dims, output_strides); + auto output_tf_pd = + memory::primitive_desc(output_tf_md, cpu_engine); // Set MklDnnShape MklDnnShape mkl_shape_output; @@ -291,18 +291,17 @@ class MklReshapeOp : public OpKernel { // We now simply forward input Mkl tensor to output and change its // output MklDnnShape object. - ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx, - kOutputSlotIdx, mkl_shape_output); + ForwardMklTensorInToOutWithMklShape( + context, kInputSlotIdx, kOutputSlotIdx, mkl_shape_output); return; } - } catch (mkldnn::error &e) { + } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + - ", in file " + string(__FILE__) + ":" + - std::to_string(__LINE__); - OP_REQUIRES_OK(context, - errors::Aborted("Operation received an exception:", - error_msg)); + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); + OP_REQUIRES_OK( + context, + errors::Aborted("Operation received an exception:", error_msg)); } } } else { @@ -313,7 +312,7 @@ class MklReshapeOp : public OpKernel { } } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML private: const int kInputSlotIdx = 0; diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc index c46eabdde10..aceef1e234e 100644 --- a/tensorflow/core/kernels/mkl_softmax_op.cc +++ b/tensorflow/core/kernels/mkl_softmax_op.cc @@ -15,7 +15,7 @@ limitations under the License. // See docs in ../ops/nn_ops.cc. #ifdef INTEL_MKL -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" @@ -156,5 +156,5 @@ TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES); } // namespace tensorflow -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h index c4d5a45d3ca..5fafa14b5db 100644 --- a/tensorflow/core/kernels/mkl_tfconv_op.h +++ b/tensorflow/core/kernels/mkl_tfconv_op.h @@ -35,7 +35,7 @@ limitations under the License. 
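In the no-reorder branch of MklReshapeOp above, CalculateTFStrides only derives dense row-major strides for the target shape before a blocked memory descriptor is built. A sketch of that stride computation over plain std::vector (the real helper works on mkldnn memory::dims; RowMajorStrides is a hypothetical name):

#include <cstdint>
#include <vector>

// Row-major strides: the innermost dimension has stride 1; each outer stride
// is the product of all inner dimension sizes.
inline std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}
// Example: dims {2, 3, 4} gives strides {12, 4, 1}.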
#include "mkl_dnn_types.h" #include "tensorflow/core/util/mkl_util.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML using mkldnn::stream; #endif @@ -61,7 +61,7 @@ class MklToTfOp : public OpKernel { VLOG(1) << "MKLToTFConversion complete successfully."; } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context, string data_format_str, DataType op_data_type, bool has_avx512f, uint input_number) { diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc index 17f2af550f2..0e820bbb620 100644 --- a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc +++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc @@ -71,10 +71,10 @@ class NeonDepthwiseConv2dNativeOp : public BinaryOp { filter.shape().DebugString())); const int32 in_depth = input.dim_size(3); - OP_REQUIRES( - context, in_depth == filter.dim_size(2), - errors::InvalidArgument("input and filter must have the same depth: ", - in_depth, " vs ", filter.dim_size(2))); + OP_REQUIRES(context, in_depth == filter.dim_size(2), + errors::InvalidArgument( + "input and filter must have the same depth: ", in_depth, + " vs ", filter.dim_size(2))); const int32 batch = input.dim_size(0); const int32 input_rows = input.dim_size(1); const int32 input_cols = input.dim_size(2); diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc index 64bdef0008f..5d28b87e6bb 100644 --- a/tensorflow/core/kernels/non_max_suppression_op.cc +++ b/tensorflow/core/kernels/non_max_suppression_op.cc @@ -92,13 +92,11 @@ static inline bool IOUGreaterThanThreshold( return iou > iou_threshold; } -void DoNonMaxSuppressionOp(OpKernelContext* context, - const Tensor& boxes, - const Tensor& scores, - const Tensor& max_output_size, +void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes, + const Tensor& scores, const Tensor& max_output_size, const float iou_threshold) { OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1, - errors::InvalidArgument("iou_threshold must be in [0, 1]")); + errors::InvalidArgument("iou_threshold must be in [0, 1]")); int num_boxes = 0; ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes); @@ -106,10 +104,8 @@ void DoNonMaxSuppressionOp(OpKernelContext* context, return; } - const int output_size = - std::min(max_output_size.scalar()(), num_boxes); - typename TTypes::ConstTensor boxes_data = - boxes.tensor(); + const int output_size = std::min(max_output_size.scalar()(), num_boxes); + typename TTypes::ConstTensor boxes_data = boxes.tensor(); std::vector scores_data(num_boxes); std::copy_n(scores.flat().data(), num_boxes, scores_data.begin()); @@ -181,8 +177,7 @@ template class NonMaxSuppressionV2Op : public OpKernel { public: explicit NonMaxSuppressionV2Op(OpKernelConstruction* context) - : OpKernel(context) { - } + : OpKernel(context) {} void Compute(OpKernelContext* context) override { // boxes: [num_boxes, 4] @@ -197,10 +192,9 @@ class NonMaxSuppressionV2Op : public OpKernel { max_output_size.shape().DebugString())); // iou_threshold: scalar const Tensor& iou_threshold = context->input(3); - OP_REQUIRES( - context, TensorShapeUtils::IsScalar(iou_threshold.shape()), - errors::InvalidArgument("iou_threshold must be 0-D, got shape ", - iou_threshold.shape().DebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()), + errors::InvalidArgument("iou_threshold must be 0-D, got shape ", + 
iou_threshold.shape().DebugString())); const float iou_threshold_val = iou_threshold.scalar()(); diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc index fdbcf05b89d..67d9217b950 100644 --- a/tensorflow/core/kernels/non_max_suppression_op_test.cc +++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc @@ -43,9 +43,10 @@ class NonMaxSuppressionOpTest : public OpsTestBase { TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {3}); TF_ASSERT_OK(RunOpKernel()); @@ -58,7 +59,7 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) { TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) { MakeOp(.5); AddInputFromArray(TensorShape({6, 4}), - {1, 1, 0, 0, 0, 0.1f, 1, 1.1f, 0, .9f, 1, -0.1f, + {1, 1, 0, 0, 0, 0.1f, 1, 1.1f, 0, .9f, 1, -0.1f, 0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {3}); @@ -71,9 +72,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) { TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {2}); TF_ASSERT_OK(RunOpKernel()); @@ -85,9 +87,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) { TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f}); AddInputFromArray(TensorShape({}), {30}); TF_ASSERT_OK(RunOpKernel()); @@ -134,9 +137,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromTenIdenticalBoxes) { TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) { MakeOp(.5); - AddInputFromArray(TensorShape({6, 4}), - {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, - 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); + AddInputFromArray( + TensorShape({6, 4}), + {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1, 0.9f, + 0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100, 1, 101}); AddInputFromArray(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f}); AddInputFromArray(TensorShape({}), {30}); Status s = RunOpKernel(); diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc index da825e408c2..7f12eb953a3 100644 --- a/tensorflow/core/kernels/nth_element_op.cc +++ b/tensorflow/core/kernels/nth_element_op.cc @@ -16,15 +16,15 @@ 
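Stepping back from the formatting changes in non_max_suppression_op.cc above: DoNonMaxSuppressionOp visits boxes in descending score order and keeps a box only while fewer than max_output_size boxes are selected and its IoU with every previously kept box does not exceed iou_threshold. A self-contained sketch of that greedy loop with standard containers (the kernel itself validates shapes and works on Tensors; GreedyNms and IoU below are illustrative names, not the op's code):

#include <algorithm>
#include <array>
#include <numeric>
#include <vector>

using Box = std::array<float, 4>;  // {y1, x1, y2, x2}, either corner order.

// Intersection-over-union of two axis-aligned boxes.
inline float IoU(const Box& a, const Box& b) {
  const float a_ymin = std::min(a[0], a[2]), a_ymax = std::max(a[0], a[2]);
  const float a_xmin = std::min(a[1], a[3]), a_xmax = std::max(a[1], a[3]);
  const float b_ymin = std::min(b[0], b[2]), b_ymax = std::max(b[0], b[2]);
  const float b_xmin = std::min(b[1], b[3]), b_xmax = std::max(b[1], b[3]);
  const float area_a = (a_ymax - a_ymin) * (a_xmax - a_xmin);
  const float area_b = (b_ymax - b_ymin) * (b_xmax - b_xmin);
  if (area_a <= 0.0f || area_b <= 0.0f) return 0.0f;
  const float ih =
      std::max(0.0f, std::min(a_ymax, b_ymax) - std::max(a_ymin, b_ymin));
  const float iw =
      std::max(0.0f, std::min(a_xmax, b_xmax) - std::max(a_xmin, b_xmin));
  const float inter = ih * iw;
  return inter / (area_a + area_b - inter);
}

// Greedy NMS: highest score first, reject anything overlapping a kept box by
// more than iou_threshold, stop after max_output boxes.
inline std::vector<int> GreedyNms(const std::vector<Box>& boxes,
                                  const std::vector<float>& scores,
                                  int max_output, float iou_threshold) {
  std::vector<int> order(boxes.size());
  std::iota(order.begin(), order.end(), 0);
  std::sort(order.begin(), order.end(),
            [&scores](int i, int j) { return scores[i] > scores[j]; });
  std::vector<int> keep;
  for (int idx : order) {
    if (static_cast<int>(keep.size()) >= max_output) break;
    bool overlaps = false;
    for (int k : keep) {
      if (IoU(boxes[idx], boxes[k]) > iou_threshold) {
        overlaps = true;
        break;
      }
    }
    if (!overlaps) keep.push_back(idx);
  }
  return keep;
}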
limitations under the License. // See docs in ../ops/nn_ops.cc. #include "tensorflow/core/kernels/nth_element_op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/util/work_sharder.h" -#include #include #include +#include +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -54,8 +54,9 @@ class NthElementOp : public OpKernel { errors::InvalidArgument("Input must be >= 1-D, got shape ", input_in.shape().DebugString())); // The last dimension of input tensor must be greater than N. - OP_REQUIRES(context, input_in.dim_size(num_dims-1) > n, - errors::InvalidArgument("Input must have at least n+1 columns")); + OP_REQUIRES( + context, input_in.dim_size(num_dims - 1) > n, + errors::InvalidArgument("Input must have at least n+1 columns")); // std::nth_element only support the nth-smallest selection. if (reverse_) { @@ -64,7 +65,7 @@ class NthElementOp : public OpKernel { // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1]. TensorShape out_shape; - for (int i = 0; i < num_dims-1; ++i) { + for (int i = 0; i < num_dims - 1; ++i) { out_shape.AddDim(input_in.dim_size(i)); } Tensor* output_tensor = nullptr; @@ -83,32 +84,28 @@ namespace functor { template struct NthElementFunctor { - void operator() (OpKernelContext* context, - const Tensor& input_tensor, - Tensor& output_tensor, - int n, - bool reverse) { + void operator()(OpKernelContext* context, const Tensor& input_tensor, + Tensor& output_tensor, int n, bool reverse) { const T* input = input_tensor.flat().data(); T* output = output_tensor.flat().data(); // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1], // then num_rows = d1*d2...dk-1, last_dim = dk. const int num_rows = output_tensor.NumElements(); - const int last_dim = input_tensor.dim_size(input_tensor.dims()-1); + const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1); // Allocate each row to different shard. - auto SubNthElement = [&, input, output, last_dim, n](int start, - int limit) { + auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) { // std::nth_element would rearrange the array, so we need a new buffer. std::vector buf(last_dim); for (int b = start; b < limit; ++b) { // Copy from one row of elements to buffer const T* input_start = input + b * last_dim; - const T* input_end = input + (b+1) * last_dim; + const T* input_end = input + (b + 1) * last_dim; std::copy(input_start, input_end, buf.begin()); - std::nth_element(buf.begin(), buf.begin()+n, buf.end()); + std::nth_element(buf.begin(), buf.begin() + n, buf.end()); // The element placed in the nth position is exactly the element that // would occur in this position if the range was fully sorted. output[b] = buf[n]; @@ -116,9 +113,9 @@ struct NthElementFunctor { }; auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - // The average time complexity of partition-based nth_element (BFPRT) is O(n), - // althought the worst time complexity could be O(n^2). - // Here, 20 is a empirical factor of cost_per_unit. 
+ // The average time complexity of partition-based nth_element (BFPRT) is + // O(n), although the worst time complexity could be O(n^2). Here, 20 is an + // empirical factor of cost_per_unit. Shard(worker_threads.num_threads, worker_threads.workers, num_rows, 20 * last_dim, SubNthElement); } @@ -126,7 +123,6 @@ struct NthElementFunctor { } // namespace functor - #define REGISTER_NTHOP(T) \ REGISTER_KERNEL_BUILDER( \ Name("NthElement").Device(DEVICE_CPU).TypeConstraint("T"), \ @@ -136,4 +132,3 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_NTHOP); #undef REGISTER_NTHOP } // end namespace tensorflow - diff --git a/tensorflow/core/kernels/nth_element_op.h b/tensorflow/core/kernels/nth_element_op.h index 11a6c996b09..e7d25daecc7 100644 --- a/tensorflow/core/kernels/nth_element_op.h +++ b/tensorflow/core/kernels/nth_element_op.h @@ -26,10 +26,8 @@ namespace functor { template struct NthElementFunctor { - void operator() (OpKernelContext* context, - const Tensor& input_tensor, - Tensor& output_tensor, - int n); + void operator()(OpKernelContext* context, const Tensor& input_tensor, + Tensor& output_tensor, int n); }; } // namespace functor diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc index 49fd4bdebad..647515ae38a 100644 --- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc +++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc @@ -19,16 +19,16 @@ limitations under the License. #define EIGEN_USE_GPU -#include "tensorflow/core/kernels/one_hot_op.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/one_hot_op.h" namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; -#define DEFINE_GPU_SPEC_INDEX(T, TI) \ - template class generator::OneGenerator; \ +#define DEFINE_GPU_SPEC_INDEX(T, TI) \ + template class generator::OneGenerator; \ template struct functor::OneHot; #define DEFINE_GPU_SPEC(T) \ diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc index 9d53882deef..13427d71ff6 100644 --- a/tensorflow/core/kernels/ops_util_test.cc +++ b/tensorflow/core/kernels/ops_util_test.cc @@ -218,7 +218,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) { // in_size = 3, ksize = 3, stride = 2, pad_size = 0 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) { bcast_struct bcast[] = { - {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}}, + {{0, 3, 3, 2, 0}, {0, 3}}, + {{1, 3, 3, 2, 0}, {2, 1}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); } } @@ -228,7 +229,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) { // in_size = 3, ksize = 3, stride = 2, pad_size = 1 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) { bcast_struct bcast[] = { - {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}}, + {{0, 3, 3, 2, 1}, {0, 2}}, + {{1, 3, 3, 2, 1}, {1, 2}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); } } @@ -258,7 +260,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) { // in_size = 3, ksize = 3, stride = 3, pad_size = 1 TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) { bcast_struct bcast[] = { - {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}}, + {{0, 3, 3, 3, 1}, {0, 2}}, + {{1, 3, 3, 3, 1}, {2, 1}}, }; for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { VerifyBcastValues(bcast[i]); } } @@ -348,8 +351,8 @@ TEST_F(OpsUtilTest, Misaligned1DSlice) { TEST_F(OpsUtilTest, Aligned2DSliceOfDim0) { #if EIGEN_MAX_ALIGN_BYTES == 0 - // When EIGEN_MAX_ALIGN_BYTES is 0 
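The sharded loop in NthElementFunctor above reduces each row independently; stripped of the Shard() threading helper it is essentially the following (float only; for reverse_ the op first converts n to the equivalent smallest-index selection, which is not shown in this hunk):

#include <algorithm>
#include <vector>

// For each row of length last_dim, write the n-th smallest element of that
// row to output[row]. std::nth_element rearranges its buffer, so every row is
// first copied into scratch space, exactly as the kernel does.
inline void NthElementPerRow(const float* input, float* output, int num_rows,
                             int last_dim, int n) {
  std::vector<float> buf(last_dim);
  for (int b = 0; b < num_rows; ++b) {
    const float* row = input + b * last_dim;
    std::copy(row, row + last_dim, buf.begin());
    std::nth_element(buf.begin(), buf.begin() + n, buf.end());
    output[b] = buf[n];  // the element a full sort would place at index n
  }
}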
and the size of the first dimension is nonzero, - // a multidimensional tensor is always aligned. + // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is + // nonzero, a multidimensional tensor is always aligned. Tensor t(DT_FLOAT, TensorShape({3, 4})); int64 start = 1; int64 end = 2; diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc index 2033fbf5dc3..5645275cfa9 100644 --- a/tensorflow/core/kernels/pack_op.cc +++ b/tensorflow/core/kernels/pack_op.cc @@ -36,7 +36,7 @@ typedef Eigen::GpuDevice GPUDevice; #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // -------------------------------------------------------------------------- template @@ -123,7 +123,7 @@ class PackOp : public OpKernel { ConcatSYCL(c->eigen_sycl_device(), inputs_flat, &output_flat); return; } -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL ConcatCPU(c->device(), inputs_flat, &output_flat); } } @@ -139,7 +139,6 @@ class PackOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER_PACK); TF_CALL_QUANTIZED_TYPES(REGISTER_PACK); -TF_CALL_variant(REGISTER_PACK); #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) // Primarily used for SavedModel support on mobile. diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc index b232ba16a76..0ab9ff9f650 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc @@ -95,9 +95,10 @@ struct TruncatedNormalFunctor { int64 sample = b * samples_per_batch; // On GPU, this check will just fill samples with NAN if it fails. - OP_REQUIRES(ctx, stddev > T(0) && minval < maxval && - (Eigen::numext::isfinite(minval) || - Eigen::numext::isfinite(maxval)), + OP_REQUIRES(ctx, + stddev > T(0) && minval < maxval && + (Eigen::numext::isfinite(minval) || + Eigen::numext::isfinite(maxval)), errors::InvalidArgument("Invalid parameters")); int numIterations = 0; @@ -118,8 +119,9 @@ struct TruncatedNormalFunctor { // Determine the method to use. const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4)); const T cutoff = - T(2) * Eigen::numext::exp( - T(0.5) + (normMin * (normMin - sqrtFactor)) / T(4)) / + T(2) * + Eigen::numext::exp(T(0.5) + + (normMin * (normMin - sqrtFactor)) / T(4)) / (normMin + sqrtFactor); const T diff = normMax - normMin; if (diff < cutoff) { @@ -309,30 +311,34 @@ class ParameterizedTruncatedNormalOp : public OpKernel { } else { // Parameters must be broadcastable to the shape [num_batches]. 
OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(means_tensor.shape()) || - means_tensor.dim_size(0) == 1 || - means_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(means_tensor.shape()) || + means_tensor.dim_size(0) == 1 || + means_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input means should have length 1 or shape[0], got shape: ", means_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(stddevs_tensor.shape()) || - stddevs_tensor.dim_size(0) == 1 || - stddevs_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(stddevs_tensor.shape()) || + stddevs_tensor.dim_size(0) == 1 || + stddevs_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input stddevs should have length 1 or shape[0], got shape: ", stddevs_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(minvals_tensor.shape()) || - minvals_tensor.dim_size(0) == 1 || - minvals_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(minvals_tensor.shape()) || + minvals_tensor.dim_size(0) == 1 || + minvals_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input minvals should have length 1 or shape[0], got shape: ", minvals_tensor.shape().DebugString())); OP_REQUIRES( - ctx, TensorShapeUtils::IsScalar(maxvals_tensor.shape()) || - maxvals_tensor.dim_size(0) == 1 || - maxvals_tensor.dim_size(0) == num_batches, + ctx, + TensorShapeUtils::IsScalar(maxvals_tensor.shape()) || + maxvals_tensor.dim_size(0) == 1 || + maxvals_tensor.dim_size(0) == num_batches, errors::InvalidArgument( "Input maxvals should have length 1 or shape[0], got shape: ", maxvals_tensor.shape().DebugString())); diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc index 933de65c15a..ddfeb1bb790 100644 --- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc +++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc @@ -202,12 +202,13 @@ struct TruncatedNormalFunctor { typename TTypes::Flat output) { const auto config = GetCudaLaunchConfig(num_elements, d); - TruncatedNormalKernel< - T><<>>( - gen, output.data(), num_batches, samples_per_batch, num_elements, - means.data(), means.dimension(0) == 1, stddevs.data(), - stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1, - maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations); + TruncatedNormalKernel + <<>>( + gen, output.data(), num_batches, samples_per_batch, num_elements, + means.data(), means.dimension(0) == 1, stddevs.data(), + stddevs.dimension(0) == 1, minvals.data(), + minvals.dimension(0) == 1, maxvals.data(), + maxvals.dimension(0) == 1, kMaxIterations); }; }; diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc index 6b599612ad7..8e175fe8d4b 100644 --- a/tensorflow/core/kernels/parse_tensor_op.cc +++ b/tensorflow/core/kernels/parse_tensor_op.cc @@ -22,7 +22,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/framework/register_types.h" namespace tensorflow { @@ -92,7 +91,6 @@ class SerializeTensorOp : public OpKernel { Name("SerializeTensor").Device(DEVICE_CPU).TypeConstraint("T"), \ SerializeTensorOp); TF_CALL_ALL_TYPES(REGISTER) -TF_CALL_variant(REGISTER) #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc index a406317213f..01bcfede1e8 100644 --- a/tensorflow/core/kernels/pooling_ops_3d.cc +++ b/tensorflow/core/kernels/pooling_ops_3d.cc @@ -258,7 +258,7 @@ struct LaunchMaxPooling3dGradOp { Eigen::array bcast = {1, csize, rsize, psize, 1}; #else Eigen::IndexList, int, int, int, - Eigen::type2index<1> > + Eigen::type2index<1>> bcast; bcast.set(1, csize); bcast.set(2, rsize); @@ -431,7 +431,7 @@ struct LaunchAvgPooling3dGradOp { Eigen::array bcast = {1, csize, rsize, psize, 1}; #else Eigen::IndexList, int, int, int, - Eigen::type2index<1> > + Eigen::type2index<1>> bcast; bcast.set(1, csize); bcast.set(2, rsize); @@ -833,7 +833,7 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS) #ifdef TENSORFLOW_USE_SYCL #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T) -TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS) + TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS) #undef REGISTER_SYCL_KERNELS #endif // TENSORFLOW_USE_SYCL diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h index c1bc5af4986..b4bead2456d 100644 --- a/tensorflow/core/kernels/pooling_ops_3d_sycl.h +++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h @@ -281,12 +281,11 @@ class MaxPool3DGradSYCL { const T* input_data_n = input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_; - const T* output_data_n = - output_data + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; - const T* input_backprop_n = - input_backprop + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + const T* output_data_n = output_data + n * p_.out_planes_ * p_.out_cols_ * + p_.out_rows_ * p_.depth_; + const T* input_backprop_n = input_backprop + n * p_.out_planes_ * + p_.out_cols_ * + p_.out_rows_ * p_.depth_; for (int poolp = poolpstart; poolp < poolpend; ++poolp) { int pstart = poolp * p_.stride_planes_ - p_.pad_planes_; const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); @@ -678,9 +677,9 @@ class AvgPool3DGradSYCL { n /= p_.in_planes_; T gradient = T(0); - const T* input_backprop_n = - input_backprop + - n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_; + const T* input_backprop_n = input_backprop + n * p_.out_planes_ * + p_.out_cols_ * + p_.out_rows_ * p_.depth_; for (int poolp = poolpstart; poolp < poolpend; ++poolp) { int pstart = poolp * p_.stride_planes_ - p_.pad_planes_; const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_); diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h index e3131b804f2..fc7cb437b8f 100644 --- a/tensorflow/core/kernels/pooling_ops_common.h +++ b/tensorflow/core/kernels/pooling_ops_common.h @@ -195,7 +195,6 @@ class MaxPoolingOp : public OpKernel { // and updates the corresponding column(s) in output_as_matrix with the // max value. 
auto shard = [¶ms, &in_mat, &out_mat](int64 start, int64 limit) { - const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; const int32 pad_rows = params.pad_rows; @@ -443,7 +442,6 @@ class MaxPoolingV2Op : public OpKernel { // and updates the corresponding column(s) in output_as_matrix with the // max value. auto shard = [¶ms, &in_mat, &out_mat](int64 start, int64 limit) { - const int32 in_rows = params.tensor_in_rows; const int32 in_cols = params.tensor_in_cols; const int32 pad_rows = params.pad_rows; diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc index d148c9f78d6..176720c22cc 100644 --- a/tensorflow/core/kernels/quantization_utils_test.cc +++ b/tensorflow/core/kernels/quantization_utils_test.cc @@ -385,8 +385,12 @@ void TestQuantizedToFloatInPlaceUsingEigen( // These are the float values we're going to test the conversions on. typedef std::pair FPair; for (FPair min_and_max : std::vector{ - FPair(-255.0f, 255.0f), FPair(-1.0f, 1.0f), FPair(-1.0f, 255.0f), - FPair(0.0f, 1e6), FPair(0.0f, 1.0f), FPair(-31.0f, 13.0f), + FPair(-255.0f, 255.0f), + FPair(-1.0f, 1.0f), + FPair(-1.0f, 255.0f), + FPair(0.0f, 1e6), + FPair(0.0f, 1.0f), + FPair(-31.0f, 13.0f), FPair(-5.89505e+08, 5.89505e+08), }) { const float f_min = min_and_max.first; diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h index 1363c7e325b..3b09ea2527d 100644 --- a/tensorflow/core/kernels/quantize_and_dequantize_op.h +++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h @@ -71,7 +71,8 @@ struct QuantizeAndDequantizeOneScaleImpl { out.device(d) = ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) * scale + - T(0.5)).floor() * + T(0.5)) + .floor() * inverse_scale + min_range; } else { diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc index d2cc55a94dd..57982bdf76e 100644 --- a/tensorflow/core/kernels/quantize_op_test.cc +++ b/tensorflow/core/kernels/quantize_op_test.cc @@ -250,7 +250,8 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) { Tensor expected(allocator(), DT_QINT32, TensorShape({element_count})); test::FillValues(&expected, { - std::numeric_limits::min(), 0, + std::numeric_limits::min(), + 0, static_cast(1.0f * (1 << 23)), static_cast(1.25f * (1 << 23)), static_cast(1.75f * (1 << 23)), diff --git a/tensorflow/core/kernels/quantized_batch_norm_op.cc b/tensorflow/core/kernels/quantized_batch_norm_op.cc index 18d83b41494..b03da7ad17f 100644 --- a/tensorflow/core/kernels/quantized_batch_norm_op.cc +++ b/tensorflow/core/kernels/quantized_batch_norm_op.cc @@ -16,11 +16,11 @@ limitations under the License. 
#define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/quantization_utils.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc index d67f1ab3ec2..b03ac8e87da 100644 --- a/tensorflow/core/kernels/quantized_concat_op.cc +++ b/tensorflow/core/kernels/quantized_concat_op.cc @@ -135,8 +135,8 @@ class QuantizedConcatOp : public OpKernel { context, in.dims() == input_dims || (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. shape[", i, + "] = ", in.shape().DebugString())); for (int j = 0; j < input_dims; ++j) { if (j == concat_dim) { continue; @@ -145,8 +145,8 @@ class QuantizedConcatOp : public OpKernel { context, in.dim_size(j) == input_shape.dim_size(j), errors::InvalidArgument( "ConcatOp : Dimensions of inputs should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - in.shape().DebugString())); + input_shape.DebugString(), " vs. shape[", i, + "] = ", in.shape().DebugString())); } if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc index 1921b83d12c..5b3570edff5 100644 --- a/tensorflow/core/kernels/quantized_conv_ops.cc +++ b/tensorflow/core/kernels/quantized_conv_ops.cc @@ -278,10 +278,9 @@ class Im2ColConvFunctor { *resource = new Im2ColBufferResource(); return Status::OK(); }; - OP_REQUIRES_OK( - context, - context->resource_manager()->LookupOrCreate( - "Conv2d", "im2col_buffer", &im2col_buffer_resource, creator)); + OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate( + "Conv2d", "im2col_buffer", + &im2col_buffer_resource, creator)); // This means that multiple ops can't be run simultaneously on different // threads, because we have a single shared resource. 
The platforms this is // aimed at have intra-op parallelism as their focus though, so it shouldn't diff --git a/tensorflow/core/kernels/quantized_instance_norm.cc b/tensorflow/core/kernels/quantized_instance_norm.cc index c29f534f31b..d62094cc9fa 100644 --- a/tensorflow/core/kernels/quantized_instance_norm.cc +++ b/tensorflow/core/kernels/quantized_instance_norm.cc @@ -278,10 +278,10 @@ class QuantizedInstanceNorm : public OpKernel { float input_max = context->input(2).flat()(0); float input_scale = (input_max - input_min) / 255.0f; - OP_REQUIRES( - context, input_min < input_max, - errors::InvalidArgument("input_min must be less than input_max : ", - input_min, " >= ", input_max)); + OP_REQUIRES(context, input_min < input_max, + errors::InvalidArgument( + "input_min must be less than input_max : ", input_min, + " >= ", input_max)); auto input_tensor = input.tensor(); auto N = input_tensor.dimension(0); diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc index afb30d5f627..da8c46dc516 100644 --- a/tensorflow/core/kernels/quantized_matmul_op.cc +++ b/tensorflow/core/kernels/quantized_matmul_op.cc @@ -104,9 +104,9 @@ class QuantizedMatMulOp : public OpKernel { OP_REQUIRES(context, a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second), - errors::InvalidArgument("Matrix size-compatible: In[0]: ", - a.shape().DebugString(), ", In[1]: ", - b.shape().DebugString())); + errors::InvalidArgument( + "Matrix size-compatible: In[0]: ", a.shape().DebugString(), + ", In[1]: ", b.shape().DebugString())); OP_REQUIRES(context, ((shift_c >= 0) && (shift_c <= 31)), errors::InvalidArgument("shift_c must be between 0 and 31, " diff --git a/tensorflow/core/kernels/quantized_matmul_op_test.cc b/tensorflow/core/kernels/quantized_matmul_op_test.cc index 535b5115c34..c9f05dbc10b 100644 --- a/tensorflow/core/kernels/quantized_matmul_op_test.cc +++ b/tensorflow/core/kernels/quantized_matmul_op_test.cc @@ -206,17 +206,32 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) { // We have set the transpose_a flag to true, so the matrix is transposed, and // for filling the values the in-memory storage order is effectively // column major, rather than the default row-major. 
- AddInputFromArray(TensorShape({a_rows, a_cols}), - { - 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - }); + AddInputFromArray(TensorShape({a_rows, a_cols}), { + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + }); // The B matrix is: // | 1 | 4| // | 2 | 5| // | 3 | 6| AddInputFromArray(TensorShape({b_rows, b_cols}), { - 1, 4, 2, 5, 3, 6, + 1, + 4, + 2, + 5, + 3, + 6, }); AddInputFromArray(TensorShape({1}), {-12.0f}); AddInputFromArray(TensorShape({1}), {243.0f}); @@ -238,10 +253,16 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) { // | -50 | -113 | // | -56 | -128 | Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols})); - test::FillValues(&expected, - { - -38, -83, -44, -98, -50, -113, -56, -128, - }); + test::FillValues(&expected, { + -38, + -83, + -44, + -98, + -50, + -113, + -56, + -128, + }); test::ExpectTensorEqual(expected, *GetOutput(0)); } diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc index eaa5e667f7d..3c7536e0373 100644 --- a/tensorflow/core/kernels/quantized_mul_op.cc +++ b/tensorflow/core/kernels/quantized_mul_op.cc @@ -298,9 +298,8 @@ class QuantizedMulOp : public OpKernel { return; } Tensor* z; - OP_REQUIRES_OK( - context, - context->allocate_output(0, BCast::ToShape(bcast.output_shape()), &z)); + OP_REQUIRES_OK(context, context->allocate_output( + 0, BCast::ToShape(bcast.output_shape()), &z)); // Make sure that we have valid quantization ranges for the input buffers. // If the difference between the min and max is negative or zero, it makes diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc index b0550c8260c..a4e407c7a94 100644 --- a/tensorflow/core/kernels/quantized_mul_op_test.cc +++ b/tensorflow/core/kernels/quantized_mul_op_test.cc @@ -188,11 +188,12 @@ void TestManualScalar() { 10.0f, {1}, {10.0f}, -100.0f, 100.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f}, 3.0f); - TestMul({1}, {10.0f}, -100.0f, 100.0f, {10}, - {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f, - 10.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, - 90.0f, 100.0f}, - 3.0f); + TestMul( + {1}, {10.0f}, -100.0f, 100.0f, {10}, + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f, + 10.0f, {10}, + {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f}, + 3.0f); } void TestScalar() { diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc index 330d161c32b..de495c19cba 100644 --- a/tensorflow/core/kernels/queue_base.cc +++ b/tensorflow/core/kernels/queue_base.cc @@ -39,8 +39,8 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element, return errors::Internal( "HandleSliceToElement Cannot copy slice: number of elements does not " "match. 
Shapes are: [element]: ", - element->shape().DebugString(), ", [parent slice]: ", - chip_shape.DebugString()); + element->shape().DebugString(), + ", [parent slice]: ", chip_shape.DebugString()); } auto parent_as_matrix = parent.flat_outer_dims(); element->flat() = parent_as_matrix.chip(index, 0); diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc index 17831b74370..46a02854d73 100644 --- a/tensorflow/core/kernels/queue_ops.cc +++ b/tensorflow/core/kernels/queue_ops.cc @@ -428,13 +428,14 @@ REGISTER_KERNEL_BUILDER(Name("QueueSizeV2").Device(DEVICE_CPU), QueueSizeOp); class QueueIsClosedOp : public QueueOpKernel { public: explicit QueueIsClosedOp(OpKernelConstruction* context) - : QueueOpKernel(context) {} + : QueueOpKernel(context) {} protected: void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, DoneCallback callback) override { Tensor* Tqueue_is_closed = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed)); + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed)); Tqueue_is_closed->flat().setConstant(queue->is_closed()); callback(); } @@ -443,8 +444,10 @@ class QueueIsClosedOp : public QueueOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp); }; -REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), QueueIsClosedOp); -REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), QueueIsClosedOp); +REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), + QueueIsClosedOp); +REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), + QueueIsClosedOp); class FakeQueueOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc index ba94d6be5ca..554909760aa 100644 --- a/tensorflow/core/kernels/random_crop_op.cc +++ b/tensorflow/core/kernels/random_crop_op.cc @@ -68,10 +68,10 @@ class RandomCropOp : public OpKernel { // Edge case. The target dimensions are larger then the image, so // zero-pad the image. This guarantees that the image will *always* // be [target_height, target_width] in size. 
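The hunk that follows only re-wraps the precondition messages; the contract they enforce is that a random crop needs width >= target_width and height >= target_height, after which the crop origin can be drawn uniformly from the remaining margin. A minimal sketch of that offset choice, assuming a std::mt19937 generator rather than the kernel's own RNG:

#include <cstdint>
#include <random>

int main() {
  std::mt19937 rng(42);
  const int64_t width = 640, target_width = 512;  // hypothetical sizes
  std::uniform_int_distribution<int64_t> dist(0, width - target_width);
  const int64_t offset_x = dist(rng);  // left edge of the crop window, in [0, 128]
  (void)offset_x;
  return 0;
}
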
- OP_REQUIRES( - context, width >= target_width, - errors::FailedPrecondition("width must be >= target_width: width = ", - width, ", target_width = ", target_width)); + OP_REQUIRES(context, width >= target_width, + errors::FailedPrecondition( + "width must be >= target_width: width = ", width, + ", target_width = ", target_width)); OP_REQUIRES(context, height >= target_height, errors::FailedPrecondition( "height must be >= target_height: height = ", height, diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc index 55a8b9c9b67..78ff7948fbf 100644 --- a/tensorflow/core/kernels/random_op.cc +++ b/tensorflow/core/kernels/random_op.cc @@ -50,7 +50,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace functor { using random::PhiloxRandom; @@ -271,9 +271,10 @@ class RandomGammaOp : public OpKernel { const Tensor& shape_t = ctx->input(0); const Tensor& alpha_t = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(shape_t.shape()) && - (shape_t.dtype() == DataType::DT_INT32 || - shape_t.dtype() == DataType::DT_INT64), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(shape_t.shape()) && + (shape_t.dtype() == DataType::DT_INT32 || + shape_t.dtype() == DataType::DT_INT64), errors::InvalidArgument( "shape must be a vector of {int32,int64}, got shape: ", shape_t.DebugString())); @@ -325,7 +326,7 @@ class RandomGammaOp : public OpKernel { // avoid a couple flops which can be done on a per-alpha basis. auto DoWork = [num_samples, num_alphas, &rng, samples_flat, alpha_flat]( - int start_output, int limit_output) { + int start_output, int limit_output) { using Eigen::numext::exp; using Eigen::numext::log; using Eigen::numext::pow; @@ -448,40 +449,40 @@ class RandomGammaOp : public OpKernel { } // namespace -#define REGISTER(TYPE) \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, random::UniformDistribution >; \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, random::NormalDistribution >; \ - template struct functor::FillPhiloxRandom< \ - CPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >; \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_CPU) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - CPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint("T"), \ +#define REGISTER(TYPE) \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, random::UniformDistribution>; \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, random::NormalDistribution>; \ + template struct functor::FillPhiloxRandom< \ + CPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>; \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_CPU) \ + 
.HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + CPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint("T"), \ RandomGammaOp) #define REGISTER_INT(IntType) \ @@ -504,33 +505,33 @@ TF_CALL_int64(REGISTER_INT); #if GOOGLE_CUDA -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_GPU) \ - .HostMemory("shape") \ - .TypeConstraint("T") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - GPUDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + GPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); #define REGISTER_INT(IntType) \ REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ @@ -565,13 +566,12 @@ struct FillPhiloxRandomKernel; template struct FillPhiloxRandomKernel { typedef typename Distribution::ResultElementType T; - using write_accessor = sycl::accessor; + using write_accessor = sycl::accessor; - FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist) - : data_(data), - gen_(gen), - dist_(dist) { - } + FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, + Distribution& dist) + : data_(data), gen_(gen), dist_(dist) {} void operator()(sycl::nd_item<1> item) { const size_t kGroupSize = Distribution::kResultElementCount; @@ -597,7 +597,7 @@ struct FillPhiloxRandomKernel { const typename Distribution::ResultType samples = dist_(&gen_); for (size_t i = 0; i < kGroupSize; ++i) { if (offset >= size) { - return; + return; } data[offset] = samples[i]; ++offset; @@ -610,17 +610,15 @@ struct FillPhiloxRandomKernel { Distribution dist_; }; - template struct FillPhiloxRandomKernel { typedef typename Distribution::ResultElementType T; - using write_accessor = sycl::accessor; + using write_accessor = sycl::accessor; - FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist) - : data_(data), - gen_(gen), - dist_(dist) { - } + FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, + Distribution& dist) + : data_(data), gen_(gen), dist_(dist) {} void operator()(sycl::nd_item<1> item) { using random::PhiloxRandom; @@ -628,9 +626,9 @@ struct FillPhiloxRandomKernel { const size_t kReservedSamplesPerOutput = 256; const size_t kGroupSize = 
Distribution::kResultElementCount; - const size_t kGeneratorSkipPerOutputGroup = kGroupSize * - kReservedSamplesPerOutput / - PhiloxRandom::kResultElementCount; + const size_t kGeneratorSkipPerOutputGroup = + kGroupSize * kReservedSamplesPerOutput / + PhiloxRandom::kResultElementCount; const size_t item_id = item.get_global(0); const size_t total_item_count = item.get_global_range(); @@ -674,10 +672,9 @@ class FillRandomKernel; // It splits the work into several tasks and run them in parallel template void FillPhiloxRandom::operator()( - OpKernelContext* context, const SYCLDevice& device, random::PhiloxRandom gen, - typename Distribution::ResultElementType* data, int64 size, - Distribution dist) { - + OpKernelContext* context, const SYCLDevice& device, + random::PhiloxRandom gen, typename Distribution::ResultElementType* data, + int64 size, Distribution dist) { const size_t group_size = device.maxSyclThreadsPerBlock(); const size_t group_count = (size + group_size - 1) / group_size; @@ -686,50 +683,52 @@ void FillPhiloxRandom::operator()( device.sycl_queue().submit([&](sycl::handler& cgh) { auto access = buffer.template get_access(cgh); - FillPhiloxRandomKernel task(access, gen, dist); + FillPhiloxRandomKernel + task(access, gen, dist); cgh.parallel_for>( - sycl::nd_range<1>(sycl::range<1>(group_count * group_size), sycl::range<1>(group_size)), - task - ); + sycl::nd_range<1>(sycl::range<1>(group_count * group_size), + sycl::range<1>(group_size)), + task); }); } -} +} // namespace functor -#define REGISTER(TYPE) \ - template struct functor::FillPhiloxRandom< \ - SYCLDevice, random::UniformDistribution >; \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomUniform") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("RandomStandardNormal") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp >); \ - REGISTER_KERNEL_BUILDER( \ - Name("TruncatedNormal") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .TypeConstraint("dtype"), \ - PhiloxRandomOp< \ - SYCLDevice, \ - random::TruncatedNormalDistribution< \ - random::SingleSampleAdapter, TYPE> >); +#define REGISTER(TYPE) \ + template struct functor::FillPhiloxRandom< \ + SYCLDevice, random::UniformDistribution>; \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp>); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .TypeConstraint("dtype"), \ + PhiloxRandomOp< \ + SYCLDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter, TYPE>>); -#define REGISTER_INT(IntType) \ - REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ - .Device(DEVICE_SYCL) \ - .HostMemory("shape") \ - .HostMemory("minval") \ - .HostMemory("maxval") \ - .TypeConstraint("Tout"), \ +#define REGISTER_INT(IntType) \ + REGISTER_KERNEL_BUILDER(Name("RandomUniformInt") \ + .Device(DEVICE_SYCL) \ + .HostMemory("shape") \ + .HostMemory("minval") \ + .HostMemory("maxval") \ + .TypeConstraint("Tout"), \ RandomUniformIntOp); TF_CALL_float(REGISTER); @@ -740,6 +739,6 @@ TF_CALL_int64(REGISTER_INT); #undef REGISTER #undef REGISTER_INT -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace 
tensorflow diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc index 7afa6974c6a..3393b39faf4 100644 --- a/tensorflow/core/kernels/random_op_gpu.cu.cc +++ b/tensorflow/core/kernels/random_op_gpu.cu.cc @@ -222,9 +222,8 @@ void FillPhiloxRandom::operator()( (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) / block_size; - FillPhiloxRandomKernelLaunch< - Distribution><<>>(gen, data, size, - dist); + FillPhiloxRandomKernelLaunch + <<>>(gen, data, size, dist); }; // Explicit instantiation of the GPU distributions functors diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc index bf1d83ec751..64fb4a5c228 100644 --- a/tensorflow/core/kernels/random_poisson_op.cc +++ b/tensorflow/core/kernels/random_poisson_op.cc @@ -103,7 +103,7 @@ struct PoissonFunctor { typedef random::UniformDistribution Uniform; auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat]( - int start_output, int limit_output) { + int start_output, int limit_output) { // Capturing "rng" by value would only make a copy for the _shared_ // lambda. Since we want to let each worker have its own copy, we pass // "rng" by reference and explicitly do a copy assignment. diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc index e9695cfde30..87fc9433316 100644 --- a/tensorflow/core/kernels/random_shuffle_queue_op.cc +++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc @@ -334,96 +334,95 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx, // TODO(josh11b): This makes two copies of callback, avoid this if possible. dequeue_attempts_.emplace_back( num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token, - [callback, allow_small_batch, this](Attempt* attempt) - EXCLUSIVE_LOCKS_REQUIRED(mu_) { - int32 queue_size = queues_[0].size(); - if (closed_ && queue_size < attempt->elements_requested) { - // If we don't have enough for a full dequeue, we have - // to reset the attempt tuple. - if (!attempt->tuple.empty()) { - // Restore already-dequeued elements to the queue. - for (int64 i = attempt->tuple[0].dim_size(0) - - attempt->elements_requested - 1; - i >= 0; --i) { - for (int j = 0; j < num_components(); ++j) { - PersistentTensor element; - Status s = GetElementComponentFromBatch( - attempt->tuple, i, j, attempt->context, &element); - if (!s.ok()) { - attempt->context->SetStatus( - errors::DataLoss("Failed to restore element from " - "partially-dequeued batch " - "to RandomShuffleQueue: ", - s.error_message())); - } - queues_[j].push_back(element); - } - } - } - if (allow_small_batch && !queues_[0].empty()) { - // Request all remaining elements in the queue. - queue_size = queues_[0].size(); - attempt->tuple.clear(); - attempt->elements_requested = queue_size; - } else { - if (allow_small_batch) { - // There may be some other attempts containing - // values. If so, we'll yield and wait for them - // to add elements to the queue. 
- if (!enqueue_attempts_.empty()) return kProgress; - } - if (attempt->context->status().ok()) { - attempt->context->SetStatus(errors::OutOfRange( - "RandomShuffleQueue '", name_, "' is closed and has ", - "insufficient elements (requested ", - attempt->elements_requested, ", current size ", - queue_size, ")")); - } - return kComplete; - } - } - - RunResult result = kNoProgress; - if (!closed_) queue_size -= min_after_dequeue_; - for (; queue_size > 0; --queue_size) { - if (attempt->tuple.empty()) { - // Only allocate tuple when we have something to dequeue - // so we don't use excessive memory when there are many - // blocked dequeue attempts waiting. - attempt->tuple.reserve(num_components()); - for (int i = 0; i < num_components(); ++i) { - const TensorShape shape = - ManyOutShape(i, attempt->elements_requested); - Tensor element; + [callback, allow_small_batch, + this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int32 queue_size = queues_[0].size(); + if (closed_ && queue_size < attempt->elements_requested) { + // If we don't have enough for a full dequeue, we have + // to reset the attempt tuple. + if (!attempt->tuple.empty()) { + // Restore already-dequeued elements to the queue. + for (int64 i = attempt->tuple[0].dim_size(0) - + attempt->elements_requested - 1; + i >= 0; --i) { + for (int j = 0; j < num_components(); ++j) { + PersistentTensor element; + Status s = GetElementComponentFromBatch( + attempt->tuple, i, j, attempt->context, &element); + if (!s.ok()) { attempt->context->SetStatus( - attempt->context->allocate_temp(component_dtypes_[i], - shape, &element)); - if (!attempt->context->status().ok()) return kComplete; - attempt->tuple.emplace_back(element); + errors::DataLoss("Failed to restore element from " + "partially-dequeued batch " + "to RandomShuffleQueue: ", + s.error_message())); } - } - result = kProgress; - Tuple tuple; - DequeueLocked(attempt->context, &tuple); - const int index = attempt->tuple[0].dim_size(0) - - attempt->elements_requested; - for (int i = 0; i < num_components(); ++i) { - attempt->context->SetStatus(batch_util::CopyElementToSlice( - std::move(tuple[i]), &attempt->tuple[i], index)); - if (!attempt->context->status().ok()) return kComplete; - } - tuple.clear(); - --attempt->elements_requested; - if (attempt->elements_requested == 0) { - tuple = attempt->tuple; - attempt->done_callback = [callback, tuple]() { - callback(tuple); - }; - return kComplete; + queues_[j].push_back(element); } } - return result; - }); + } + if (allow_small_batch && !queues_[0].empty()) { + // Request all remaining elements in the queue. + queue_size = queues_[0].size(); + attempt->tuple.clear(); + attempt->elements_requested = queue_size; + } else { + if (allow_small_batch) { + // There may be some other attempts containing + // values. If so, we'll yield and wait for them + // to add elements to the queue. + if (!enqueue_attempts_.empty()) return kProgress; + } + if (attempt->context->status().ok()) { + attempt->context->SetStatus(errors::OutOfRange( + "RandomShuffleQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", + attempt->elements_requested, ", current size ", + queue_size, ")")); + } + return kComplete; + } + } + + RunResult result = kNoProgress; + if (!closed_) queue_size -= min_after_dequeue_; + for (; queue_size > 0; --queue_size) { + if (attempt->tuple.empty()) { + // Only allocate tuple when we have something to dequeue + // so we don't use excessive memory when there are many + // blocked dequeue attempts waiting. 
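The re-indented block below keeps the original behaviour: the attempt's output tuple is only allocated once at least one element can actually be dequeued, so blocked dequeue attempts do not hold large buffers. A minimal sketch of that lazy-allocation idea, with hypothetical stand-in types rather than the queue's PersistentTensor machinery:

#include <vector>

int main() {
  std::vector<float> tuple;                // stands in for attempt->tuple
  const int available = 3, requested = 8;  // fewer elements ready than requested
  for (int i = 0; i < available; ++i) {
    if (tuple.empty()) tuple.reserve(requested);  // allocate only once progress is made
    tuple.push_back(static_cast<float>(i));
  }
  return 0;
}
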
+ attempt->tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + const TensorShape shape = + ManyOutShape(i, attempt->elements_requested); + Tensor element; + attempt->context->SetStatus(attempt->context->allocate_temp( + component_dtypes_[i], shape, &element)); + if (!attempt->context->status().ok()) return kComplete; + attempt->tuple.emplace_back(element); + } + } + result = kProgress; + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + const int index = + attempt->tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + attempt->context->SetStatus(batch_util::CopyElementToSlice( + std::move(tuple[i]), &attempt->tuple[i], index)); + if (!attempt->context->status().ok()) return kComplete; + } + tuple.clear(); + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + tuple = attempt->tuple; + attempt->done_callback = [callback, tuple]() { + callback(tuple); + }; + return kComplete; + } + } + return result; + }); } } if (!already_cancelled) { diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h index 36ca7f834f7..15ae4c1fc53 100644 --- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h +++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -312,8 +312,7 @@ __global__ void ColumnReduceKernel( int col = blockIdx.x * 32 + threadIdx.x; value_type sum = initVal; - if (row < num_rows && col < num_cols) - sum = in[row * num_cols + col]; + if (row < num_rows && col < num_cols) sum = in[row * num_cols + col]; // 1D array necessary due to bug in CUDA 9 compiler. // TODO(nluehr) revert to 2D array when compiler is ready. @@ -366,8 +365,7 @@ __global__ void CleanupSegments( const int tid = threadIdx.x + blockIdx.x * blockDim.x; value_type val = initVal; - if (tid < segment_size * num_cols) - val = partial_sums[tid]; + if (tid < segment_size * num_cols) val = partial_sums[tid]; typedef cub::WarpReduce WarpReduce; diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc index afad288cc00..d52358737fd 100644 --- a/tensorflow/core/kernels/relu_op.cc +++ b/tensorflow/core/kernels/relu_op.cc @@ -31,7 +31,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_RELU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -113,8 +113,7 @@ namespace functor { \ template <> \ void Selu::operator()( \ - const GPUDevice& d, \ - typename TTypes::ConstTensor features, \ + const GPUDevice& d, typename TTypes::ConstTensor features, \ typename TTypes::Tensor activations); \ extern template struct Selu; \ \ @@ -125,8 +124,6 @@ namespace functor { typename TTypes::Tensor backprops); \ extern template struct SeluGrad; - - TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); } // namespace functor @@ -157,8 +154,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ SeluGradOp) - - TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS @@ -192,10 +187,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint("T"), \ SeluGradOp) - - TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS); #undef REGISTER_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/relu_op_functor.h 
b/tensorflow/core/kernels/relu_op_functor.h index 24b789c5437..3bc5ba8a50d 100644 --- a/tensorflow/core/kernels/relu_op_functor.h +++ b/tensorflow/core/kernels/relu_op_functor.h @@ -85,10 +85,9 @@ struct Relu6Grad { // make sure not to propagate the associated gradient // value. This allows "features" to be either the input or the output of // the relu6. - backprops.device(d) = - gradients * - ((features > static_cast(0)) * (features < static_cast(6))) - .template cast(); + backprops.device(d) = gradients * ((features > static_cast(0)) * + (features < static_cast(6))) + .template cast(); } }; @@ -161,8 +160,8 @@ struct SeluGrad { const auto scale = static_cast(1.0507009873554804934193349852946); const auto scale_alpha = static_cast(1.7580993408473768599402175208123); backprops.device(d) = - (activations < static_cast(0)).select( - gradients * (activations + scale_alpha), gradients * scale); + (activations < static_cast(0)) + .select(gradients * (activations + scale_alpha), gradients * scale); } }; diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc index 1a9cf4c6406..86e61bbcefc 100644 --- a/tensorflow/core/kernels/resize_bicubic_op.cc +++ b/tensorflow/core/kernels/resize_bicubic_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -28,7 +29,6 @@ limitations under the License. #include "tensorflow/core/kernels/image_resizer_state.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace { diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc index 9e10fec4232..25a37d5e1af 100644 --- a/tensorflow/core/kernels/resize_bicubic_op_test.cc +++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc @@ -286,13 +286,14 @@ BM_ResizeBicubicDev(32, 128, 3); BM_ResizeBicubicDev(32, 512, 3); BM_ResizeBicubicDev(32, 1024, 3); -#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS) \ - static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \ - testing::ItemsProcessed(static_cast(iters) * BATCH * SIZE * SIZE * \ - CHANNELS * 8 * 8); \ - test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8)) \ - .Run(iters); \ - } \ +#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS) \ + static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS( \ + int iters) { \ + testing::ItemsProcessed(static_cast(iters) * BATCH * SIZE * SIZE * \ + CHANNELS * 8 * 8); \ + test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8)) \ + .Run(iters); \ + } \ BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS); BM_ResizeBicubicExpand(12, 48, 1); diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc index a7da7a0777d..f82c3fcd9ff 100644 --- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc +++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc @@ -164,11 +164,11 @@ struct ResizeBilinear { if (total_count == 0) return; CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); - ResizeBilinearKernel< - T><<>>( - config.virtual_thread_count, images.data(), height_scale, width_scale, - batch, in_height, in_width, channels, 
out_height, out_width, - output.data()); + ResizeBilinearKernel + <<>>( + config.virtual_thread_count, images.data(), height_scale, + width_scale, batch, in_height, in_width, channels, out_height, + out_width, output.data()); } }; @@ -200,11 +200,11 @@ struct ResizeBilinearGrad { // Accumulate. total_count = batch * resized_height * resized_width * channels; config = GetCudaLaunchConfig(total_count, d); - ResizeBilinearGradKernel< - T><<>>( - config.virtual_thread_count, input_grad.data(), height_scale, - width_scale, batch, original_height, original_width, channels, - resized_height, resized_width, output_grad.data()); + ResizeBilinearGradKernel + <<>>( + config.virtual_thread_count, input_grad.data(), height_scale, + width_scale, batch, original_height, original_width, channels, + resized_height, resized_width, output_grad.data()); } }; diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index 9cc8e03e3ac..5b4aad3cdd8 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -387,7 +387,6 @@ class AssignVariableOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER_KERNELS); TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); -TF_CALL_variant(REGISTER_KERNELS); #undef REGISTER_KERNELS #if GOOGLE_CUDA @@ -635,6 +634,9 @@ class ResourceScatterUpdateOp : public OpKernel { TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU); +REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate", + scatter_op::UpdateOp::ASSIGN); + // Registers GPU kernels. #if GOOGLE_CUDA #define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \ diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc index 8f82784d936..bb96c42f10c 100644 --- a/tensorflow/core/kernels/reverse_op.cc +++ b/tensorflow/core/kernels/reverse_op.cc @@ -269,10 +269,10 @@ class ReverseV2Op : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &output)); -// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of -// a single dimension to the dims=3 or dims=2 case, regardless of the number -// of dimensions in the tensor. This would let some ops use faster -// lower-dimension code (and use optimized versions). + // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse + // of a single dimension to the dims=3 or dims=2 case, regardless of the + // number of dimensions in the tensor. This would let some ops use faster + // lower-dimension code (and use optimized versions). 
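The TODO above refers to dimension folding: reversing one axis of an N-D tensor touches the same elements as reshaping it to (outer, axis, inner) and reversing the middle axis of that 3-D view. A minimal index-level sketch of the folding, with hypothetical shapes:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t dims[4] = {2, 3, 4, 5};  // reverse axis 2 of a (2, 3, 4, 5) tensor
  const int axis = 2;
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; ++i) outer *= dims[i];      // product of leading dims: 6
  for (int i = axis + 1; i < 4; ++i) inner *= dims[i];  // product of trailing dims: 5
  // The folded view has shape (outer, dims[axis], inner) = (6, 4, 5); reversing its
  // middle axis is the dims=3 case the TODO mentions.
  assert(outer == 6 && inner == 5);
  return 0;
}
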
#define HANDLE_REVERSE(NDIMS) \ case NDIMS: \ diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc index b05a7c55504..3ee49db669f 100644 --- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc +++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc @@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice; #define DEFINE_REVERSE(T, DIM) \ template struct functor::Reverse; #define DEFINE_REVERSE_ALL_DIMS(T) \ - DEFINE_REVERSE(T, 0) \ - DEFINE_REVERSE(T, 1) \ - DEFINE_REVERSE(T, 2) \ - DEFINE_REVERSE(T, 3) \ - DEFINE_REVERSE(T, 4) \ - DEFINE_REVERSE(T, 5) \ - DEFINE_REVERSE(T, 6) \ - DEFINE_REVERSE(T, 7) \ + DEFINE_REVERSE(T, 0) \ + DEFINE_REVERSE(T, 1) \ + DEFINE_REVERSE(T, 2) \ + DEFINE_REVERSE(T, 3) \ + DEFINE_REVERSE(T, 4) \ + DEFINE_REVERSE(T, 5) \ + DEFINE_REVERSE(T, 6) \ + DEFINE_REVERSE(T, 7) \ DEFINE_REVERSE(T, 8) TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS); diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index d1980d4b652..15a707a9c66 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -51,8 +51,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { // Copy seq_len info down for validity checks context->eigen_device().memcpyDeviceToHost( - seq_lens_vec.data(), seq_lens_t.data(), - sizeof(Tlen) * seq_lens_t.size()); + seq_lens_vec.data(), seq_lens_t.data(), sizeof(Tlen) * seq_lens_t.size()); OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); @@ -76,8 +75,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { } } -void CheckErrorsGPU(OpKernelContext* context, int batch_dim, - int seq_dim) { +void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); const Tensor& seq_lens = context->input(1); @@ -98,13 +96,13 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim, template <> void CheckErrors(OpKernelContext* context, int batch_dim, - int seq_dim) { + int seq_dim) { CheckErrorsGPU(context, batch_dim, seq_dim); } template <> void CheckErrors(OpKernelContext* context, int batch_dim, - int seq_dim) { + int seq_dim) { CheckErrorsGPU(context, batch_dim, seq_dim); } @@ -164,14 +162,15 @@ class ReverseSequenceOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp); }; -#define REGISTER_REVERSE_SEQUENCE(type, len_type) \ - REGISTER_KERNEL_BUILDER( \ - Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint("T"). \ - TypeConstraint("Tlen"), \ - ReverseSequenceOp); +#define REGISTER_REVERSE_SEQUENCE(type, len_type) \ + REGISTER_KERNEL_BUILDER(Name("ReverseSequence") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tlen"), \ + ReverseSequenceOp); -#define REGISTER_REVERSE_SEQUENCE_LEN(type) \ - REGISTER_REVERSE_SEQUENCE(type, int32); \ +#define REGISTER_REVERSE_SEQUENCE_LEN(type) \ + REGISTER_REVERSE_SEQUENCE(type, int32); \ REGISTER_REVERSE_SEQUENCE(type, int64); TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN); @@ -181,23 +180,23 @@ TF_CALL_bool(REGISTER_REVERSE_SEQUENCE_LEN); // Forward declarations of the functor specializations for GPU. 
namespace functor { -#define DECLARE_GPU_SPEC(T, Tlen, Dims) \ - template <> \ - void ReverseSequence::Compute( \ - const GPUDevice& d, typename TTypes::ConstTensor input, \ - int32 batch_dim, int32 seq_dim, \ - typename TTypes::ConstVec seq_lens, \ - typename TTypes::Tensor output); \ +#define DECLARE_GPU_SPEC(T, Tlen, Dims) \ + template <> \ + void ReverseSequence::Compute( \ + const GPUDevice& d, typename TTypes::ConstTensor input, \ + int32 batch_dim, int32 seq_dim, \ + typename TTypes::ConstVec seq_lens, \ + typename TTypes::Tensor output); \ extern template struct ReverseSequence; -#define DECLARE_GPU_SPEC_LEN(T, Dims) \ - DECLARE_GPU_SPEC(T, int32, Dims); \ +#define DECLARE_GPU_SPEC_LEN(T, Dims) \ + DECLARE_GPU_SPEC(T, int32, Dims); \ DECLARE_GPU_SPEC(T, int64, Dims); -#define DECLARE_GPU_SPECS(T) \ - DECLARE_GPU_SPEC_LEN(T, 2); \ - DECLARE_GPU_SPEC_LEN(T, 3); \ - DECLARE_GPU_SPEC_LEN(T, 4); \ +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC_LEN(T, 2); \ + DECLARE_GPU_SPEC_LEN(T, 3); \ + DECLARE_GPU_SPEC_LEN(T, 4); \ DECLARE_GPU_SPEC_LEN(T, 5); TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); @@ -206,14 +205,15 @@ TF_CALL_bool(DECLARE_GPU_SPECS); } // namespace functor // Registration of the GPU implementations. -#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type) \ - REGISTER_KERNEL_BUILDER( \ - Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint("T"). \ - TypeConstraint("Tlen"), \ - ReverseSequenceOp); +#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type) \ + REGISTER_KERNEL_BUILDER(Name("ReverseSequence") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tlen"), \ + ReverseSequenceOp); -#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \ - REGISTER_REVERSE_SEQUENCE_GPU(type, int32); \ +#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \ + REGISTER_REVERSE_SEQUENCE_GPU(type, int32); \ REGISTER_REVERSE_SEQUENCE_GPU(type, int64); TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU_LEN); diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc index cb49f14525a..4a2136a2cd3 100644 --- a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc +++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc @@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice; template class generator::ReverseGenerator; \ template struct functor::ReverseSequence; -#define DEFINE_GPU_SPEC_LEN(T, dims) \ - DEFINE_GPU_SPEC(T, int32, dims); \ +#define DEFINE_GPU_SPEC_LEN(T, dims) \ + DEFINE_GPU_SPEC(T, int32, dims); \ DEFINE_GPU_SPEC(T, int64, dims); -#define DEFINE_GPU_SPECS(T) \ - DEFINE_GPU_SPEC_LEN(T, 2); \ - DEFINE_GPU_SPEC_LEN(T, 3); \ - DEFINE_GPU_SPEC_LEN(T, 4); \ +#define DEFINE_GPU_SPECS(T) \ + DEFINE_GPU_SPEC_LEN(T, 2); \ + DEFINE_GPU_SPEC_LEN(T, 3); \ + DEFINE_GPU_SPEC_LEN(T, 4); \ DEFINE_GPU_SPEC_LEN(T, 5); TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc new file mode 100644 index 00000000000..bcbdbee058b --- /dev/null +++ b/tensorflow/core/kernels/roll_op.cc @@ -0,0 +1,334 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/register_types_traits.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +#define EIGEN_USE_THREADS +using CPUDevice = Eigen::ThreadPoolDevice; + +// dim_size - the size of each dimension +// dim_range - the number of indices over in the flattened tensor +// you need to skip in order to make it over from one side of a dimension +// to the other. Used to make the shifts wrap around after a threshold. +// threshold - the index for each dimension that the roll starts to wrap +// back to the front +template +void DoRoll(OpKernelContext* context, const int64 num_elements, + const int num_dims, const gtl::ArraySlice& dim_size, + const T* input, T* output, const gtl::ArraySlice& threshold, + const gtl::ArraySlice& dim_range) { + auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range]( + int64 start, int64 end) { + // array of indices for each dimension + gtl::InlinedVector indices(num_dims); + int offset = 0; // the shift along the flattened tensor for current element + // initialize indices and offset + for (int i = 0; i < num_dims; i++) { + // stride is the number of indices over in the flattened tensor + // you need to skip in order to make it over to an adjacent element + // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1) + const int64 stride = dim_range[i] / dim_size[i]; + const int shift = dim_size[i] - threshold[i]; + const int indx = (start / stride) % dim_size[i]; + indices[i] = indx; + // calculate dimension index after the shift + const int shifted_indx = (indx + shift) % dim_size[i]; + offset += (shifted_indx - indx) * stride; + } + + for (int64 i = start; i < end; i++) { + output[i + offset] = input[i]; + // create next combination of indices + // while at it adjust offset if needed + for (int j = num_dims - 1; j >= 0; j--) { + const int indx = (indices[j] + 1) % dim_size[j]; + indices[j] = indx; + if (indx != 0) { + if (indx == threshold[j]) { // we've reached the threshold + // dim_range[j] = threshold[j] + shift[j] + // offset = shift[j] + ... other offsets + // offset - dim_range[j] = -threshold[j] + ... 
other offsets + // thus we undo our previous offset as well as add a new offset of + // -threshold[j] in one operation + offset -= dim_range[j]; // now wraps around + } + break; // indx != 0 don't need to carry + } else if (threshold[j] != 0) { // if threshold is 0 shift is 0 + offset += dim_range[j]; // indx became 0 so reverse wrap around + } + } + } + }; + // Shard + auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); + // 15 - expiramentally determined with float and bool types + const int cost_per_element = 15 * sizeof(T); // rough esitmate + Shard(worker_threads->num_threads, worker_threads->workers, num_elements, + cost_per_element, std::move(work)); +} + +// dim_size - the size of each dimension +// dim_range - the number of indices over in the flattened tensor +// you need to skip in order to make it over from one side of a dimension +// to the other. Used to make the shifts wrap around after a threshold. +// threshold - the index for each dimension that the roll starts to wrap +// back to the front +// isd - inner shift dimension +template +// Use memcpy to copy memory in groups when the data type supports memcpy +void DoRollWithMemcpy(OpKernelContext* context, const int64 num_elements, + const int num_dims, const gtl::ArraySlice& dim_size, + const T* input, T* output, + const gtl::ArraySlice& threshold, + const gtl::ArraySlice& dim_range, + const int64 isd) { + auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd]( + int64 start, int64 end) { + // the number of indices over in the flattened tensor you need to skip in + // order to make it over from one side of the isd to the other + const int64 isd_range = std::max(dim_range[isd], 1); + // the distance along the flattend tensor to the next element in the isd + const int64 isd_stride = isd_range / std::max(dim_size[isd], 1); + + // start and end represent the i-th group currently so we will convert + // them into numbers representing the i-th elements. + // there are 2 groups per isd one for all elements before threshold[isd] + // and another for all elements after threshold[isd]. + const int64 start_remainder = (start % 2) * threshold[isd] * isd_stride; + const int64 end_remainder = (end % 2) * threshold[isd] * isd_stride; + start = (start / 2) * isd_range + start_remainder; + end = (end / 2) * isd_range + end_remainder; + + const T* in_ptr = &input[0]; + T* out_ptr = &output[0]; + in_ptr += start; + out_ptr += start; + + // array of indices for each dimension + // indicies = [i, j, k, l, m, n] + gtl::InlinedVector indicies(num_dims); + // the offset needed to make all inner non-shifting dimensions become 0 + int64 remainder_offset = 0; + // initialize indicies + for (int i = 0; i < num_dims; i++) { + // stride is the number of indices over in the flattened tensor + // you need to skip in order to make it over to an adjacent element + // along a dimension. 
dim_size[i] != 0 because we set it to max(dim, 1) + const int64 stride = dim_range[i] / dim_size[i]; + const int shift = dim_size[i] - threshold[i]; + const int indx = (start / stride) % dim_size[i]; + indicies[i] = indx; + // calculate dimension index after the shift + int out_indx = (indx + shift) % dim_size[i]; + if (i > isd) { + // trailing zeroes for indices after the inner shifted dimension + out_indx = 0; + remainder_offset += (out_indx - indx) * stride; + } + out_ptr += (out_indx - indx) * stride; + } + // set trailing zeroes for indices after the inner shifted dimension + for (int i = num_dims - 1; i > isd; i--) indicies[i] = 0; + + // the number of indices in the isd dimension the next group will skip + // to make it to the next threshold or end point + int isd_indx_skip = 0; + // the size of the next group + int64 group_size = 0; + // initialize isd_indx_skip and group_size + if (indicies[isd] < threshold[isd]) { + isd_indx_skip = threshold[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride + remainder_offset; + } else { + isd_indx_skip = dim_size[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride + remainder_offset; + } + + int64 i = start; + while (i < end) { + // copy group of elements + memcpy(out_ptr, in_ptr, group_size * sizeof(T)); + + // shift i and the pointers over to the next group position + i += group_size; + out_ptr += group_size; + in_ptr += group_size; + + // produce next combination of indices and adjust the out_ptr position + // to fix the offset if necessary + // the isd (inner shift dim) should skip to next threshold or endpoint + // all dimensions to the left increment by 1 when a digit is carried + // all dimensions to the right remain set to 0 + // +1 +1 +1 +isd_indx_skip + // indicies = [i, j, k, l, 0, 0] + // ^isd + for (int j = isd; j >= 0; j--) { + int inc = 1; + if (j == isd) inc = isd_indx_skip; + const int indx = (indicies[j] + inc) % dim_size[j]; + indicies[j] = indx; + if (indx != 0) { + if (indx == threshold[j]) { + out_ptr -= dim_range[j]; // now wraps around + } + break; // indx != 0 don't need to carry + } else if (threshold[j] != 0) { // if threshold is 0 shift is 0 + out_ptr += dim_range[j]; // indx became 0 so reverse wrap around + } + } + + // set isd_indx_skip and group_size for next iteration + if (indicies[isd] < threshold[isd]) { + isd_indx_skip = threshold[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride; + } else { + isd_indx_skip = dim_size[isd] - indicies[isd]; + group_size = isd_indx_skip * isd_stride; + } + } + }; + // Shard + auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); + const int64 ave_group_size = dim_range[isd] / 2; + const int total_work = 2 * num_elements / std::max(dim_range[isd], 1); + // 25000 - expiramentally determined with float and bool types + const int cost_per_group = 25000 * sizeof(T) * ave_group_size; + Shard(worker_threads->num_threads, worker_threads->workers, total_work, + cost_per_group, std::move(work)); +} + +template +class RollOp : public OpKernel { + public: + explicit RollOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& input = context->input(0); + const Tensor& shift = context->input(1); + const Tensor& axis = context->input(2); + + auto shift_flat = shift.flat(); + auto axis_flat = axis.flat(); + + OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input.shape()), + errors::InvalidArgument("input must be 1-D or higher")); 
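+    // Element i of axis names the dimension rotated by element i of shift.
+    // A positive shift moves entries toward higher indices with wrap-around,
+    // e.g. roll([0, 1, 2, 3, 4], shift=2, axis=0) -> [3, 4, 0, 1, 2];
+    // a negative shift rolls toward the front instead.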
+ OP_REQUIRES(context, shift.shape().dims() <= 1, + errors::InvalidArgument( + "shift must be a scalar or a 1-D vector. Found: ", + shift.shape().DebugString())); + OP_REQUIRES(context, axis.shape().dims() <= 1, + errors::InvalidArgument( + "axis must be a scalar or a 1-D vector. Found: ", + axis.shape().DebugString())); + OP_REQUIRES( + context, shift.shape() == axis.shape(), + errors::InvalidArgument("shift and axis must have the same size")); + const int64 num_elements = input.NumElements(); + const int num_shifts = static_cast(shift_flat.size()); + const int num_dims = input.dims(); + + // if there are any duplicate axes, shift_mod_sum will have the + // total modulo sum of shifts for each dimension + gtl::InlinedVector shift_mod_sum(num_dims, 0); + for (int i = 0; i < num_shifts; i++) { + const int axis = axis_flat(i); + OP_REQUIRES(context, axis < num_dims, + errors::InvalidArgument("axis ", axis, " is out of range")); + const int ds = std::max(static_cast(input.dim_size(axis)), 1); + const int sum = shift_mod_sum[axis] + static_cast(shift_flat(i)); + // modulo that works with negatives: ((x % y) + y) % y + shift_mod_sum[axis] = (sum % ds + ds) % ds; + } + // the size of each dimension + gtl::InlinedVector dim_size(num_dims); + // threshold[i] is the index that the roll starts to wrap back to the front + gtl::InlinedVector threshold(num_dims); + // dim_range is the number of indices over in the flattened tensor + // you need to skip in order to make it over from one side of a dimension + // to the other. Used to make the shifts wrap around after a threshold. + gtl::InlinedVector dim_range(num_dims); + int64 dim_size_prod = 1; // dimension size product + // inner shift dimension (inner most shifted dimension) + int64 isd = 0; + for (int i = num_dims - 1; i >= 0; i--) { + if (isd == 0 && shift_mod_sum[i] != 0) isd = i; + const int ds = std::max(static_cast(input.dim_size(i)), 1); + dim_size[i] = ds; + threshold[i] = (ds - shift_mod_sum[i]) % ds; + dim_size_prod *= static_cast(input.dim_size(i)); + dim_range[i] = dim_size_prod; + } + + Tensor* output = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + auto input_flat = input.flat().data(); + auto output_flat = output->flat().data(); + + if (std::is_same::value) { + if (DataTypeCanUseMemcpy(DataTypeToEnum::v())) { + // V2 copies memory in groups instead of element by element + DoRollWithMemcpy(context, num_elements, num_dims, dim_size, + input_flat, output_flat, threshold, dim_range, isd); + } else { + // incase memcpy does not work for current data type + DoRoll(context, num_elements, num_dims, dim_size, input_flat, + output_flat, threshold, dim_range); + } + } + } +}; + +// Register the CPU kernels. 
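+// Each REGISTER_KERNEL_BUILDER below pins one combination of the Tshift and
+// Taxis index types, and TF_CALL_ALL_TYPES then stamps out the registration
+// for every supported element type T.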
+#define REGISTER_CPU(type) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tshift") \ + .TypeConstraint("Taxis"), \ + RollOp) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tshift") \ + .TypeConstraint("Taxis"), \ + RollOp) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tshift") \ + .TypeConstraint("Taxis"), \ + RollOp) \ + REGISTER_KERNEL_BUILDER(Name("Roll") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tshift") \ + .TypeConstraint("Taxis"), \ + RollOp) + +TF_CALL_ALL_TYPES(REGISTER_CPU); +#undef REGISTER_CPU +} // namespace tensorflow diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc new file mode 100644 index 00000000000..90b6f8d0f30 --- /dev/null +++ b/tensorflow/core/kernels/roll_op_test.cc @@ -0,0 +1,484 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +namespace { + +class RollOpTest : public OpsTestBase { + protected: + void MakeOp(DataType data_type, DataType index_type) { + TF_ASSERT_OK(NodeDefBuilder("myop", "Roll") + .Input(FakeInput(data_type)) + .Input(FakeInput(index_type)) + .Input(FakeInput(index_type)) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + } +}; + +TEST_F(RollOpTest, ScalarIndices) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({5}), {0, 1, 2, 3, 4}); + AddInputFromArray(TensorShape({}), {3}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
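+  // Rolling [0, 1, 2, 3, 4] by 3 along axis 0 moves every element three
+  // places to the right with wrap-around, giving [2, 3, 4, 0, 1].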
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({5})); + test::FillValues(&expected, {2, 3, 4, 0, 1}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ScalarIndices_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({5}), {"a", "b", "c", "d", "e"}); + AddInputFromArray(TensorShape({}), {3}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5})); + test::FillValues(&expected, {"c", "d", "e", "a", "b"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ScalarIndices_Complex) { + MakeOp(DT_COMPLEX64, DT_INT32); + + // Feed and run + AddInputFromArray>( + TensorShape({5}), {std::complex(0, 10), std::complex(1, 11), + std::complex(2, 12), std::complex(3, 13), + std::complex(4, 14)}); + AddInputFromArray(TensorShape({}), {3}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_COMPLEX64, TensorShape({5})); + test::FillValues>( + &expected, {std::complex(2, 12), std::complex(3, 13), + std::complex(4, 14), std::complex(0, 10), + std::complex(1, 11)}); + test::ExpectTensorEqual>(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({3, 5}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray(TensorShape({2}), {2, -1}); + AddInputFromArray(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5})); + test::FillValues(&expected, + {6, 7, 8, 9, 5, 11, 12, 13, 14, 10, 1, 2, 3, 4, 0}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({3, 5}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray(TensorShape({2}), {2, -1}); + AddInputFromArray(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({3, 5})); + test::FillValues(&expected, {"g", "h", "i", "j", "f", "l", "m", "n", + "o", "k", "b", "c", "d", "e", "a"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray(TensorShape({3}), {1, -1, -1}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3})); + test::FillValues(&expected, {10, 11, 9, 7, 8, 6, 4, 5, 3, 1, 2, 0}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray( + TensorShape({2, 2, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray(TensorShape({3}), {1, -1, -1}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
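+  // Each axis is rolled independently: the shift of 1 swaps the two outer
+  // blocks, and the -1 shifts rotate the inner axes toward the front.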
+ Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3})); + test::FillValues( + &expected, {"k", "l", "j", "h", "i", "g", "e", "f", "d", "b", "c", "a"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD64) { + MakeOp(DT_FLOAT, DT_INT64); + + // Feed and run + AddInputFromArray(TensorShape({5, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray(TensorShape({2}), {-1, 4}); + AddInputFromArray(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3})); + test::FillValues(&expected, + {5, 3, 4, 8, 6, 7, 11, 9, 10, 14, 12, 13, 2, 0, 1}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_TwoD64_NoMemcpy) { + MakeOp(DT_STRING, DT_INT64); + + // Feed and run + AddInputFromArray(TensorShape({5, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray(TensorShape({2}), {-1, 4}); + AddInputFromArray(TensorShape({2}), {0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5, 3})); + test::FillValues(&expected, {"f", "d", "e", "i", "g", "h", "l", "j", + "k", "o", "m", "n", "c", "a", "b"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD64) { + MakeOp(DT_FLOAT, DT_INT64); + + // Feed and run + AddInputFromArray(TensorShape({4, 1, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray(TensorShape({3}), {4, 3, 2}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 1, 3})); + test::FillValues(&expected, {1, 2, 0, 4, 5, 3, 7, 8, 6, 10, 11, 9}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Simple_ThreeD64_NoMemcpy) { + MakeOp(DT_STRING, DT_INT64); + + // Feed and run + AddInputFromArray( + TensorShape({4, 1, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray(TensorShape({3}), {4, 3, 2}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({4, 1, 3})); + test::FillValues( + &expected, {"b", "c", "a", "e", "f", "d", "h", "i", "g", "k", "l", "j"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroShift_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + AddInputFromArray(TensorShape({3}), {0, 0, 0}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3})); + test::FillValues(&expected, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroShift_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray( + TensorShape({2, 2, 3}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + AddInputFromArray(TensorShape({3}), {0, 0, 0}); + AddInputFromArray(TensorShape({3}), {0, 1, 2}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
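+  // A shift of zero along every axis must leave the tensor unchanged.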
+ Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3})); + test::FillValues( + &expected, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroSize_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({5, 0, 0}), {}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 0, 0})); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, ZeroSize_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({5, 0, 0}), {}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({5, 0, 0})); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, OneSize_ThreeD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({1, 1, 1}), {5}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1})); + test::FillValues(&expected, {5}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, OneSize_ThreeD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({1, 1, 1}), {"a"}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_STRING, TensorShape({1, 1, 1})); + test::FillValues(&expected, {"a"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, MultiShifts_TwoD32) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({3, 5}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray(TensorShape({4}), {-2, 2, -1, 1}); + AddInputFromArray(TensorShape({4}), {1, 0, 0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5})); + test::FillValues(&expected, + {11, 12, 13, 14, 10, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, MultiShifts_TwoD32_NoMemcpy) { + MakeOp(DT_STRING, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({3, 5}), + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", + "k", "l", "m", "n", "o"}); + AddInputFromArray(TensorShape({4}), {-2, 2, -1, 1}); + AddInputFromArray(TensorShape({4}), {1, 0, 0, 1}); + TF_ASSERT_OK(RunOpKernel()); + + // Check the output. 
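+  // Shifts for a repeated axis accumulate: axis 0 receives 2 + (-1) = 1 and
+  // axis 1 receives -2 + 1 = -1 before the roll is applied.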
+ Tensor expected(allocator(), DT_STRING, TensorShape({3, 5})); + test::FillValues(&expected, {"l", "m", "n", "o", "k", "b", "c", "d", + "e", "a", "g", "h", "i", "j", "f"}); + test::ExpectTensorEqual(expected, *GetOutput(0)); +} + +TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({}), {7}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {0}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher")) + << s; +} + +TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({1, 2}), {0, 1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("axis must be a scalar or a 1-D vector")) + << s; +} + +TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray(TensorShape({1, 2}), {0, 1}); + AddInputFromArray(TensorShape({}), {1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("shift must be a scalar or a 1-D vector")) + << s; +} + +TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({2, 2}), {1, 2, 3, 4}); + AddInputFromArray(TensorShape({1}), {1}); + AddInputFromArray(TensorShape({2}), {0, 1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("shift and axis must have the same size")) + << s; +} + +TEST_F(RollOpTest, Error_AxisOutOfRange) { + MakeOp(DT_FLOAT, DT_INT32); + + // Feed and run + AddInputFromArray(TensorShape({4}), {1, 2, 3, 4}); + AddInputFromArray(TensorShape({}), {1}); + AddInputFromArray(TensorShape({}), {1}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s; +} + +// isd - (inner shift dimension) The inner most dimension to be shifted. +// All outer dimensions will also be shifted for testing. +static Graph* RollGraph(const TensorShape& shape, int isd) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor input(DT_FLOAT, shape); + input.flat().setRandom(); + const int dims = static_cast(input.dims()); + Tensor shift(DT_INT32, TensorShape({dims})); + for (int i = 0; i < dims; i++) { + // shift the inner shift dimension and all outer dimensions + shift.flat()(i) = (i <= isd) ? 
2 : 0; + } + Tensor axis(DT_INT32, TensorShape({dims})); + for (int i = 0; i < dims; i++) { + axis.flat()(i) = i; + } + test::graph::Roll(g, test::graph::Constant(g, input), + test::graph::Constant(g, shift), + test::graph::Constant(g, axis)); + return g; +} + +#define BM_ROLL_OUTER(DEVICE) \ + static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) { \ + TensorShape shape{rows, columns}; \ + const int64 num_items = static_cast(iters) * shape.num_elements(); \ + testing::ItemsProcessed(num_items); \ + testing::BytesProcessed(num_items * sizeof(float)); \ + testing::UseRealTime(); \ + test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_roll_outer) \ + ->ArgPair(256, 256) \ + ->ArgPair(512, 512) \ + ->ArgPair(1024, 1024) \ + ->ArgPair(2048, 2048) + +#define BM_ROLL_ALL(DEVICE) \ + static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) { \ + TensorShape shape{rows, columns}; \ + const int64 num_items = static_cast(iters) * shape.num_elements(); \ + testing::ItemsProcessed(num_items); \ + testing::BytesProcessed(num_items * sizeof(float)); \ + testing::UseRealTime(); \ + test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_roll_all) \ + ->ArgPair(256, 256) \ + ->ArgPair(512, 512) \ + ->ArgPair(1024, 1024) \ + ->ArgPair(2048, 2048) + +BM_ROLL_OUTER(cpu); +BM_ROLL_ALL(cpu); +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc index df60eda7597..990bd2bff94 100644 --- a/tensorflow/core/kernels/save_restore_tensor.cc +++ b/tensorflow/core/kernels/save_restore_tensor.cc @@ -106,11 +106,11 @@ void SaveTensors( OP_REQUIRES_OK(context, checkpoint::ParseShapeAndSlice( shape_spec, &shape, &slice, &slice_shape)); OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()), - errors::InvalidArgument("Slice in shape_and_slice " - "specification does not match the " - "shape of the tensor to save: ", - shape_spec, ", tensor: ", - input.shape().DebugString())); + errors::InvalidArgument( + "Slice in shape_and_slice " + "specification does not match the " + "shape of the tensor to save: ", + shape_spec, ", tensor: ", input.shape().DebugString())); } #define WRITER_ADD(T) \ diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h index c6e35fe329e..079f15e1013 100644 --- a/tensorflow/core/kernels/scatter_functor.h +++ b/tensorflow/core/kernels/scatter_functor.h @@ -29,7 +29,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace scatter_op { @@ -117,7 +117,7 @@ struct AssignSYCL { p.device(d) = p / u; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace internal } // namespace scatter_op @@ -156,7 +156,7 @@ struct ScatterFunctorBase { #ifdef TENSORFLOW_USE_SYCL template -struct ScatterFunctorBase { +struct ScatterFunctorBase { Index operator()(OpKernelContext* c, const SYCLDevice& d, typename TTypes::Matrix params, typename TTypes::ConstMatrix updates, @@ -171,13 +171,13 @@ struct ScatterFunctorBase { const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); if (!FastBoundsCheck(index, limit)) return i; // Copy last Ndim-1 dimensions of updates[i] to params[index] - scatter_op::internal::AssignSYCL::Run(d, params.template chip<0>(index), - updates.template 
chip<0>(i)); + scatter_op::internal::AssignSYCL::Run( + d, params.template chip<0>(index), updates.template chip<0>(i)); } return -1; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template struct ScatterFunctorBase { @@ -217,7 +217,7 @@ struct ScatterFunctorBase { template struct ScatterFunctor - : ScatterFunctorBase{}; + : ScatterFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template @@ -239,7 +239,7 @@ struct ScatterFunctorSYCL { return -1; } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h index e116077d3cf..be18658543e 100644 --- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h +++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h @@ -30,9 +30,10 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; template -__global__ void ScatterOpCustomKernel( - T* params, const T* updates, const Index* indices, - Index first_dim_size, Index updates_size, Index indices_size) { +__global__ void ScatterOpCustomKernel(T* params, const T* updates, + const Index* indices, + Index first_dim_size, Index updates_size, + Index indices_size) { Index update_block = updates_size / indices_size; CUDA_1D_KERNEL_LOOP(i, updates_size) { int indices_i = i / update_block; @@ -85,8 +86,8 @@ struct ScatterFunctor { CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d); ScatterOpCustomKernel <<>>( - params.data(), updates.data(), indices.data(), - first_dim_size, updates_size, indices_size); + params.data(), updates.data(), indices.data(), first_dim_size, + updates_size, indices_size); return -1; } }; diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h index c6c9d4e6588..e82660dcc1d 100644 --- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h +++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h @@ -40,7 +40,7 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class OpKernelContext; @@ -251,7 +251,7 @@ REGISTER_SCATTER_ND_MATH_SYCL(int32); #undef REGISTER_SCATTER_ND_INDEX_SYCL #undef REGISTER_SCATTER_ND_FULL_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc index 8607c7f95af..282165349f3 100644 --- a/tensorflow/core/kernels/scatter_op.cc +++ b/tensorflow/core/kernels/scatter_op.cc @@ -25,7 +25,7 @@ limitations under the License. 
#ifdef TENSORFLOW_USE_SYCL #include "tensorflow/core/common_runtime/sycl/sycl_util.h" -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace tensorflow { @@ -33,7 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Check whether updates.shape = indices.shape + params.shape[1:] static bool ValidShapes(const Tensor& params, const Tensor& updates, @@ -102,11 +102,12 @@ class ScatterUpdateOp : public OpKernel { // Check that we have enough index space const int64 N_big = indices.NumElements(); - OP_REQUIRES(c, N_big <= std::numeric_limits::max(), - errors::InvalidArgument( - "indices has too many elements for ", - DataTypeString(DataTypeToEnum::v()), " indexing: ", - N_big, " > ", std::numeric_limits::max())); + OP_REQUIRES( + c, N_big <= std::numeric_limits::max(), + errors::InvalidArgument("indices has too many elements for ", + DataTypeString(DataTypeToEnum::v()), + " indexing: ", N_big, " > ", + std::numeric_limits::max())); const Index N = static_cast(indices.NumElements()); OP_REQUIRES( c, params.dim_size(0) <= std::numeric_limits::max(), @@ -137,7 +138,7 @@ class ScatterUpdateOp : public OpKernel { #ifdef TENSORFLOW_USE_SYCL template -class ScatterUpdateOp : public OpKernel { +class ScatterUpdateOp : public OpKernel { public: explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) { OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_)); @@ -165,11 +166,12 @@ class ScatterUpdateOp : public OpKernel { // Check that we have enough index space const int64 N_big = indices.NumElements(); - OP_REQUIRES(c, N_big <= std::numeric_limits::max(), - errors::InvalidArgument( - "indices has too many elements for ", - DataTypeString(DataTypeToEnum::v()), " indexing: ", - N_big, " > ", std::numeric_limits::max())); + OP_REQUIRES( + c, N_big <= std::numeric_limits::max(), + errors::InvalidArgument("indices has too many elements for ", + DataTypeString(DataTypeToEnum::v()), + " indexing: ", N_big, " > ", + std::numeric_limits::max())); const Index N = static_cast(indices.NumElements()); OP_REQUIRES( c, params.dim_size(0) <= std::numeric_limits::max(), @@ -206,7 +208,7 @@ class ScatterUpdateOp : public OpKernel { } } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_SCATTER_KERNEL_INDEX(type, index_type, dev, name, op) \ REGISTER_KERNEL_BUILDER(Name(name) \ diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc index 863c123b43f..066a4b80a2b 100644 --- a/tensorflow/core/kernels/sdca_internal.cc +++ b/tensorflow/core/kernels/sdca_internal.cc @@ -37,9 +37,8 @@ void FeatureWeightsDenseStorage::UpdateDenseDeltaWeights( const size_t num_weight_vectors = normalized_bounded_dual_delta.size(); if (num_weight_vectors == 1) { deltas_.device(device) = - deltas_ + - dense_vector.RowAsMatrix() * - deltas_.constant(normalized_bounded_dual_delta[0]); + deltas_ + dense_vector.RowAsMatrix() * + deltas_.constant(normalized_bounded_dual_delta[0]); } else { // Transform the dual vector into a column matrix. const Eigen::TensorMap> @@ -61,9 +60,8 @@ void FeatureWeightsSparseStorage::UpdateSparseDeltaWeights( const Example::SparseFeatures& sparse_features, const std::vector& normalized_bounded_dual_delta) { for (int64 k = 0; k < sparse_features.indices->size(); ++k) { - const double feature_value = sparse_features.values == nullptr - ? 
1.0 - : (*sparse_features.values)(k); + const double feature_value = + sparse_features.values == nullptr ? 1.0 : (*sparse_features.values)(k); auto it = indices_to_id_.find((*sparse_features.indices)(k)); for (size_t l = 0; l < normalized_bounded_dual_delta.size(); ++l) { deltas_(l, it->second) += @@ -122,23 +120,24 @@ Status ModelWeights::Initialize(OpKernelContext* const context) { } // Reads in the weights, and allocates and initializes the delta weights. - const auto initialize_weights = [&]( - const OpInputList& weight_inputs, OpOutputList* const weight_outputs, - std::vector* const feature_weights) { - for (int i = 0; i < weight_inputs.size(); ++i) { - Tensor* delta_t; - TF_RETURN_IF_ERROR( - weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t)); - // Convert the input vector to a row matrix in internal representation. - auto deltas = delta_t->shaped({1, delta_t->NumElements()}); - deltas.setZero(); - feature_weights->emplace_back( - FeatureWeightsDenseStorage{weight_inputs[i].shaped( - {1, weight_inputs[i].NumElements()}), - deltas}); - } - return Status::OK(); - }; + const auto initialize_weights = + [&](const OpInputList& weight_inputs, OpOutputList* const weight_outputs, + std::vector* const feature_weights) { + for (int i = 0; i < weight_inputs.size(); ++i) { + Tensor* delta_t; + TF_RETURN_IF_ERROR( + weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t)); + // Convert the input vector to a row matrix in internal + // representation. + auto deltas = delta_t->shaped({1, delta_t->NumElements()}); + deltas.setZero(); + feature_weights->emplace_back(FeatureWeightsDenseStorage{ + weight_inputs[i].shaped( + {1, weight_inputs[i].NumElements()}), + deltas}); + } + return Status::OK(); + }; return initialize_weights(dense_weights_inputs, &dense_weights_outputs, &dense_weights_); diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h index 9f072700754..45915693ac6 100644 --- a/tensorflow/core/kernels/sdca_internal.h +++ b/tensorflow/core/kernels/sdca_internal.h @@ -149,7 +149,8 @@ class Example { // 1.0f. struct SparseFeatures { std::unique_ptr::UnalignedConstVec> indices; - std::unique_ptr::UnalignedConstVec> values; // nullptr encodes optional. + std::unique_ptr::UnalignedConstVec> + values; // nullptr encodes optional. }; // A dense vector which is a row-slice of the underlying matrix. 
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc index 0f5c2424b38..dbe0177dda3 100644 --- a/tensorflow/core/kernels/sdca_ops.cc +++ b/tensorflow/core/kernels/sdca_ops.cc @@ -57,11 +57,11 @@ namespace tensorflow { namespace { -using sdca::Regularizations; using sdca::Example; using sdca::Examples; using sdca::ExampleStatistics; using sdca::ModelWeights; +using sdca::Regularizations; struct ComputeOptions { explicit ComputeOptions(OpKernelConstruction* const context) { @@ -76,8 +76,9 @@ struct ComputeOptions { } else if (loss_type == "smooth_hinge_loss") { loss_updater.reset(new SmoothHingeLossUpdater); } else { - OP_REQUIRES(context, false, errors::InvalidArgument( - "Unsupported loss type: ", loss_type)); + OP_REQUIRES( + context, false, + errors::InvalidArgument("Unsupported loss type: ", loss_type)); } OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptative)); OP_REQUIRES_OK( @@ -90,9 +91,10 @@ struct ComputeOptions { context, num_sparse_features + num_dense_features > 0, errors::InvalidArgument("Requires at least one feature to train.")); - OP_REQUIRES(context, static_cast(num_sparse_features) + - static_cast(num_dense_features) <= - std::numeric_limits::max(), + OP_REQUIRES(context, + static_cast(num_sparse_features) + + static_cast(num_dense_features) <= + std::numeric_limits::max(), errors::InvalidArgument( strings::Printf("Too many feature groups: %lld > %d", static_cast(num_sparse_features) + diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc index 3ef1cd1e062..27b8081eb88 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.cc +++ b/tensorflow/core/kernels/segment_reduction_ops.cc @@ -115,7 +115,7 @@ class SegmentReductionOp : public OpKernel { Eigen::DSizes dims_to_reduce; dims_to_reduce[0] = 0; #else - Eigen::IndexList> dims_to_reduce; + Eigen::IndexList > dims_to_reduce; #endif Index start = 0, end = 1; @@ -359,7 +359,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL); namespace functor { // UnsortedSegmentSumFunctor implementation for CPUDevice. -// todo: Remove duplicate code in UnsortedSegmentSumFunctor and UnsortedSegmentMaxFunctor. +// todo: Remove duplicate code in UnsortedSegmentSumFunctor and +// UnsortedSegmentMaxFunctor. 
template struct UnsortedSegmentSumFunctor : UnsortedSegmentBaseFunctor { @@ -461,9 +462,10 @@ class UnsortedSegmentBaseOp : public OpKernel { auto data_ptr = data.template flat().data(); reduction_functor_(context, context->template eigen_device(), - output_rows, segment_ids.shape(), segment_flat, - data.NumElements(), data_ptr, output_flat); + output_rows, segment_ids.shape(), segment_flat, + data.NumElements(), data_ptr, output_flat); } + private: functor::UnsortedSegmentBaseFunctor& reduction_functor_; }; @@ -472,22 +474,20 @@ template class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp { public: explicit UnsortedSegmentSumOp(OpKernelConstruction* context) - : UnsortedSegmentBaseOp( - context, - sum_functor_) {} + : UnsortedSegmentBaseOp(context, sum_functor_) {} + private: - functor::UnsortedSegmentSumFunctor sum_functor_; + functor::UnsortedSegmentSumFunctor sum_functor_; }; template class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp { public: explicit UnsortedSegmentMaxOp(OpKernelConstruction* context) - : UnsortedSegmentBaseOp( - context, - max_functor_) {} + : UnsortedSegmentBaseOp(context, max_functor_) {} + private: - functor::UnsortedSegmentMaxFunctor max_functor_; + functor::UnsortedSegmentMaxFunctor max_functor_; }; #define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type) \ @@ -663,9 +663,9 @@ class SparseSegmentReductionOpBase : public OpKernel { Reduce(input_flat, indices_vec, start, end - start, out); OP_REQUIRES(context, bad_offset < 0, errors::InvalidArgument( - "Bad: indices[", start + bad_offset, "] == ", - indices_vec(start + bad_offset), " out of range [0, ", - input_flat.dimension(0), ")")); + "Bad: indices[", start + bad_offset, + "] == ", indices_vec(start + bad_offset), + " out of range [0, ", input_flat.dimension(0), ")")); start = end; ++end; diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index bcdd42c80c1..5c9cfe09065 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -51,13 +51,14 @@ struct SegmentSumFunctor { // BaseFunctor for definition of UnsorteSegmentReductionOp // for usage without templates. template -struct UnsortedSegmentBaseFunctor{ - virtual ~UnsortedSegmentBaseFunctor(){} +struct UnsortedSegmentBaseFunctor { + virtual ~UnsortedSegmentBaseFunctor() {} virtual void operator()(OpKernelContext* ctx, const Device& d, - const Index output_rows, const TensorShape& segment_ids_shape, - typename TTypes::ConstFlat segment_ids, - const Index data_size, const T* data, - typename TTypes::Tensor output){}; + const Index output_rows, + const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, + const Index data_size, const T* data, + typename TTypes::Tensor output){}; }; // Functor for UnsortedSegmentSumOp. @@ -70,7 +71,8 @@ struct UnsortedSegmentBaseFunctor{ // data: input data tensor. 
// output: output reshaped to {output_rows, output.size/output_rows} template -struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor { +struct UnsortedSegmentSumFunctor + : public UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const Device& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, @@ -88,7 +90,8 @@ struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor -struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor { +struct UnsortedSegmentMaxFunctor + : public UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const Device& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc index 159fada621b..39d520698e1 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc @@ -194,7 +194,8 @@ void SegmentSumFunctor::operator()( // UnsortedSegmentSumFunctor implementation for GPUDevice. template -struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFunctor { +struct UnsortedSegmentSumFunctor + : UnsortedSegmentBaseFunctor { void operator()(OpKernelContext* ctx, const GPUDevice& d, const Index output_rows, const TensorShape& segment_ids_shape, typename TTypes::ConstFlat segment_ids, @@ -221,11 +222,10 @@ struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFuncto const Index input_inner_dim_size = input_total_size / input_outer_dim_size; config = GetCudaLaunchConfig(input_total_size, d); - UnsortedSegmentSumCustomKernel< - T, - Index><<>>( - input_outer_dim_size, input_inner_dim_size, output_rows, - segment_ids.data(), data, output.data()); + UnsortedSegmentSumCustomKernel + <<>>( + input_outer_dim_size, input_inner_dim_size, output_rows, + segment_ids.data(), data, output.data()); } }; diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc index 97657807268..bcd88773902 100644 --- a/tensorflow/core/kernels/self_adjoint_eig_op.cc +++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc @@ -25,7 +25,6 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" - namespace tensorflow { template diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 206fd40fa68..688e61fcadc 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp); REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp); REGISTER_KERNEL_BUILDER( Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER( @@ -198,7 +198,7 @@ REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp); #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp); REGISTER_KERNEL_BUILDER( @@ -207,6 +207,6 @@ REGISTER_KERNEL_BUILDER( #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER( Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc index e2e3758d87e..9db0bd4d98b 100644 --- a/tensorflow/core/kernels/sequence_ops.cc +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -53,13 +53,13 @@ class RangeOp : public OpKernel { if (delta > 0) { OP_REQUIRES( context, start <= limit, - errors::InvalidArgument("Requires start <= limit when delta > 0: ", - start, "/", limit)); + errors::InvalidArgument( + "Requires start <= limit when delta > 0: ", start, "/", limit)); } else { OP_REQUIRES( context, start >= limit, - errors::InvalidArgument("Requires start >= limit when delta < 0: ", - start, "/", limit)); + errors::InvalidArgument( + "Requires start >= limit when delta < 0: ", start, "/", limit)); } int64 size = (std::is_integral::value ? 
((std::abs(limit - start) + std::abs(delta) - 1) / diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc index 61e40caef99..799c574d154 100644 --- a/tensorflow/core/kernels/serialize_sparse_op.cc +++ b/tensorflow/core/kernels/serialize_sparse_op.cc @@ -426,7 +426,6 @@ class DeserializeSparseOp : public OpKernel { switch (dtype_) { TF_CALL_ALL_TYPES(HANDLE_TYPE); TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); - TF_CALL_variant(HANDLE_TYPE); #undef HANDLE_TYPE default: OP_REQUIRES(context, false, diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc index 185c5b248fc..f2dd2812b53 100644 --- a/tensorflow/core/kernels/session_ops.cc +++ b/tensorflow/core/kernels/session_ops.cc @@ -144,7 +144,7 @@ REGISTER_GPU_KERNEL(bool); TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL); REGISTER_SYCL_KERNEL(bool); #undef REGISTER_SYCL_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class DeleteSessionTensorOp : public OpKernel { public: diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h index 8d9d0ea8461..55be308901b 100644 --- a/tensorflow/core/kernels/shape_ops.h +++ b/tensorflow/core/kernels/shape_ops.h @@ -235,10 +235,10 @@ class SqueezeOp : public OpKernel { if (!wrapped_squeeze_dims.empty()) { if (wrapped_squeeze_dims.count(i) > 0) { OP_REQUIRES(ctx, existing_dim == 1, - errors::InvalidArgument("Tried to explicitly squeeze " - "dimension ", - i, " but dimension was not 1: ", - existing_dim)); + errors::InvalidArgument( + "Tried to explicitly squeeze " + "dimension ", + i, " but dimension was not 1: ", existing_dim)); } else { // This dimension is not being squeezed. new_shape.push_back(existing_dim); diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc index 82595de7794..79369fd4a9c 100644 --- a/tensorflow/core/kernels/slice_op.cc +++ b/tensorflow/core/kernels/slice_op.cc @@ -58,7 +58,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Shared code that is not dependent on the type of T. We do this to reduce // code size by not duplicating all this for all T (float, double, int32, etc.) 
@@ -72,10 +72,11 @@ static void SharedValidation(OpKernelContext* context, const Tensor& size_tensor = context->input(2); OP_REQUIRES( - context, context->op_kernel().IsLegacyVector(begin_tensor.shape()) && - context->op_kernel().IsLegacyVector(size_tensor.shape()) && - begin_tensor.NumElements() == input.dims() && - size_tensor.NumElements() == input.dims(), + context, + context->op_kernel().IsLegacyVector(begin_tensor.shape()) && + context->op_kernel().IsLegacyVector(size_tensor.shape()) && + begin_tensor.NumElements() == input.dims() && + size_tensor.NumElements() == input.dims(), errors::InvalidArgument( "Expected begin and size arguments to be 1-D tensors of size ", input.dims(), ", but got shapes ", begin_tensor.shape().DebugString(), @@ -125,8 +126,7 @@ static void SharedSliceCommonCases(OpKernelContext* context, TensorShape* output_shape, gtl::InlinedVector* begin, gtl::InlinedVector* size, - Tensor** result, - bool* done) { + Tensor** result, bool* done) { bool is_identity = true; bool slice_dim0 = true; *done = false; @@ -142,8 +142,8 @@ static void SharedSliceCommonCases(OpKernelContext* context, return; } - if (slice_dim0 && IsDim0SliceAligned(input.shape(), (*begin)[0], - (*size)[0])) { + if (slice_dim0 && + IsDim0SliceAligned(input.shape(), (*begin)[0], (*size)[0])) { VLOG(1) << "Slice dim 0: " << input.shape().DebugString(); CHECK_GE(input.dims(), 1); // Otherwise, is_identity should be true. context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0])); @@ -154,7 +154,6 @@ static void SharedSliceCommonCases(OpKernelContext* context, OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result)); } - template class SliceOp : public OpKernel { public: @@ -206,8 +205,9 @@ class SliceOp : public OpKernel { #undef HANDLE_DIM - OP_REQUIRES(context, false, errors::Unimplemented( - "SliceOp : Unhandled input dimensions")); + OP_REQUIRES( + context, false, + errors::Unimplemented("SliceOp : Unhandled input dimensions")); } } @@ -280,8 +280,9 @@ class MklSliceOp : public OpKernel { #undef HANDLE_DIM - OP_REQUIRES(context, false, errors::Unimplemented( - "SliceOp : Unhandled input dimensions")); + OP_REQUIRES( + context, false, + errors::Unimplemented("SliceOp : Unhandled input dimensions")); } } @@ -292,9 +293,9 @@ class MklSliceOp : public OpKernel { // as the sizes of all the dimensions of the input except slice_dim, then // returns True. Otherwise, returns False. bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape, - const gtl::ArraySlice& begin, - const gtl::ArraySlice& size, - int slice_dim) { + const gtl::ArraySlice& begin, + const gtl::ArraySlice& size, + int slice_dim) { for (int dim = 0; dim < 4; dim++) { if (dim != slice_dim && (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) { @@ -316,9 +317,9 @@ class MklSliceOp : public OpKernel { // Returns True if Slicing over a single dimension, and sets slice_dim // to the number of the dimension that satisfies criteria. 
bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape, - const gtl::ArraySlice& begin, - const gtl::ArraySlice& size, - int* slice_dim) { + const gtl::ArraySlice& begin, + const gtl::ArraySlice& size, + int* slice_dim) { for (int dim = 0; dim < 4; dim++) { if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) { *slice_dim = dim; @@ -329,8 +330,7 @@ class MklSliceOp : public OpKernel { } template - void HandleCase(OpKernelContext* context, - const gtl::ArraySlice& begin, + void HandleCase(OpKernelContext* context, const gtl::ArraySlice& begin, const gtl::ArraySlice& size, Tensor* result) { int slice_dim = -1; TensorShape in_shape = context->input(0).shape(); @@ -340,67 +340,63 @@ class MklSliceOp : public OpKernel { // format over channel dimension. if (NDIM == 4 && DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) { - size_t in_strides[4] = { (size_t) in_shape.dim_size(1) * - in_shape.dim_size(2) * - in_shape.dim_size(3), - (size_t) in_shape.dim_size(2) * - in_shape.dim_size(3), - (size_t) in_shape.dim_size(3), - (size_t) 1 - }; + size_t in_strides[4] = { + (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) * + in_shape.dim_size(3), + (size_t)in_shape.dim_size(2) * in_shape.dim_size(3), + (size_t)in_shape.dim_size(3), (size_t)1}; - size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3], - (size_t) size[2] * size[3], - (size_t) size[3], - (size_t) 1 }; + size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3], + (size_t)size[2] * size[3], (size_t)size[3], + (size_t)1}; - T *in_buf = const_cast(const_cast( - context->input(0).flat().data())); - T *op_buf = result->flat().data(); + T* in_buf = const_cast( + const_cast(context->input(0).flat().data())); + T* op_buf = result->flat().data(); - if (slice_dim == 1) { - /* data format = NCHW */ + if (slice_dim == 1) { + /* data format = NCHW */ - #pragma omp parallel for - for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { - T *ip = in_buf + (d0 * in_strides[0]); - T *op = op_buf + ((d0 - begin[0]) * out_strides[0]); - #pragma omp parallel for - for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { - T *ip1 = ip + (d1 * in_strides[1]); - T *op1 = op + ((d1 - begin[1]) * out_strides[1]); - // For NCHW, H and W will be contiguous. So we can copy - // both with one memcpy. - memcpy(static_cast(op1), static_cast(ip1), - sizeof(T) * in_strides[1]); - } +#pragma omp parallel for + for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { + T* ip = in_buf + (d0 * in_strides[0]); + T* op = op_buf + ((d0 - begin[0]) * out_strides[0]); +#pragma omp parallel for + for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { + T* ip1 = ip + (d1 * in_strides[1]); + T* op1 = op + ((d1 - begin[1]) * out_strides[1]); + // For NCHW, H and W will be contiguous. So we can copy + // both with one memcpy. 
+ memcpy(static_cast(op1), static_cast(ip1), + sizeof(T) * in_strides[1]); } - return; - } else if (slice_dim == 3) { - /* data_format = NHWC */ - - #pragma omp parallel for - for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { - T *ip = in_buf + (d0 * in_strides[0]); - T *op = op_buf + ((d0 - begin[0]) * out_strides[0]); - #pragma omp parallel for - for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { - T *ip1 = ip + (d1 * in_strides[1]); - T *op1 = op + ((d1 - begin[1]) * out_strides[1]); - #pragma omp parallel for - for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) { - T *ip2 = ip1 + (d2 * in_strides[2]); - T *ip3 = ip2 + begin[3]; - T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]); - T *op3 = op2; - memcpy(static_cast(op3), static_cast(ip3), - sizeof(T) * size[3]); - } - } - } - return; } - // slice_dim is not 1 or 3, then we fallback to Eigen implementation. + return; + } else if (slice_dim == 3) { + /* data_format = NHWC */ + +#pragma omp parallel for + for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) { + T* ip = in_buf + (d0 * in_strides[0]); + T* op = op_buf + ((d0 - begin[0]) * out_strides[0]); +#pragma omp parallel for + for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) { + T* ip1 = ip + (d1 * in_strides[1]); + T* op1 = op + ((d1 - begin[1]) * out_strides[1]); +#pragma omp parallel for + for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) { + T* ip2 = ip1 + (d2 * in_strides[2]); + T* ip3 = ip2 + begin[3]; + T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]); + T* op3 = op2; + memcpy(static_cast(op3), static_cast(ip3), + sizeof(T) * size[3]); + } + } + } + return; + } + // slice_dim is not 1 or 3, then we fallback to Eigen implementation. } Eigen::DSizes indices; @@ -535,13 +531,13 @@ REGISTER_KERNEL_BUILDER(Name("Slice") #ifdef TENSORFLOW_USE_SYCL // Forward declarations of the functor specializations for SYCL. namespace functor { -#define DECLARE_SYCL_SPEC(T, NDIM) \ - template <> \ - void Slice::operator()( \ - const SYCLDevice& d, typename TTypes::Tensor output,\ - typename TTypes::ConstTensor input, \ - const Eigen::DSizes& indices, \ - const Eigen::DSizes& sizes); \ +#define DECLARE_SYCL_SPEC(T, NDIM) \ + template <> \ + void Slice::operator()( \ + const SYCLDevice& d, typename TTypes::Tensor output, \ + typename TTypes::ConstTensor input, \ + const Eigen::DSizes& indices, \ + const Eigen::DSizes& sizes); \ extern template struct Slice; #define DECLARE_FOR_N(T) \ diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h index 0362a021336..db7eded745e 100644 --- a/tensorflow/core/kernels/slice_op.h +++ b/tensorflow/core/kernels/slice_op.h @@ -24,7 +24,6 @@ limitations under the License. 
namespace tensorflow { namespace functor { - template struct Slice { void operator()(const Device& d, typename TTypes::Tensor output, diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h index 47f1d5342a9..64b6948190a 100644 --- a/tensorflow/core/kernels/slice_op_cpu_impl.h +++ b/tensorflow/core/kernels/slice_op_cpu_impl.h @@ -43,7 +43,7 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_SYCL_KERNELS); DEFINE_SYCL_KERNELS(int32); #undef DEFINE_SYCL_KERNELS -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc index 590f01c4691..e1712ac239d 100644 --- a/tensorflow/core/kernels/softmax_op.cc +++ b/tensorflow/core/kernels/softmax_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL // Partial specialization for a CPUDevice, that uses the Eigen implementation // from SoftmaxEigenImpl. @@ -48,7 +48,7 @@ struct SoftmaxFunctor : SoftmaxFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct SoftmaxFunctor : SoftmaxFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor template @@ -100,5 +100,5 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("Softmax").Device(DEVICE_SYCL).TypeConstraint("T"), SoftmaxOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc index c25ce2d8bb5..92ddf8edbfb 100644 --- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc +++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc @@ -70,7 +70,7 @@ static Graph* ConstructSpaceToBatchGraph( } \ BENCHMARK( \ BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11); -#define BM_SpaceToBatch(OP, ...) \ +#define BM_SpaceToBatch(OP, ...) 
\ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \ BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__)); \ diff --git a/tensorflow/core/kernels/spacetobatch_functor.cc b/tensorflow/core/kernels/spacetobatch_functor.cc index 23d8a5f9ed4..4c374b8d994 100644 --- a/tensorflow/core/kernels/spacetobatch_functor.cc +++ b/tensorflow/core/kernels/spacetobatch_functor.cc @@ -154,7 +154,7 @@ struct SpaceToBatchFunctor { #define INSTANTIATE(NUM_BLOCK_DIMS, T) \ template struct SpaceToBatchFunctor; \ template struct SpaceToBatchFunctor; \ -/**/ + /**/ #define INSTANTIATE_FOR_T(T) \ TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T) diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h index 06813650c08..f46a84da1e9 100644 --- a/tensorflow/core/kernels/spacetobatch_functor.h +++ b/tensorflow/core/kernels/spacetobatch_functor.h @@ -44,7 +44,7 @@ constexpr int kMaxSpaceToBatchBlockDims = 4; MACRO(2 /**/, ##__VA_ARGS__) \ MACRO(3 /**/, ##__VA_ARGS__) \ MACRO(4 /**/, ##__VA_ARGS__) \ -/**/ + /**/ namespace internal { namespace spacetobatch { diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc index db8d419c38f..5687141c9ea 100644 --- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc +++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc @@ -141,10 +141,10 @@ struct SpaceToBatchFunctor { } CudaLaunchConfig config = GetCudaLaunchConfig(static_cast(total_count), d); - S2B<<>>( - config.virtual_thread_count, const_cast(space_tensor.data()), args, - const_cast(batch_tensor.data())); + S2B + <<>>( + config.virtual_thread_count, const_cast(space_tensor.data()), + args, const_cast(batch_tensor.data())); return Status::OK(); } }; @@ -153,7 +153,7 @@ struct SpaceToBatchFunctor { #define INSTANTIATE(NUM_BLOCK_DIMS, T) \ template struct SpaceToBatchFunctor; \ template struct SpaceToBatchFunctor; \ -/**/ + /**/ #define INSTANTIATE_FOR_T(T) \ TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T) diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc index 95c1f5e7e8c..fdc08ec8e3b 100644 --- a/tensorflow/core/kernels/spacetobatch_op.cc +++ b/tensorflow/core/kernels/spacetobatch_op.cc @@ -58,9 +58,10 @@ void SpaceToBatchOpCompute(OpKernelContext* context, errors::InvalidArgument("input rank should be >= ", 1 + block_dims, " instead of ", orig_input_tensor.dims())); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_paddings.shape()) && - block_dims == orig_paddings.dim_size(0) && - 2 == orig_paddings.dim_size(1), + OP_REQUIRES(context, + TensorShapeUtils::IsMatrix(orig_paddings.shape()) && + block_dims == orig_paddings.dim_size(0) && + 2 == orig_paddings.dim_size(1), errors::InvalidArgument("paddings should have shape [", block_dims, ", 2] instead of ", orig_paddings.shape().DebugString())); diff --git a/tensorflow/core/kernels/sparse_add_grad_op.cc b/tensorflow/core/kernels/sparse_add_grad_op.cc index d8ed0c6f0c2..8597f3a8f73 100644 --- a/tensorflow/core/kernels/sparse_add_grad_op.cc +++ b/tensorflow/core/kernels/sparse_add_grad_op.cc @@ -35,9 +35,10 @@ class SparseAddGradOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices)); OP_REQUIRES_OK(ctx, ctx->input("sum_indices", &sum_indices)); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) && - TensorShapeUtils::IsMatrix(b_indices->shape()) && - 
TensorShapeUtils::IsMatrix(sum_indices->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsMatrix(a_indices->shape()) && + TensorShapeUtils::IsMatrix(b_indices->shape()) && + TensorShapeUtils::IsMatrix(sum_indices->shape()), errors::InvalidArgument( "Input indices should be matrices but received shapes: ", a_indices->shape().DebugString(), " and ", @@ -49,8 +50,9 @@ class SparseAddGradOp : public OpKernel { "Input backprop_val_grad should be a vector but received shape: ", backprop_val_grad->shape().DebugString())); OP_REQUIRES( - ctx, a_indices->dim_size(1) == b_indices->dim_size(1) && - b_indices->dim_size(1) == sum_indices->dim_size(1), + ctx, + a_indices->dim_size(1) == b_indices->dim_size(1) && + b_indices->dim_size(1) == sum_indices->dim_size(1), errors::InvalidArgument("The densified operands should have the same " "ndims; for A, B, sum got: ", a_indices->dim_size(1), b_indices->dim_size(1), diff --git a/tensorflow/core/kernels/sparse_add_op.cc b/tensorflow/core/kernels/sparse_add_op.cc index bd91dfdce64..d16317af671 100644 --- a/tensorflow/core/kernels/sparse_add_op.cc +++ b/tensorflow/core/kernels/sparse_add_op.cc @@ -34,8 +34,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices)); OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices)); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) && - TensorShapeUtils::IsMatrix(b_indices->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsMatrix(a_indices->shape()) && + TensorShapeUtils::IsMatrix(b_indices->shape()), errors::InvalidArgument( "Input indices should be matrices but received shapes: ", a_indices->shape().DebugString(), " and ", @@ -46,8 +47,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values_t)); OP_REQUIRES_OK(ctx, ctx->input("b_values", &b_values_t)); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) && - TensorShapeUtils::IsVector(b_values_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_values_t->shape()) && + TensorShapeUtils::IsVector(b_values_t->shape()), errors::InvalidArgument( "Input values should be vectors but received shapes: ", a_values_t->shape().DebugString(), " and ", @@ -62,8 +64,9 @@ class SparseAddOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape)); OP_REQUIRES_OK(ctx, ctx->input("b_shape", &b_shape)); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()) && - TensorShapeUtils::IsVector(b_shape->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_shape->shape()) && + TensorShapeUtils::IsVector(b_shape->shape()), errors::InvalidArgument( "Input shapes should be a vector but received shapes ", a_shape->shape().DebugString(), " and ", diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc index 4cad02bbee8..1f08e6c5ce2 100644 --- a/tensorflow/core/kernels/sparse_add_op_test.cc +++ b/tensorflow/core/kernels/sparse_add_op_test.cc @@ -61,9 +61,9 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); #define ADD_TENSOR_INPUT() \ diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc index 
c122616cf15..80bc1f19344 100644 --- a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc +++ b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc @@ -103,8 +103,9 @@ class SparseAccumulatorTakeGradientOp DoneCallback callback) override { // Check signature OP_REQUIRES_OK_ASYNC( - ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, - {DT_INT64, accumulator->dtype(), DT_INT64}), + ctx, + ctx->MatchSignature({DT_STRING_REF, DT_INT32}, + {DT_INT64, accumulator->dtype(), DT_INT64}), callback); } diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc index 07d935d55fe..7cd4532ad63 100644 --- a/tensorflow/core/kernels/sparse_cross_op.cc +++ b/tensorflow/core/kernels/sparse_cross_op.cc @@ -288,8 +288,7 @@ struct CrossTraits { template class SparseCrossOp : public OpKernel { public: - explicit SparseCrossOp(OpKernelConstruction* context) - : OpKernel(context) { + explicit SparseCrossOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_)); // Read signed_hash_key_ as int64 since uint64 attributes are not // supported by REGISTER_OP. @@ -316,8 +315,8 @@ class SparseCrossOp : public OpKernel { GenerateColumnsFromInput(indices_list_in, values_list_in, shapes_list_in, dense_list_in); - typename CrossTraits::Crosser - crosser(columns, num_buckets_, hash_key_); + typename CrossTraits::Crosser crosser( + columns, num_buckets_, hash_key_); Tensor* indices_out; Tensor* values_out; Tensor* shape_out; @@ -326,8 +325,8 @@ class SparseCrossOp : public OpKernel { CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out, &shape_out, &output_start_indices); - typename CrossTraits::Updater - updater(output_start_indices, indices_out, values_out); + typename CrossTraits::Updater updater( + output_start_indices, indices_out, values_out); auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) { for (int b = begin; b < end; b++) { ProductIterator product_iterator(columns, b); @@ -381,8 +380,9 @@ class SparseCrossOp : public OpKernel { "Input values should be a std::vector but received shape ", values_list_in[i].shape().DebugString(), " at position ", i)); OP_REQUIRES( - context, indices_list_in[i].shape().dim_size(0) == - values_list_in[i].shape().dim_size(0), + context, + indices_list_in[i].shape().dim_size(0) == + values_list_in[i].shape().dim_size(0), errors::InvalidArgument( "Expected size of values to be ", indices_list_in[i].shape().dim_size(0), " got ", diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc index cc0f86ce05e..ac48202ada2 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc @@ -70,8 +70,9 @@ class SparseDenseBinaryOpShared : public OpKernel { errors::InvalidArgument( "Input sp_indices should be a matrix but received shape: ", indices_t->shape().DebugString())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(values_t->shape()) && - TensorShapeUtils::IsVector(shape_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(values_t->shape()) && + TensorShapeUtils::IsVector(shape_t->shape()), errors::InvalidArgument( "Inputs sp_values and sp_shape should be vectors " "but received shapes: ", @@ -150,8 +151,9 @@ class SparseDenseBinaryOpShared : public OpKernel { CASE(4); CASE(5); default: - OP_REQUIRES(ctx, false, errors::InvalidArgument( - "Only tensors with ranks between 1 
and 5 " + OP_REQUIRES( + ctx, false, + errors::InvalidArgument("Only tensors with ranks between 1 and 5 " "are currently supported. Tensor rank: ", ndims)); #undef CASE diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc index eaf1884243e..fe198af7e6c 100644 --- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc +++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc @@ -96,9 +96,9 @@ TEST_F(SparseDenseCDivTest, SameShape) { // [2 ] cdiv [dense: same shape, all 1's] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); // Tensor dense(DT_FLOAT, TensorShape({3, 1})); @@ -125,9 +125,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) { // [2 ] cdiv [dense: shape [3,1], all 1's] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({3, 1})); @@ -152,9 +152,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) { // [2 ] cdiv [dense: shape [2]] // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({2})); @@ -184,9 +184,9 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) { // [1 ?] where ? remains implicitly zero. // [1.5 0] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); Tensor dense(DT_FLOAT, TensorShape({2})); diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc index 8ab23b64d3d..a1f9667b783 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.cc +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -159,8 +159,8 @@ struct SparseSlice { template template -void SparseSlice::Initialize(const typename SparseSlice::ConstMatrixMap& mat, - int col_offset) { +void SparseSlice::Initialize( + const typename SparseSlice::ConstMatrixMap& mat, int col_offset) { const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0); const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1); DCHECK_LE(num_rows, mat_rows); @@ -278,9 +278,9 @@ ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) { float out = 0; auto tmp = reinterpret_cast(&out); #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - tmp[0] = *src; + tmp[0] = *src; #else - tmp[1] = *src; + tmp[1] = *src; #endif return out; } @@ -970,9 +970,9 @@ class SparseMatMulOp : public OpKernel { const int k2 = transpose_b_ ? 
b.dim_size(1) : b.dim_size(0); OP_REQUIRES(ctx, k == k2, - errors::InvalidArgument("Matrix size incompatible: a: ", - a.shape().DebugString(), ", b: ", - b.shape().DebugString())); + errors::InvalidArgument( + "Matrix size incompatible: a: ", a.shape().DebugString(), + ", b: ", b.shape().DebugString())); Tensor* output = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output)); @@ -1224,8 +1224,9 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src, template inline BlockingCounter* SparseMatMul::ShuffleMatrix( - const typename SparseMatMul::ConstMatrixMapR& mat, int slice_row_start, - int slice_num_rows, int slice_col_start, int slice_num_cols, const int N, + const typename SparseMatMul::ConstMatrixMapR& mat, + int slice_row_start, int slice_num_rows, int slice_col_start, + int slice_num_cols, const int N, const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) { DCHECK_EQ(N % 2, 0); DCHECK_LE(kNumOperands * sizeof(float) / sizeof(TR), N); @@ -1306,8 +1307,9 @@ inline std::unique_ptr SparseMatMul::CreateDenseSlices( template inline void SparseMatMul::ComputeBlockSizes( const typename SparseMatMul::ConstMatrixMapL& left, - const typename SparseMatMul::ConstMatrixMapR& right, bool transpose_left, - int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) { + const typename SparseMatMul::ConstMatrixMapR& right, + bool transpose_left, int num_threads, int* KR, int* NR, int* KL, int* JB, + int* IB) { // Heuristics for calculating block sizes // Assume two hyperthreads per core. const int est_num_cores = std::max(1, (num_threads + 1) / 2); diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h index cca52558ae2..14ef2ed7044 100644 --- a/tensorflow/core/kernels/sparse_matmul_op.h +++ b/tensorflow/core/kernels/sparse_matmul_op.h @@ -159,25 +159,25 @@ EIGEN_STRONG_INLINE Packet4f pload2bf16(const float* from) { // Return a packet with the first value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_first(const Packet4f& a) { - return vec_splat (a, 0); + return vec_splat(a, 0); } // Return a packet with the second value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_second(const Packet4f& a) { - return vec_splat (a, 1); + return vec_splat(a, 1); } // Return a packet with the third value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_third(const Packet4f& a) { - return vec_splat (a, 2); + return vec_splat(a, 2); } // Return a packet with the fourth value of the input Packet replicated template <> EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth(const Packet4f& a) { - return vec_splat (a, 3); + return vec_splat(a, 3); } #endif diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc index f815ca9e344..ebc6d8fa4ec 100644 --- a/tensorflow/core/kernels/sparse_matmul_op_test.cc +++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc @@ -284,11 +284,11 @@ class SparseMatmulOpTest : public ::testing::Test { uint16_t* data3_bfloat16_p = reinterpret_cast(data3_bfloat16) + i; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - data3_p[1] = 0; - data3_bfloat16_p[0] = data3_p[0]; + data3_p[1] = 0; + data3_bfloat16_p[0] = data3_p[0]; #else - data3_p[0] = 0; - data3_bfloat16_p[0] = data3_p[1]; + data3_p[0] = 0; + data3_bfloat16_p[0] = data3_p[1]; #endif } } diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc 
b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc index 110376be425..96246c7a712 100644 --- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc +++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc @@ -51,9 +51,9 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); AddInputFromArray(indices_shape, indices); @@ -93,9 +93,9 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) { // [3 4] const auto indices_shape = TensorShape({4, 2}); - std::initializer_list in{ 0, 1, 1, 0, 2, 0, 2, 1 }; + std::initializer_list in{0, 1, 1, 0, 2, 0, 2, 1}; const gtl::ArraySlice indices(in); - std::initializer_list sh{ 3, 2 }; + std::initializer_list sh{3, 2}; const gtl::ArraySlice shape(sh); AddInputFromArray(indices_shape, indices); diff --git a/tensorflow/core/kernels/sparse_softmax_op.cc b/tensorflow/core/kernels/sparse_softmax_op.cc index 327a94b8a12..444a5f657a9 100644 --- a/tensorflow/core/kernels/sparse_softmax_op.cc +++ b/tensorflow/core/kernels/sparse_softmax_op.cc @@ -50,8 +50,9 @@ class SparseSoftmaxOp : public OpKernel { errors::InvalidArgument( "Input sp_indices should be a matrix but received shape: ", indices_t->shape().DebugString())); - OP_REQUIRES(context, TensorShapeUtils::IsVector(values_t->shape()) && - TensorShapeUtils::IsVector(shape_t->shape()), + OP_REQUIRES(context, + TensorShapeUtils::IsVector(values_t->shape()) && + TensorShapeUtils::IsVector(shape_t->shape()), errors::InvalidArgument( "Inputs sp_values and sp_shape should be vectors " "but received shapes: ", diff --git a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc index b027adba6b3..09cb2a6a71c 100644 --- a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc +++ b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc @@ -132,14 +132,16 @@ class SparseSparseBinaryOpShared : public OpKernel { // Validations. 
OP_REQUIRES( - ctx, TensorShapeUtils::IsMatrix(a_indices_t->shape()) && - TensorShapeUtils::IsMatrix(b_indices_t->shape()), + ctx, + TensorShapeUtils::IsMatrix(a_indices_t->shape()) && + TensorShapeUtils::IsMatrix(b_indices_t->shape()), errors::InvalidArgument("Inputs a_indices and b_indices should be " "matrices but received shapes: ", a_indices_t->shape().DebugString(), ", ", b_indices_t->shape().DebugString())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) && - TensorShapeUtils::IsVector(b_values_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_values_t->shape()) && + TensorShapeUtils::IsVector(b_values_t->shape()), errors::InvalidArgument( "Inputs a_values and b_values should be vectors " "but received shapes: ", @@ -157,8 +159,9 @@ class SparseSparseBinaryOpShared : public OpKernel { " non-empty input values, got ", a_values.size(), " and ", b_values.size())); - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape_t->shape()) && - TensorShapeUtils::IsVector(b_shape_t->shape()), + OP_REQUIRES(ctx, + TensorShapeUtils::IsVector(a_shape_t->shape()) && + TensorShapeUtils::IsVector(b_shape_t->shape()), errors::InvalidArgument( "Input shapes should be a vector but received shapes ", a_shape_t->shape().DebugString(), " and ", diff --git a/tensorflow/core/kernels/sparse_split_op.cc b/tensorflow/core/kernels/sparse_split_op.cc index 6171b532aa2..67dcf05a6ce 100644 --- a/tensorflow/core/kernels/sparse_split_op.cc +++ b/tensorflow/core/kernels/sparse_split_op.cc @@ -48,18 +48,20 @@ class SparseSplitOp : public OpKernel { "Input shape should be a vector but received shape ", input_shape.shape().DebugString())); - OP_REQUIRES(context, input_shape.dim_size(0) && - split_dim < input_shape.vec().size(), - errors::InvalidArgument( - "Input split_dim should be between 0 and rank (", - input_shape.vec().size(), "), got ", split_dim)); + OP_REQUIRES( + context, + input_shape.dim_size(0) && split_dim < input_shape.vec().size(), + errors::InvalidArgument( + "Input split_dim should be between 0 and rank (", + input_shape.vec().size(), "), got ", split_dim)); - OP_REQUIRES(context, num_split_ >= 1 && - num_split_ <= input_shape.vec()(split_dim), - errors::InvalidArgument("Input num_split should be between 1 " - "and the splitting dimension size (", - input_shape.vec()(split_dim), - "), got ", num_split_)); + OP_REQUIRES( + context, + num_split_ >= 1 && num_split_ <= input_shape.vec()(split_dim), + errors::InvalidArgument("Input num_split should be between 1 " + "and the splitting dimension size (", + input_shape.vec()(split_dim), "), got ", + num_split_)); sparse::SparseTensor sparse_tensor(input_indices, input_values, TensorShape(input_shape.vec())); diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc index 6a6cc3d8138..ba3da21a433 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op.cc @@ -73,8 +73,9 @@ class SparseToDense : public OpKernel { // sparse_values const Tensor& sparse_values = c->input(2); const int64 num_values = sparse_values.NumElements(); - OP_REQUIRES(c, sparse_values.dims() == 0 || - (sparse_values.dims() == 1 && num_values == num_elems), + OP_REQUIRES(c, + sparse_values.dims() == 0 || + (sparse_values.dims() == 1 && num_values == num_elems), errors::InvalidArgument("sparse_values has incorrect shape ", sparse_values.shape().DebugString(), ", should be [] or [", num_elems, "]")); diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc 
b/tensorflow/core/kernels/sparse_to_dense_op_test.cc index f0d19da8046..d8b0f930824 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc @@ -38,7 +38,6 @@ namespace { class SparseToDenseTest : public OpsTestBase { protected: - void MakeOp(int dim, DataType index_type, DataType value_type) { TF_ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense") .Input(FakeInput(index_type)) diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc index c35ba42db29..f84ffd53238 100644 --- a/tensorflow/core/kernels/sparse_xent_op.cc +++ b/tensorflow/core/kernels/sparse_xent_op.cc @@ -39,10 +39,10 @@ Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) { if (*min_max_dim_value.first < 0 || *min_max_dim_value.second >= max_index) { bad_index = (*min_max_dim_value.first < 0) ? *min_max_dim_value.first : *min_max_dim_value.second; - return errors::InvalidArgument("Received a label value of ", bad_index, - " which is outside the valid range of [0, ", - max_index, "). Label values: ", - labels.SummarizeValue(labels.NumElements())); + return errors::InvalidArgument( + "Received a label value of ", bad_index, + " which is outside the valid range of [0, ", max_index, + "). Label values: ", labels.SummarizeValue(labels.NumElements())); } return Status::OK(); } diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc index b8ea0d2d7e2..afb0bf76267 100644 --- a/tensorflow/core/kernels/sparse_xent_op_test.cc +++ b/tensorflow/core/kernels/sparse_xent_op_test.cc @@ -41,10 +41,10 @@ static Graph* SparseXent(int batch_size, int num_classes) { return g; } -#define BM_SparseXentDev(BATCH, CLASS, DEVICE) \ - static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \ +#define BM_SparseXentDev(BATCH, CLASS, DEVICE) \ + static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \ testing::ItemsProcessed(static_cast(iters) * BATCH * CLASS); \ - test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters); \ + test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters); \ } \ BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE); diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h index ff92ffeeb38..a08949e626c 100644 --- a/tensorflow/core/kernels/split_lib.h +++ b/tensorflow/core/kernels/split_lib.h @@ -57,7 +57,7 @@ struct Split { const Eigen::DSizes& slice_indices, const Eigen::DSizes& slice_sizes); }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc index 25026208d1e..771c633b156 100644 --- a/tensorflow/core/kernels/split_lib_cpu.cc +++ b/tensorflow/core/kernels/split_lib_cpu.cc @@ -49,13 +49,13 @@ void Split::operator()( typename TTypes::ConstTensor input, const Eigen::DSizes& slice_indices, const Eigen::DSizes& slice_sizes) { - output.device(d) = input.slice(slice_indices, slice_sizes); + output.device(d) = input.slice(slice_indices, slice_sizes); } #define DEFINE_SYCL_KERNELS(T) template struct Split; TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc index 78badde27e5..85f529326db 100644 --- 
a/tensorflow/core/kernels/split_op.cc +++ b/tensorflow/core/kernels/split_op.cc @@ -39,7 +39,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SplitOpBase : public OpKernel { @@ -142,8 +142,9 @@ class SplitOpCPU : public SplitOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -245,10 +246,11 @@ class SplitOpGPU : public SplitOpBase { const int32 split_dim = split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig; const int32 num_split = Base::num_outputs(); - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("Split on GPU requires input size " - "< max int32")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("Split on GPU requires input size " + "< max int32")); int32 prefix_dim_size; int32 split_dim_size; int32 suffix_dim_size; @@ -304,8 +306,9 @@ class SplitOpSYCL : public SplitOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -342,14 +345,14 @@ class SplitOpSYCL : public SplitOpBase { {prefix_dim_size, split_dim_output_size, suffix_dim_size}); functor::Split()(context->eigen_device(), - result_shaped, input_reshaped, - slice_indices, slice_sizes); + result_shaped, input_reshaped, + slice_indices, slice_sizes); } indices[1] += split_dim_output_size; } } }; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #define REGISTER_SPLIT(type) \ REGISTER_KERNEL_BUILDER(Name("Split") \ @@ -381,11 +384,11 @@ REGISTER_GPU(bfloat16); #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL(type) \ - REGISTER_KERNEL_BUILDER(Name("Split") \ - .Device(DEVICE_SYCL) \ - .TypeConstraint("T") \ - .HostMemory("split_dim"), \ +#define REGISTER_SYCL(type) \ + REGISTER_KERNEL_BUILDER(Name("Split") \ + .Device(DEVICE_SYCL) \ + .TypeConstraint("T") \ + .HostMemory("split_dim"), \ SplitOpSYCL) TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL); diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc index f1078ac349c..7ff5df47d70 100644 --- a/tensorflow/core/kernels/split_v_op.cc +++ b/tensorflow/core/kernels/split_v_op.cc @@ -197,8 +197,9 @@ class SplitVOpCPU : public SplitVOpBase { // Android also uses int32 indexing, so check here also. OP_REQUIRES( - context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), + context, + FastBoundsCheck(input.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("Split requires input size < ", std::numeric_limits::max())); @@ -305,10 +306,11 @@ class SplitVOpGPU : public SplitVOpBase { const int32 split_dim_orig = context->input(2).flat()(0); const int32 split_dim = split_dim_orig < 0 ? 
split_dim_orig + input.dims() : split_dim_orig; - OP_REQUIRES(context, FastBoundsCheck(input.NumElements(), - std::numeric_limits::max()), - errors::InvalidArgument("Split on GPU requires input size " - "< max int32")); + OP_REQUIRES( + context, + FastBoundsCheck(input.NumElements(), std::numeric_limits::max()), + errors::InvalidArgument("Split on GPU requires input size " + "< max int32")); int32 prefix_dim_size; int32 split_dim_size; diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc index affe81a5556..65296f61fd1 100644 --- a/tensorflow/core/kernels/stack_ops.cc +++ b/tensorflow/core/kernels/stack_ops.cc @@ -42,7 +42,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class Stack : public ResourceBase { public: @@ -242,7 +242,7 @@ REGISTER_KERNEL_BUILDER(Name("StackV2") .HostMemory("max_size") .HostMemory("handle"), StackOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class StackPushOp : public AsyncOpKernel { @@ -274,11 +274,11 @@ class StackPushOp : public AsyncOpKernel { static constexpr int kCopyThreshold = 2048; static constexpr double kOccupancy = 0.7; if (swap_memory_ && !alloc_attrs.on_host() && - ( std::is_same::value + (std::is_same::value #ifdef TENSORFLOW_USE_SYCL - || std::is_same::value -#endif // TENSORFLOW_USE_SYCL - ) && + || std::is_same::value +#endif // TENSORFLOW_USE_SYCL + ) && tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) { DeviceContext* device_ctxt = ctx->op_device_context(); auto device = static_cast(ctx->device()); @@ -391,7 +391,7 @@ REGISTER_SYCL_HOST_KERNEL(int32); REGISTER_SYCL_HOST_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StackPopOp : public AsyncOpKernel { public: @@ -498,7 +498,7 @@ REGISTER_SYCL_HOST_KERNEL(bool); #undef REGISTER_SYCL_KERNEL #undef REGISTER_SYCL_HOST_KERNEL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StackCloseOp : public OpKernel { public: @@ -526,6 +526,6 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( Name("StackCloseV2").Device(DEVICE_SYCL).HostMemory("handle"), StackCloseOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc index 0fae46dea61..03fc4467a1d 100644 --- a/tensorflow/core/kernels/stage_op.cc +++ b/tensorflow/core/kernels/stage_op.cc @@ -70,12 +70,11 @@ class Buffer : public ResourceBase { return bytes + current_bytes_ > memory_limit_; } - std::size_t GetTupleBytes(const Tuple & tuple) - { + std::size_t GetTupleBytes(const Tuple& tuple) { return std::accumulate(tuple.begin(), tuple.end(), 0, - [](const std::size_t & lhs, const Tensor & rhs) { - return lhs + rhs.TotalBytes(); - }); + [](const std::size_t& lhs, const Tensor& rhs) { + return lhs + rhs.TotalBytes(); + }); } public: @@ -90,19 +89,22 @@ class Buffer : public ResourceBase { std::size_t tuple_bytes = GetTupleBytes(*tuple); // Sanity check so that we don't block for ever below - if(memory_limit_ > 0 && tuple_bytes > memory_limit_) { - return Status(errors::ResourceExhausted("Attempted to insert " - "tensors with combined size of '", tuple_bytes, "' bytes into " - "Staging Area with a memory limit of '", memory_limit_, "'.")); + if (memory_limit_ > 0 && tuple_bytes > 
memory_limit_) { + return Status( + errors::ResourceExhausted("Attempted to insert " + "tensors with combined size of '", + tuple_bytes, + "' bytes into " + "Staging Area with a memory limit of '", + memory_limit_, "'.")); } - // If buffer capacity is bounded wait until elements have been removed - if(IsBounded()) { + if (IsBounded()) { full_cond_var_.wait(lock, [tuple_bytes, this]() { // If there's a memory limit, check if there's space for insertion - bool memory_limit_valid = memory_limit_ > 0 ? - !WouldExceedMemoryLimit(tuple_bytes) : true; + bool memory_limit_valid = + memory_limit_ > 0 ? !WouldExceedMemoryLimit(tuple_bytes) : true; // If we're configured for capacity check if there's space for insertion bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true; @@ -186,8 +188,7 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) { ContainerInfo cinfo; // Lambda for creating the Staging Area - auto create_fn = [&ndef](Buffer** ret) -> Status - { + auto create_fn = [&ndef](Buffer** ret) -> Status { int64 capacity; int64 memory_limit; TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity)); @@ -196,7 +197,6 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) { return Status::OK(); }; - TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */)); TF_RETURN_IF_ERROR(rm->LookupOrCreate(cinfo.container(), cinfo.name(), buf, create_fn)); @@ -228,7 +228,7 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class UnstageOp : public OpKernel { public: @@ -244,7 +244,8 @@ class UnstageOp : public OpKernel { buf->Get(&tuple); - OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(), + OP_REQUIRES( + ctx, tuple.size() == (size_t)ctx->num_outputs(), errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(), " vs. ", ctx->num_outputs())); @@ -260,7 +261,7 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL class StagePeekOp : public OpKernel { public: @@ -278,7 +279,8 @@ class StagePeekOp : public OpKernel { OP_REQUIRES_OK(ctx, buf->Peek(index, &tuple)); - OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(), + OP_REQUIRES( + ctx, tuple.size() == (size_t)ctx->num_outputs(), errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(), " vs. ", ctx->num_outputs())); @@ -288,17 +290,15 @@ class StagePeekOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), - StagePeekOp); +REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp); #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index"). 
- Device(DEVICE_GPU), StagePeekOp); +REGISTER_KERNEL_BUILDER( + Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp); #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index") - .Device(DEVICE_SYCL), StagePeekOp); -#endif // TENSORFLOW_USE_SYCL - +REGISTER_KERNEL_BUILDER( + Name("StagePeek").HostMemory("index").Device(DEVICE_SYCL), StagePeekOp); +#endif // TENSORFLOW_USE_SYCL class StageSizeOp : public OpKernel { public: @@ -312,9 +312,8 @@ class StageSizeOp : public OpKernel { core::ScopedUnref scope(buf); // Allocate size output tensor - Tensor * size = nullptr; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), - &size)); + Tensor* size = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size)); // Set it to the actual size size->scalar().setConstant(buf->Size()); @@ -323,13 +322,13 @@ class StageSizeOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp); #if GOOGLE_CUDA -REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size") - .Device(DEVICE_GPU), StageSizeOp); +REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU), + StageSizeOp); #endif #ifdef TENSORFLOW_USE_SYCL -REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size") - .Device(DEVICE_SYCL), StageSizeOp); -#endif // TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER( + Name("StageSize").HostMemory("size").Device(DEVICE_SYCL), StageSizeOp); +#endif // TENSORFLOW_USE_SYCL class StageClearOp : public OpKernel { public: @@ -352,7 +351,6 @@ REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp); #endif #ifdef TENSORFLOW_USE_SYCL REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_SYCL), StageClearOp); -#endif // TENSORFLOW_USE_SYCL - +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index 7c213e14d21..7745effe2ab 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -294,6 +294,11 @@ class StridedSliceAssignOp : public OpKernel { OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &v)); old_lhs = *v->tensor(); + OP_REQUIRES(context, old_lhs.dtype() == DataTypeToEnum::value, + errors::InvalidArgument( + "l-value dtype ", DataTypeString(old_lhs.dtype()), + " does not match r-value dtype ", + DataTypeString(DataTypeToEnum::value))); } else { context->forward_ref_input_to_ref_output(0, 0); old_lhs = context->mutable_input(0, true); @@ -541,5 +546,5 @@ REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign") .HostMemory("strides"), StridedSliceAssignOp) #undef REGISTER_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/strided_slice_op.h b/tensorflow/core/kernels/strided_slice_op.h index 0f72c4b7710..2b586322986 100644 --- a/tensorflow/core/kernels/strided_slice_op.h +++ b/tensorflow/core/kernels/strided_slice_op.h @@ -21,6 +21,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/resource_handle.h" #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h index a84ba38ef41..1c4472bb1ab 100644 --- a/tensorflow/core/kernels/strided_slice_op_impl.h +++ b/tensorflow/core/kernels/strided_slice_op_impl.h @@ -26,6 +26,8 @@ limitations under the License. #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/register_types_traits.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/dense_update_functor.h" #include "tensorflow/core/kernels/ops_util.h" @@ -302,7 +304,7 @@ DECLARE_FOR_N_SYCL(int32); DECLARE_FOR_N_SYCL(int64); #undef DECLARE_FOR_N_SYCL -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef INSTANTIATE #undef DECLARE_FOR_N_CPU diff --git a/tensorflow/core/kernels/string_join_op.cc b/tensorflow/core/kernels/string_join_op.cc index 721702bec68..28cca9f4484 100644 --- a/tensorflow/core/kernels/string_join_op.cc +++ b/tensorflow/core/kernels/string_join_op.cc @@ -50,9 +50,9 @@ class StringJoinOp : public OpKernel { } else { OP_REQUIRES( context, input_shape == input.shape(), - errors::InvalidArgument("Input shapes do not match: ", - input_shape.DebugString(), " vs. ", - input.shape().DebugString())); + errors::InvalidArgument( + "Input shapes do not match: ", input_shape.DebugString(), + " vs. ", input.shape().DebugString())); } } } diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc index 743f1131504..e29f67297f9 100644 --- a/tensorflow/core/kernels/substr_op.cc +++ b/tensorflow/core/kernels/substr_op.cc @@ -95,9 +95,9 @@ class SubstrOp : public OpKernel { // Create BCast helper with shape of input and pos/len BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(pos_shape)); OP_REQUIRES(context, bcast.IsValid(), - errors::InvalidArgument("Incompatible shapes: ", - input_shape.DebugString(), " vs. ", - pos_shape.DebugString())); + errors::InvalidArgument( + "Incompatible shapes: ", input_shape.DebugString(), + " vs. 
", pos_shape.DebugString())); TensorShape output_shape = BCast::ToShape(bcast.result_shape()); int ndims = output_shape.dims(); Tensor* output_tensor = nullptr; diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc index 233b824bcc3..29b21ee7353 100644 --- a/tensorflow/core/kernels/summary_image_op.cc +++ b/tensorflow/core/kernels/summary_image_op.cc @@ -54,18 +54,20 @@ class SummaryImageOp : public OpKernel { const Tensor& tensor = c->input(1); OP_REQUIRES(c, IsLegacyScalar(tags.shape()), errors::InvalidArgument("Tags must be a scalar")); - OP_REQUIRES(c, tensor.dims() == 4 && - (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || - tensor.dim_size(3) == 4), + OP_REQUIRES(c, + tensor.dims() == 4 && + (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || + tensor.dim_size(3) == 4), errors::InvalidArgument( "Tensor must be 4-D with last dim 1, 3, or 4, not ", tensor.shape().DebugString())); const string& base_tag = tags.scalar()(); - OP_REQUIRES(c, tensor.dim_size(0) < (1LL << 31) && - tensor.dim_size(1) < (1LL << 31) && - tensor.dim_size(2) < (1LL << 31) && - (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29), + OP_REQUIRES(c, + tensor.dim_size(0) < (1LL << 31) && + tensor.dim_size(1) < (1LL << 31) && + tensor.dim_size(2) < (1LL << 31) && + (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29), errors::InvalidArgument("Tensor too large for summary ", tensor.shape().DebugString())); diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc index b818724ec2e..1f4e3418f48 100644 --- a/tensorflow/core/kernels/summary_op.cc +++ b/tensorflow/core/kernels/summary_op.cc @@ -41,11 +41,12 @@ class SummaryScalarOp : public OpKernel { const Tensor& values = c->input(1); OP_REQUIRES( - c, tags.IsSameSize(values) || - (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())), - errors::InvalidArgument("tags and values not the same shape: ", - tags.shape().DebugString(), " != ", - values.shape().DebugString(), SingleTag(tags))); + c, + tags.IsSameSize(values) || + (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())), + errors::InvalidArgument( + "tags and values not the same shape: ", tags.shape().DebugString(), + " != ", values.shape().DebugString(), SingleTag(tags))); auto Ttags = tags.flat(); auto Tvalues = values.flat(); Summary s; diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc index b2fd669541d..f8144867014 100644 --- a/tensorflow/core/kernels/tile_functor_cpu.cc +++ b/tensorflow/core/kernels/tile_functor_cpu.cc @@ -15,10 +15,10 @@ limitations under the License. 
#define EIGEN_USE_THREADS -#include "tensorflow/core/kernels/tile_functor.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/tile_functor.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h index 054b31ef9e0..df6a666cd44 100644 --- a/tensorflow/core/kernels/tile_ops_cpu_impl.h +++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h @@ -63,7 +63,7 @@ TF_CALL_int64(DEFINE_TYPE); #undef DEFINE_DIM #undef DEFINE_TYPE -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // end namespace functor } // end namespace tensorflow diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index 38e77ab60fb..07befa27bc5 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -3279,7 +3279,6 @@ REGISTER_KERNELS(double, int64); #undef REGISTER_KERNELS - template class ApplyAddSignOp : public OpKernel { public: @@ -3362,17 +3361,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void ApplyAddSign::operator()( \ - const GPUDevice& d, \ - typename TTypes::Flat var, \ - typename TTypes::Flat m, \ - typename TTypes::ConstScalar lr, \ - typename TTypes::ConstScalar alpha, \ - typename TTypes::ConstScalar sign_decay, \ - typename TTypes::ConstScalar beta, \ - typename TTypes::ConstFlat grad); \ +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyAddSign::operator()( \ + const GPUDevice& d, typename TTypes::Flat var, \ + typename TTypes::Flat m, typename TTypes::ConstScalar lr, \ + typename TTypes::ConstScalar alpha, \ + typename TTypes::ConstScalar sign_decay, \ + typename TTypes::ConstScalar beta, \ + typename TTypes::ConstFlat grad); \ extern template struct ApplyAddSign; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); @@ -3387,7 +3384,6 @@ REGISTER_KERNELS(GPU, double); #undef REGISTER_CPU_KERNELS #undef REGISTER_KERNELS - template class ApplyPowerSignOp : public OpKernel { public: @@ -3470,17 +3466,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS); #if GOOGLE_CUDA // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void ApplyPowerSign::operator()( \ - const GPUDevice& d, \ - typename TTypes::Flat var, \ - typename TTypes::Flat m, \ - typename TTypes::ConstScalar lr, \ - typename TTypes::ConstScalar logbase, \ - typename TTypes::ConstScalar sign_decay, \ - typename TTypes::ConstScalar beta, \ - typename TTypes::ConstFlat grad); \ +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyPowerSign::operator()( \ + const GPUDevice& d, typename TTypes::Flat var, \ + typename TTypes::Flat m, typename TTypes::ConstScalar lr, \ + typename TTypes::ConstScalar logbase, \ + typename TTypes::ConstScalar sign_decay, \ + typename TTypes::ConstScalar beta, \ + typename TTypes::ConstFlat grad); \ extern template struct ApplyPowerSign; DECLARE_GPU_SPEC(Eigen::half); DECLARE_GPU_SPEC(float); diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc index d443a6b3c1d..0376a3b2c60 100644 --- a/tensorflow/core/kernels/training_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc @@ -17,8 +17,8 @@ limitations under the License. 
#define EIGEN_USE_GPU -#include "tensorflow/core/kernels/training_ops.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_ops.h" namespace tensorflow { @@ -115,13 +115,11 @@ struct ApplyAdam { Eigen::Sizes<1> single; const auto one = static_cast(1.0); m.device(d) = - m + - (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) * - (grad - m); + m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) * + (grad - m); v.device(d) = - v + - (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) * - (grad.square() - v); + v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) * + (grad.square() - v); if (use_nesterov) { var.device(d) -= @@ -157,9 +155,9 @@ struct ApplyRMSProp { bcast[0] = grad.dimension(0); Eigen::Sizes<1> single; const auto one = static_cast(1.0); - ms.device(d) = ms + - (rho.constant(one) - rho).reshape(single).broadcast(bcast) * - (grad.square() - ms); + ms.device(d) = + ms + (rho.constant(one) - rho).reshape(single).broadcast(bcast) * + (grad.square() - ms); mom.device(d) = mom * momentum.reshape(single).broadcast(bcast) + lr.reshape(single).broadcast(bcast) * grad / @@ -212,7 +210,7 @@ struct ApplyAddSign { auto beta_bcast = beta.reshape(single).broadcast(bcast); auto one_minus_beta = (beta.constant(one) - beta).reshape(single).broadcast(bcast); - m.device(d) = m * beta_bcast + grad * one_minus_beta; + m.device(d) = m * beta_bcast + grad * one_minus_beta; // The following is the GPU equivalent of the CPU version: // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad; @@ -244,7 +242,7 @@ struct ApplyPowerSign { auto beta_bcast = beta.reshape(single).broadcast(bcast); auto one_minus_beta = (beta.constant(one) - beta).reshape(single).broadcast(bcast); - m.device(d) = m * beta_bcast + grad * one_minus_beta; + m.device(d) = m * beta_bcast + grad * one_minus_beta; // The following is the GPU equivalent of the CPU version: // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp(); @@ -253,7 +251,7 @@ struct ApplyPowerSign { auto lr_bcast = lr.reshape(single).broadcast(bcast); auto logbase_bcast = logbase.reshape(single).broadcast(bcast); auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast); - auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp(); + auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp(); var.device(d) -= lr_bcast * grad_scale * grad; } }; diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc index ffa7f87c9ef..2dcc4a500e6 100644 --- a/tensorflow/core/kernels/training_ops_test.cc +++ b/tensorflow/core/kernels/training_ops_test.cc @@ -176,8 +176,9 @@ static void Adam(int32 n, Graph** init_g, Graph** train_g) { auto beta2 = Scalar(g, 0.99); auto epsilon = Scalar(g, 1e-8); auto grad = Random(g, n); - test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr, - beta1, beta2, epsilon, grad}); + test::graph::Multi( + g, "ApplyAdam", + {var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad}); *train_g = g; } } diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 2e0d18b634a..7177ad78884 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -176,9 +176,10 @@ void TransposeOp::Compute(OpKernelContext* ctx) { } } for (int i = 0; i < dims; ++i) { - OP_REQUIRES(ctx, bits[i], errors::InvalidArgument( - i, " is missing from {", - 
str_util::Join(permutation, ","), "}.")); + OP_REQUIRES( + ctx, bits[i], + errors::InvalidArgument(i, " is missing from {", + str_util::Join(permutation, ","), "}.")); } // 0-D, 1-D, and identity transposes do nothing. diff --git a/tensorflow/core/kernels/typed_queue.h b/tensorflow/core/kernels/typed_queue.h index 0d608d9b879..43dcb4cef74 100644 --- a/tensorflow/core/kernels/typed_queue.h +++ b/tensorflow/core/kernels/typed_queue.h @@ -58,9 +58,9 @@ Status TypedQueue::Initialize() { if (!component_shapes_.empty() && component_dtypes_.size() != component_shapes_.size()) { return errors::InvalidArgument( - "Different number of component types. ", "Types: ", - DataTypeSliceString(component_dtypes_), ", Shapes: ", - ShapeListString(component_shapes_)); + "Different number of component types. ", + "Types: ", DataTypeSliceString(component_dtypes_), + ", Shapes: ", ShapeListString(component_shapes_)); } mutex_lock lock(mu_); diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc index 397bdd56708..764b6a252ad 100644 --- a/tensorflow/core/kernels/unpack_op.cc +++ b/tensorflow/core/kernels/unpack_op.cc @@ -34,7 +34,7 @@ typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class UnpackOp : public OpKernel { @@ -65,8 +65,9 @@ class UnpackOp : public OpKernel { output_shape.RemoveDim(axis); const int64 output_size = output_shape.num_elements(); OP_REQUIRES( - context, FastBoundsCheck(output_size, - std::numeric_limits::max()), + context, + FastBoundsCheck(output_size, + std::numeric_limits::max()), errors::InvalidArgument("output size must fit in Eigen DenseIndex")); // This optimization is currently not applicable for SYCL devices diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc new file mode 100644 index 00000000000..da9ab01e8db --- /dev/null +++ b/tensorflow/core/kernels/unravel_index_op.cc @@ -0,0 +1,120 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +namespace { +template +struct mod_op { + const T operator()(const T& a, const T& b) const { return a % b; } +}; +} // namespace + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +class UnravelIndexOp : public OpKernel { + public: + explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& indices_tensor = ctx->input(0); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices_tensor.shape()) || + TensorShapeUtils::IsScalar(indices_tensor.shape()), + errors::InvalidArgument( + "The indices can only be scalar or vector, got \"", + indices_tensor.shape().DebugString(), "\"")); + + const Tensor& dims_tensor = ctx->input(1); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(dims_tensor.shape()), + errors::InvalidArgument("The indices can only be 1-D, got \"", + dims_tensor.shape().DebugString(), "\"")); + + auto dims = dims_tensor.vec(); + + Eigen::array reverse({true}); + + Tensor strides_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({dims_tensor.NumElements()}), + &strides_tensor)); + + auto strides = strides_tensor.vec(); + strides = dims.reverse(reverse) + .scan(0, Eigen::internal::ProdReducer(), false) + .reverse(reverse); + + Tensor strides_shifted_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({dims_tensor.NumElements()}), + &strides_shifted_tensor)); + + auto strides_shifted = strides_shifted_tensor.vec(); + strides_shifted = dims.reverse(reverse) + .scan(0, Eigen::internal::ProdReducer(), true) + .reverse(reverse); + + Tensor* output_tensor = nullptr; + if (TensorShapeUtils::IsScalar(indices_tensor.shape())) { + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}), + &output_tensor)); + + auto output = output_tensor->vec(); + + output = output.constant(indices_tensor.scalar()()); + output = output.binaryExpr(strides, mod_op()) / strides_shifted; + } else { + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, TensorShape({dims_tensor.NumElements(), + indices_tensor.NumElements()}), + &output_tensor)); + + auto output = output_tensor->matrix(); + + Eigen::array reshape{{dims_tensor.NumElements(), 1}}; + Eigen::array bcast({1, indices_tensor.NumElements()}); + Eigen::array indices_reshape{{1, indices_tensor.NumElements()}}; + Eigen::array indices_bcast({dims_tensor.NumElements(), 1}); + + output = indices_tensor.vec() + .reshape(indices_reshape) + .broadcast(indices_bcast); + output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast), + mod_op()) / + strides_shifted.reshape(reshape).broadcast(bcast); + } + } +}; + +#define REGISTER_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("UnravelIndex").Device(DEVICE_CPU).TypeConstraint("Tidx"), \ + UnravelIndexOp); +TF_CALL_int32(REGISTER_KERNEL) TF_CALL_int64(REGISTER_KERNEL) +#undef REGISTER_KERNEL + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/word2vec_kernels.cc b/tensorflow/core/kernels/word2vec_kernels.cc index 2d05d72bff1..3477445197a 100644 --- a/tensorflow/core/kernels/word2vec_kernels.cc +++ 
b/tensorflow/core/kernels/word2vec_kernels.cc @@ -188,9 +188,9 @@ class SkipgramOp : public OpKernel { ++corpus_size_; } if (corpus_size_ < window_size_ * 10) { - return errors::InvalidArgument("The text file ", filename, - " contains too little data: ", - corpus_size_, " words"); + return errors::InvalidArgument( + "The text file ", filename, + " contains too little data: ", corpus_size_, " words"); } typedef std::pair WordFreq; std::vector ordered; diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc index 0f8d027caad..a6a71fdfaf1 100644 --- a/tensorflow/core/kernels/xent_op.cc +++ b/tensorflow/core/kernels/xent_op.cc @@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template class SoftmaxXentWithLogitsOp : public OpKernel { @@ -44,8 +44,8 @@ class SoftmaxXentWithLogitsOp : public OpKernel { OP_REQUIRES(context, logits_in.IsSameSize(labels_in), errors::InvalidArgument( "logits and labels must be same size: logits_size=", - logits_in.shape().DebugString(), " labels_size=", - labels_in.shape().DebugString())); + logits_in.shape().DebugString(), + " labels_size=", labels_in.shape().DebugString())); OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()), errors::InvalidArgument("logits must be 2-dimensional")); // As we already tested that both inputs have the same shape no need to @@ -72,7 +72,7 @@ class SoftmaxXentWithLogitsOp : public OpKernel { functor(context->eigen_device(), logits_in.matrix(), labels_in.matrix(), scratch.matrix(), loss_out->vec(), back_out->matrix()); - } + } } }; @@ -87,7 +87,7 @@ struct XentFunctorBase { typename TTypes::Vec loss, typename TTypes::Matrix backprop) { XentEigenImpl::Compute(d, logits, labels, scratch, loss, - backprop); + backprop); } }; @@ -97,7 +97,7 @@ struct XentFunctor : XentFunctorBase {}; #ifdef TENSORFLOW_USE_SYCL template struct XentFunctor : XentFunctorBase {}; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace functor #define REGISTER_CPU(T) \ @@ -129,6 +129,6 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") .Device(DEVICE_SYCL) .TypeConstraint("T"), SoftmaxXentWithLogitsOp); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc index e2947012467..481f3b7ba46 100644 --- a/tensorflow/core/kernels/xsmm_conv2d_test.cc +++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc @@ -13,18 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/kernels/conv_ops.h" -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/graph/node_builder.h" -#include "tensorflow/core/kernels/ops_testutil.h" #include "include/libxsmm.h" #include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/kernels/conv_ops.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { - typedef struct { int nImg; int nIfm; @@ -49,45 +48,41 @@ typedef struct { int stride_w; } naive_conv_t; - -LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor &nhwc, int N, int H, int W, int C) -{ - LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W); +LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor& nhwc, + int N, int H, int W, int C) { + LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W); int n, h, w, c; - auto output = nhwc.flat(); - for ( n = 0; n < N; n++ ) { - for ( h = 0; h < H; h++ ) { - for ( w = 0; w < W; w++ ) { - for ( c = 0; c < C; c++ ) { - output(n*H*W*C + h*W*C +w*C + c) = - LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); + auto output = nhwc.flat(); + for (n = 0; n < N; n++) { + for (h = 0; h < H; h++) { + for (w = 0; w < W; w++) { + for (c = 0; c < C; c++) { + output(n * H * W * C + h * W * C + w * C + c) = + LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W); } } } } } - -LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor &rsck, int R, int S, int C, int K) -{ - LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S); +LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor& rsck, + int R, int S, int C, int K) { + LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S); int r, s, c, k; - auto output = rsck.flat(); + auto output = rsck.flat(); - for ( r = 0; r < R; r++ ) { - for ( s = 0; s < S; s++ ) { - for ( c = 0; c < C; c++ ) { - for ( k = 0; k < K; k++ ) { - output(r*S*C*K + s*C*K + c*K + k) = - LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S); + for (r = 0; r < R; r++) { + for (s = 0; s < S; s++) { + for (c = 0; c < C; c++) { + for (k = 0; k < K; k++) { + output(r * S * C * K + s * C * K + c * K + k) = + LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S); } } } } } - - LIBXSMM_INLINE void zero_buf(float* buf, long size) { int i; for (i = 0; i < size; ++i) { @@ -95,52 +90,53 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) { } } -LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) { - long i; - auto output = dst.flat(); - for (i = 0; i < size; ++i) - output(i) = src[i]; +LIBXSMM_INLINE void copy_buf(Tensor& dst, float* src, long size) { + long i; + auto output = dst.flat(); + for (i = 0; i < size; ++i) output(i) = src[i]; } -LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) -{ +LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) { int i; zero_buf(buf, size); for (i = 0; i < size; ++i) { - buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0))); + buf[i] = + (float)((initOne != 0) + ? 1.0 + : ((initPos != 0) ? 
drand48() : (0.05 - drand48() / 10.0))); } } - - -LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter) -{ - int nImg = param->nImg; - int nIfm = param->nIfm; - int nOfm = param->nOfm; - int ifhp = param->ifhp; - int ifwp = param->ifwp; - int ofhp = param->ofhp; - int ofwp = param->ofwp; - int ifh = param->ifh; - int ifw = param->ifw; - int ofh = param->ofh; - int ofw = param->ofw; - int pad_h = param->pad_h; - int pad_w = param->pad_w; - int pad_h_in = param->pad_h_in; - int pad_w_in = param->pad_w_in; +LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, + float* output, const float* filter) { + int nImg = param->nImg; + int nIfm = param->nIfm; + int nOfm = param->nOfm; + int ifhp = param->ifhp; + int ifwp = param->ifwp; + int ofhp = param->ofhp; + int ofwp = param->ofwp; + int ifh = param->ifh; + int ifw = param->ifw; + int ofh = param->ofh; + int ofw = param->ofw; + int pad_h = param->pad_h; + int pad_w = param->pad_w; + int pad_h_in = param->pad_h_in; + int pad_w_in = param->pad_w_in; int pad_h_out = param->pad_h_out; int pad_w_out = param->pad_w_out; - int kh = param->kh; - int kw = param->kw; - int stride_h = param->stride_h; - int stride_w = param->stride_w; + int kh = param->kh; + int kw = param->kw; + int stride_h = param->stride_h; + int stride_w = param->stride_w; /* loop counters */ int img, ofm, ifm, oj, oi, ij, ii, kj, ki; - LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp); - LIBXSMM_VLA_DECL(4, const float, input_t, input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); + LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out), + nOfm, ofhp, ofwp); + LIBXSMM_VLA_DECL(4, const float, input_t, + input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp); LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw); for (img = 0; img < nImg; ++img) { @@ -151,12 +147,15 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float for (oi = 0; oi < ofw; ++oi) { ii = oi * stride_w - pad_w; for (kj = 0; kj < kh; ++kj) { - if(ij+kj < 0 || ij+kj >= ifh) continue; + if (ij + kj < 0 || ij + kj >= ifh) continue; for (ki = 0; ki < kw; ++ki) { - if(ii+ki < 0 || ii+ki >= ifw) continue; - LIBXSMM_VLA_ACCESS( 4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) += - LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp) - * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw); + if (ii + ki < 0 || ii + ki >= ifw) continue; + LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp, + ofwp) += + LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki, + nIfm, ifhp, ifwp) * + LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, + kw); } } } @@ -168,134 +167,118 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float void RunXsmmVsGeneric() {} - class XsmmConv2DTest : public OpsTestBase { protected: void MakeOp(int stride) { - TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D") - .Input(FakeInput(DT_FLOAT)) - .Input(FakeInput(DT_FLOAT)) - .Attr("strides", {1, stride,stride, 1}) - .Attr("padding", "VALID" ) - .Finalize(node_def())); - + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", "VALID") + .Finalize(node_def())); TF_ASSERT_OK(InitOp()); } }; TEST_F(XsmmConv2DTest, Basic) { - MakeOp(1); + MakeOp(1); - // setup scoped allocator, which uses cpu_allocator() for this scope - const 
libxsmm_tf_allocator tf_allocator; + // setup scoped allocator, which uses cpu_allocator() for this scope + const libxsmm_tf_allocator tf_allocator; - int ifw = 14; /* input width, "W" */ - int ifh = 14; /* input height, "H" */ - int nImg = 32; /* mini-batch size, "N" */ - int nIfm = 64; /* number of input feature maps, "C" */ - int nOfm = 64; /* number of output feature maps, "K" */ - int kh = 3; /* filter height, "R" */ - int kw = 3; /* filter width, "S" */ - int pad = 0; /* padding in output */ - int stride = 1; /* stride when accessing inputs */ + int ifw = 14; /* input width, "W" */ + int ifh = 14; /* input height, "H" */ + int nImg = 32; /* mini-batch size, "N" */ + int nIfm = 64; /* number of input feature maps, "C" */ + int nOfm = 64; /* number of output feature maps, "K" */ + int kh = 3; /* filter height, "R" */ + int kw = 3; /* filter width, "S" */ + int pad = 0; /* padding in output */ + int stride = 1; /* stride when accessing inputs */ + int stride_w = stride; + int stride_h = stride; + int pad_h = pad; + int pad_w = pad; - int stride_w = stride; - int stride_h = stride; - int pad_h = pad; - int pad_w = pad; + int pad_h_in = pad_h; + int pad_w_in = pad_w; - int pad_h_in = pad_h; - int pad_w_in = pad_w; - - int pad_h_out = 0; - int pad_w_out = 0; + int pad_h_out = 0; + int pad_w_out = 0; /* deriving some values for naive code */ - int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; - int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; - int ifhp = ifh + 2 * pad_h_in; - int ifwp = ifw + 2 * pad_w_in; - int ofhp = ofh + 2 * pad_h_out; - int ofwp = ofw + 2 * pad_w_out; + int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1; + int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1; + int ifhp = ifh + 2 * pad_h_in; + int ifwp = ifw + 2 * pad_w_in; + int ofhp = ofh + 2 * pad_h_out; + int ofwp = ofw + 2 * pad_w_out; + // Initialization of Filter and Image - //Initialization of Filter and Image + /* allocate data */ + float* naive_input = (float*)libxsmm_aligned_scratch( + nImg * nIfm * ifhp * ifwp * sizeof(float), 2097152); + float* naive_output = (float*)libxsmm_aligned_scratch( + nImg * nOfm * ofhp * ofwp * sizeof(float), 2097152); + float* naive_filter = (float*)libxsmm_aligned_scratch( + nOfm * nIfm * kh * kw * sizeof(float), 2097152); + /* initialize data */ + init_buf(naive_input, nImg * nIfm * ifhp * ifwp, 0, 0); + zero_buf(naive_output, nImg * nOfm * ofhp * ofwp); + init_buf(naive_filter, nOfm * nIfm * kh * kw, 0, 0); - /* allocate data */ - float *naive_input = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152); - float *naive_output = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152); - float *naive_filter = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw* sizeof(float), 2097152); - /* initialize data */ - init_buf(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0); - zero_buf(naive_output, nImg*nOfm*ofhp*ofwp); - init_buf(naive_filter, nOfm*nIfm*kh*kw, 0, 0); + Tensor image(DT_FLOAT, {nImg, ifhp, ifwp, nIfm}); + Tensor filter(DT_FLOAT, {kh, kw, nIfm, nOfm}); - Tensor image(DT_FLOAT, - {nImg, ifhp, ifwp, nIfm}); + naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm); + naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); + // Run naive convolution - Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm}); + naive_conv_t naive_param; + naive_param.nImg = nImg; + naive_param.nIfm = nIfm; + naive_param.nOfm = nOfm; + naive_param.ifhp = ifhp; + naive_param.ifwp = ifwp; + naive_param.ofhp = ofhp; + naive_param.ofwp = ofwp; + 
naive_param.ifh = ifh; + naive_param.ifw = ifw; + naive_param.ofh = ofh; + naive_param.ofw = ofw; + naive_param.pad_h = pad_h; + naive_param.pad_w = pad_w; + naive_param.pad_h_in = pad_h_in; + naive_param.pad_w_in = pad_w_in; + naive_param.pad_h_out = pad_h_out; + naive_param.pad_w_out = pad_w_out; + naive_param.kh = kh; + naive_param.kw = kw; + naive_param.stride_h = stride_h; + naive_param.stride_w = stride_w; - naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm); - naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); + naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter); + AddInputFromArray(image.shape(), image.flat()); + AddInputFromArray(filter.shape(), filter.flat()); - //Run naive convolution - - naive_conv_t naive_param; - - naive_param.nImg = nImg; - naive_param.nIfm = nIfm; - naive_param.nOfm = nOfm; - naive_param.ifhp = ifhp; - naive_param.ifwp = ifwp; - naive_param.ofhp = ofhp; - naive_param.ofwp = ofwp; - naive_param.ifh = ifh; - naive_param.ifw = ifw; - naive_param.ofh = ofh; - naive_param.ofw = ofw; - naive_param.pad_h = pad_h; - naive_param.pad_w = pad_w; - naive_param.pad_h_in = pad_h_in; - naive_param.pad_w_in = pad_w_in; - naive_param.pad_h_out = pad_h_out; - naive_param.pad_w_out = pad_w_out; - naive_param.kh = kh; - naive_param.kw = kw; - naive_param.stride_h = stride_h; - naive_param.stride_w = stride_w; - - - naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter); - - - - AddInputFromArray(image.shape(), image.flat()); - AddInputFromArray(filter.shape(), filter.flat()); - - - - //Run Op (TF) - TF_ASSERT_OK(RunOpKernel()); - - // Check the output. - Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm}); - naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm); - - - test::ExpectTensorNear(expected, *GetOutput(0), 1e-5); - libxsmm_free(naive_input); - libxsmm_free(naive_output); - libxsmm_free(naive_filter); - + // Run Op (TF) + TF_ASSERT_OK(RunOpKernel()); + // Check the output. + Tensor expected(DT_FLOAT, {nImg, ofhp, ofwp, nOfm}); + naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm); + test::ExpectTensorNear(expected, *GetOutput(0), 1e-5); + libxsmm_free(naive_input); + libxsmm_free(naive_output); + libxsmm_free(naive_filter); } /* @@ -325,7 +308,8 @@ TEST(XsmmConv2DTest, Basic) { desc.threads = num_threads; desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; - desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK; + desc.filter_format = +LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK; desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; desc.options = LIBXSMM_DNN_CONV_OPTION_NONE; desc.datatype = LIBXSMM_DNN_DATATYPE_F32; diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h index 58a50a70c26..49f74ff47fb 100644 --- a/tensorflow/core/lib/core/status.h +++ b/tensorflow/core/lib/core/status.h @@ -131,7 +131,7 @@ inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v, while (auto _result = ::tensorflow::TfCheckOpHelper(val, #val)) \ LOG(level) << *(_result) -#define TF_CHECK_OK(val) TF_DO_CHECK_OK(val, FATAL) +#define TF_CHECK_OK(val) TF_DO_CHECK_OK(val, FATAL) #define TF_QCHECK_OK(val) TF_DO_CHECK_OK(val, QFATAL) // DEBUG only version of TF_CHECK_OK. 
Compiler still parses 'val' even in opt diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc index 2b10ebeaf7c..e55ed79d36c 100644 --- a/tensorflow/core/lib/core/threadpool.cc +++ b/tensorflow/core/lib/core/threadpool.cc @@ -66,7 +66,9 @@ struct EigenEnvironment { } return Task{ std::unique_ptr(new TaskImpl{ - std::move(f), Context(ContextKind::kThread), id, + std::move(f), + Context(ContextKind::kThread), + id, }), }; } diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc index 49ddb16645c..627ef5a892a 100644 --- a/tensorflow/core/lib/core/threadpool_test.cc +++ b/tensorflow/core/lib/core/threadpool_test.cc @@ -97,8 +97,8 @@ TEST(ThreadPool, ParallelForWithWorkerId) { } pool.ParallelForWithWorkerId( kWorkItems, kHugeCost, - [&threads_running, &work, num_threads]( - int64 begin, int64 end, int64 id) { + [&threads_running, &work, num_threads](int64 begin, int64 end, + int64 id) { // Store true for the current thread, and assert that another thread // is not running with the same id. ASSERT_LE(0, id); diff --git a/tensorflow/core/lib/db/sqlite.h b/tensorflow/core/lib/db/sqlite.h index 0faa458f1d6..efe97f78d25 100644 --- a/tensorflow/core/lib/db/sqlite.h +++ b/tensorflow/core/lib/db/sqlite.h @@ -18,12 +18,12 @@ limitations under the License. #include #include "sqlite3.h" +#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/lib/core/refcount.h" /// TensorFlow SQLite Veneer /// @@ -121,10 +121,7 @@ class LOCKABLE Sqlite : public core::RefCounted { Sqlite(sqlite3* db, sqlite3_stmt* begin, sqlite3_stmt* commit, sqlite3_stmt* rollback) noexcept - : db_(db), - begin_(begin), - commit_(commit), - rollback_(rollback) {} + : db_(db), begin_(begin), commit_(commit), rollback_(rollback) {} sqlite3* const db_; sqlite3_stmt* const begin_; @@ -233,7 +230,8 @@ class SqliteStatement { /// freed until this statement is Reset() or finalized. void BindText(int parameter, const StringPiece& text) { Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(), - SQLITE_TRANSIENT, SQLITE_UTF8), parameter); + SQLITE_TRANSIENT, SQLITE_UTF8), + parameter); size_ += text.size(); } void BindText(const char* parameter, const StringPiece& text) { @@ -241,7 +239,8 @@ class SqliteStatement { } void BindTextUnsafe(int parameter, const StringPiece& text) { Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(), - SQLITE_STATIC, SQLITE_UTF8), parameter); + SQLITE_STATIC, SQLITE_UTF8), + parameter); size_ += text.size(); } void BindTextUnsafe(const char* parameter, const StringPiece& text) { @@ -254,7 +253,8 @@ class SqliteStatement { /// freed until this statement is Reset() or finalized. 
void BindBlob(int parameter, const StringPiece& blob) { Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(), - SQLITE_TRANSIENT), parameter); + SQLITE_TRANSIENT), + parameter); size_ += blob.size(); } void BindBlob(const char* parameter, const StringPiece& blob) { @@ -262,7 +262,8 @@ class SqliteStatement { } void BindBlobUnsafe(int parameter, const StringPiece& blob) { Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(), - SQLITE_STATIC), parameter); + SQLITE_STATIC), + parameter); size_ += blob.size(); } void BindBlobUnsafe(const char* parameter, const StringPiece& text) { @@ -320,9 +321,7 @@ class SqliteStatement { /// \brief Move constructor, after which is reset to empty. SqliteStatement(SqliteStatement&& other) noexcept - : db_(other.db_), - stmt_(other.stmt_), - bind_error_(other.bind_error_) { + : db_(other.db_), stmt_(other.stmt_), bind_error_(other.bind_error_) { other.db_ = nullptr; other.stmt_ = nullptr; other.bind_error_ = SQLITE_OK; diff --git a/tensorflow/core/lib/db/sqlite_test.cc b/tensorflow/core/lib/db/sqlite_test.cc index c9c76ea5f2c..1e88323d017 100644 --- a/tensorflow/core/lib/db/sqlite_test.cc +++ b/tensorflow/core/lib/db/sqlite_test.cc @@ -33,9 +33,7 @@ class SqliteTest : public ::testing::Test { db_->PrepareOrDie("CREATE TABLE T (a BLOB, b BLOB)").StepAndResetOrDie(); } - void TearDown() override { - db_->Unref(); - } + void TearDown() override { db_->Unref(); } Sqlite* db_; bool is_done_; @@ -213,7 +211,7 @@ TEST_F(SqliteTest, BindFailed) { Status s = stmt.StepOnce(); EXPECT_NE(string::npos, s.error_message().find("INSERT INTO T (a) VALUES (123)")) - << s.error_message(); + << s.error_message(); } TEST_F(SqliteTest, SnappyExtension) { @@ -226,7 +224,7 @@ TEST_F(SqliteTest, SnappyBinaryCompatibility) { EXPECT_EQ( "today is the end of the republic", db_->PrepareOrDie("SELECT UNSNAP(X'03207C746F6461792069732074686520656E64" - "206F66207468652072657075626C6963')") + "206F66207468652072657075626C6963')") .StepOnceOrDie() .ColumnString(0)); } diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc index 0f6999c88fc..e5deb2b873e 100644 --- a/tensorflow/core/lib/gif/gif_io.cc +++ b/tensorflow/core/lib/gif/gif_io.cc @@ -44,6 +44,14 @@ int input_callback(GifFileType* gif_file, GifByteType* buf, int size) { return 0; } +static const char* GifErrorStringNonNull(int error_code) { + const char* error_string = GifErrorString(error_code); + if (error_string == nullptr) { + return "Unknown error"; + } + return error_string; +} + uint8* Decode(const void* srcdata, int datasize, const std::function& allocate_output, string* error_string) { @@ -55,17 +63,17 @@ uint8* Decode(const void* srcdata, int datasize, int error_code = D_GIF_SUCCEEDED; if (gif_file && DGifCloseFile(gif_file, &error_code) != GIF_OK) { LOG(WARNING) << "Fail to close gif file, reason: " - << GifErrorString(error_code); + << GifErrorStringNonNull(error_code); } }); if (error_code != D_GIF_SUCCEEDED) { *error_string = strings::StrCat("failed to open gif file: ", - GifErrorString(error_code)); + GifErrorStringNonNull(error_code)); return nullptr; } if (DGifSlurp(gif_file) != GIF_OK) { *error_string = strings::StrCat("failed to slurp gif file: ", - GifErrorString(gif_file->Error)); + GifErrorStringNonNull(gif_file->Error)); return nullptr; } if (gif_file->ImageCount <= 0) { diff --git a/tensorflow/core/lib/gtl/cleanup.h b/tensorflow/core/lib/gtl/cleanup.h index 6053e986402..6bd60ca4824 100644 --- a/tensorflow/core/lib/gtl/cleanup.h +++ 
b/tensorflow/core/lib/gtl/cleanup.h @@ -55,22 +55,21 @@ namespace gtl { template class Cleanup { public: - Cleanup() - : released_(true), f_() {} + Cleanup() : released_(true), f_() {} template - explicit Cleanup(G&& f) // NOLINT + explicit Cleanup(G&& f) // NOLINT : f_(std::forward(f)) {} // NOLINT(build/c++11) Cleanup(Cleanup&& src) // NOLINT - : released_(src.is_released()), f_(src.release()) { } + : released_(src.is_released()), f_(src.release()) {} // Implicitly move-constructible from any compatible Cleanup. // The source will be released as if src.release() were called. // A moved-from Cleanup can be safely destroyed or reassigned. template Cleanup(Cleanup&& src) // NOLINT - : released_(src.is_released()), f_(src.release()) { } + : released_(src.is_released()), f_(src.release()) {} // Assignment to a Cleanup object behaves like destroying it // and making a new one in its place, analogous to unique_ptr @@ -102,8 +101,8 @@ class Cleanup { F f_; }; -template ::type> +template ::type> TF_MUST_USE_RESULT Cleanup MakeCleanup(F&& f) { return Cleanup(std::forward(f)); } diff --git a/tensorflow/core/lib/gtl/cleanup_test.cc b/tensorflow/core/lib/gtl/cleanup_test.cc index bd151cb2ab1..a86ffd5fe28 100644 --- a/tensorflow/core/lib/gtl/cleanup_test.cc +++ b/tensorflow/core/lib/gtl/cleanup_test.cc @@ -65,15 +65,14 @@ TEST(CleanupTest, Release) { TEST(FinallyTest, TypeErasedWithoutFactory) { string s = "active"; { - AnyCleanup s_cleaner([&s]{ s.append(" clean"); }); + AnyCleanup s_cleaner([&s] { s.append(" clean"); }); EXPECT_EQ("active", s); } EXPECT_EQ("active clean", s); } struct Appender { - Appender(string* s, const string& msg) - : s_(s), msg_(msg) {} + Appender(string* s, const string& msg) : s_(s), msg_(msg) {} void operator()() const { s_->append(msg_); } string* s_; string msg_; @@ -163,7 +162,12 @@ class CleanupReferenceTest : public ::testing::Test { int* i; F(int* cp, int* i) : cp(cp), i(i) {} F(const F& o) : cp(o.cp), i(o.i) { ++*cp; } - F& operator=(const F& o) { cp = o.cp; i = o.i; ++*cp; return *this; } + F& operator=(const F& o) { + cp = o.cp; + i = o.i; + ++*cp; + return *this; + } F(F&&) = default; F& operator=(F&&) = default; void operator()() const { ++*i; } @@ -279,7 +283,7 @@ BENCHMARK(BM_AnyCleanup); void BM_AnyCleanupNoFactory(int iters) { while (iters--) { - AnyCleanup fin([]{Incr();}); + AnyCleanup fin([] { Incr(); }); } } BENCHMARK(BM_AnyCleanupNoFactory); diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h index d6e5d9effa7..6e3cb2206d9 100644 --- a/tensorflow/core/lib/gtl/inlined_vector.h +++ b/tensorflow/core/lib/gtl/inlined_vector.h @@ -31,12 +31,12 @@ limitations under the License. #ifndef TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_ #define TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_ -#include #include #include #include #include #include +#include #include #include #include @@ -407,7 +407,7 @@ class InlinedVector { }; // 2) Construct a T with args at not-yet-initialized memory pointed by dst. struct Construct { - template + template void operator()(T* dst, Args&&... 
args) const { new (dst) T(std::forward(args)...); } diff --git a/tensorflow/core/lib/gtl/int_type.h b/tensorflow/core/lib/gtl/int_type.h index 647fc81aa7e..af3e50ad78f 100644 --- a/tensorflow/core/lib/gtl/int_type.h +++ b/tensorflow/core/lib/gtl/int_type.h @@ -255,13 +255,13 @@ class IntType { value_ op arg_value; \ return *this; \ } - INT_TYPE_ASSIGNMENT_OP(+= ); - INT_TYPE_ASSIGNMENT_OP(-= ); - INT_TYPE_ASSIGNMENT_OP(*= ); - INT_TYPE_ASSIGNMENT_OP(/= ); - INT_TYPE_ASSIGNMENT_OP(<<= ); // NOLINT - INT_TYPE_ASSIGNMENT_OP(>>= ); // NOLINT - INT_TYPE_ASSIGNMENT_OP(%= ); + INT_TYPE_ASSIGNMENT_OP(+=); + INT_TYPE_ASSIGNMENT_OP(-=); + INT_TYPE_ASSIGNMENT_OP(*=); + INT_TYPE_ASSIGNMENT_OP(/=); + INT_TYPE_ASSIGNMENT_OP(<<=); // NOLINT + INT_TYPE_ASSIGNMENT_OP(>>=); // NOLINT + INT_TYPE_ASSIGNMENT_OP(%=); #undef INT_TYPE_ASSIGNMENT_OP ThisType& operator=(ValueType arg_value) { @@ -314,10 +314,10 @@ std::ostream& operator<<(std::ostream& os, // NOLINT INT_TYPE_ARITHMETIC_OP(+); INT_TYPE_ARITHMETIC_OP(-); INT_TYPE_ARITHMETIC_OP(*); -INT_TYPE_ARITHMETIC_OP(/ ); -INT_TYPE_ARITHMETIC_OP(<< ); // NOLINT -INT_TYPE_ARITHMETIC_OP(>> ); // NOLINT -INT_TYPE_ARITHMETIC_OP(% ); +INT_TYPE_ARITHMETIC_OP(/); +INT_TYPE_ARITHMETIC_OP(<<); // NOLINT +INT_TYPE_ARITHMETIC_OP(>>); // NOLINT +INT_TYPE_ARITHMETIC_OP(%); #undef INT_TYPE_ARITHMETIC_OP // -- NON-MEMBER COMPARISON OPERATORS ------------------------------------------ @@ -345,12 +345,12 @@ INT_TYPE_ARITHMETIC_OP(% ); IntType id) { \ return val op id.value(); \ } -INT_TYPE_COMPARISON_OP(== ); // NOLINT -INT_TYPE_COMPARISON_OP(!= ); // NOLINT -INT_TYPE_COMPARISON_OP(< ); // NOLINT -INT_TYPE_COMPARISON_OP(<= ); // NOLINT -INT_TYPE_COMPARISON_OP(> ); // NOLINT -INT_TYPE_COMPARISON_OP(>= ); // NOLINT +INT_TYPE_COMPARISON_OP(==); // NOLINT +INT_TYPE_COMPARISON_OP(!=); // NOLINT +INT_TYPE_COMPARISON_OP(<); // NOLINT +INT_TYPE_COMPARISON_OP(<=); // NOLINT +INT_TYPE_COMPARISON_OP(>); // NOLINT +INT_TYPE_COMPARISON_OP(>=); // NOLINT #undef INT_TYPE_COMPARISON_OP } // namespace gtl diff --git a/tensorflow/core/lib/gtl/int_type_test.cc b/tensorflow/core/lib/gtl/int_type_test.cc index d3c405d9acd..61d364017cb 100644 --- a/tensorflow/core/lib/gtl/int_type_test.cc +++ b/tensorflow/core/lib/gtl/int_type_test.cc @@ -42,7 +42,8 @@ class IntTypeTest : public ::testing::Test { // All tests below will be executed on all supported IntTypes. 
typedef ::testing::Types SupportedIntTypes; + Int64_IT, UInt64_IT, Long_IT> + SupportedIntTypes; TYPED_TEST_CASE(IntTypeTest, SupportedIntTypes); @@ -232,7 +233,8 @@ TYPED_TEST(IntTypeTest, TestOperators) { TYPED_TEST(IntTypeTest, TestHashFunctor) { std::unordered_map map; + typename TestFixture::T::Hasher> + map; typename TestFixture::T a(0); map[a] = 'c'; EXPECT_EQ('c', map[a]); diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h index 2ff8b9c7d1a..fa33c24c0c0 100644 --- a/tensorflow/core/lib/gtl/optional.h +++ b/tensorflow/core/lib/gtl/optional.h @@ -593,12 +593,12 @@ class optional : private internal_optional::optional_data, assert(this->engaged_); return this->pointer(); } - constexpr const T& operator*() const & { return reference(); } + constexpr const T& operator*() const& { return reference(); } T& operator*() & { assert(this->engaged_); return reference(); } - constexpr const T&& operator*() const && { return std::move(reference()); } + constexpr const T&& operator*() const&& { return std::move(reference()); } T&& operator*() && { assert(this->engaged_); return std::move(reference()); @@ -621,7 +621,7 @@ class optional : private internal_optional::optional_data, // Use `opt.value()` to get a reference to underlying value. The constness // and lvalue/rvalue-ness of `opt` is preserved to the view of the T // subobject. - const T& value() const & { + const T& value() const& { CHECK(*this) << "Bad optional access"; return reference(); } @@ -633,7 +633,7 @@ class optional : private internal_optional::optional_data, CHECK(*this) << "Bad optional access"; return std::move(reference()); } - const T&& value() const && { // NOLINT(build/c++11) + const T&& value() const&& { // NOLINT(build/c++11) CHECK(*this) << "Bad optional access"; return std::move(reference()); } @@ -641,7 +641,7 @@ class optional : private internal_optional::optional_data, // Use `opt.value_or(val)` to get either the value of T or the given default // `val` in the empty case. template - constexpr T value_or(U&& v) const & { + constexpr T value_or(U&& v) const& { return static_cast(*this) ? **this : static_cast(std::forward(v)); } @@ -656,8 +656,8 @@ class optional : private internal_optional::optional_data, constexpr const T& reference() const { return *this->pointer(); } T& reference() { return *(this->pointer()); } - // T constraint checks. You can't have an optional of nullopt_t, in_place_t or - // a reference. + // T constraint checks. You can't have an optional of nullopt_t, in_place_t + // or a reference. static_assert( !std::is_same::type>::value, "optional is not allowed."); diff --git a/tensorflow/core/lib/gtl/optional_test.cc b/tensorflow/core/lib/gtl/optional_test.cc index 547bee7b75f..12b5bbc60be 100644 --- a/tensorflow/core/lib/gtl/optional_test.cc +++ b/tensorflow/core/lib/gtl/optional_test.cc @@ -24,17 +24,29 @@ limitations under the License. 
namespace tensorflow { namespace { -using tensorflow::gtl::optional; -using tensorflow::gtl::nullopt; -using tensorflow::gtl::nullopt_t; using tensorflow::gtl::in_place; using tensorflow::gtl::in_place_t; using tensorflow::gtl::make_optional; +using tensorflow::gtl::nullopt; +using tensorflow::gtl::nullopt_t; +using tensorflow::gtl::optional; -template string TypeQuals(T&) { return "&"; } -template string TypeQuals(T&&) { return "&&"; } -template string TypeQuals(const T&) { return "c&"; } -template string TypeQuals(const T&&) { return "c&&"; } +template +string TypeQuals(T&) { + return "&"; +} +template +string TypeQuals(T&&) { + return "&&"; +} +template +string TypeQuals(const T&) { + return "c&"; +} +template +string TypeQuals(const T&&) { + return "c&&"; +} struct StructorListener { int construct0 = 0; diff --git a/tensorflow/core/lib/gtl/top_n_test.cc b/tensorflow/core/lib/gtl/top_n_test.cc index fae85570dc0..ba30c072a90 100644 --- a/tensorflow/core/lib/gtl/top_n_test.cc +++ b/tensorflow/core/lib/gtl/top_n_test.cc @@ -28,10 +28,10 @@ limitations under the License. namespace { +using tensorflow::string; using tensorflow::gtl::TopN; using tensorflow::random::PhiloxRandom; using tensorflow::random::SimplePhilox; -using tensorflow::string; // Move the contents from an owned raw pointer, returning by value. // Objects are easier to manage by value. diff --git a/tensorflow/core/lib/io/compression.cc b/tensorflow/core/lib/io/compression.cc index c12de98e401..0d25bca9ecc 100644 --- a/tensorflow/core/lib/io/compression.cc +++ b/tensorflow/core/lib/io/compression.cc @@ -22,6 +22,6 @@ namespace compression { const char kNone[] = ""; const char kGzip[] = "GZIP"; -} -} -} +} // namespace compression +} // namespace io +} // namespace tensorflow diff --git a/tensorflow/core/lib/io/compression.h b/tensorflow/core/lib/io/compression.h index ef90c60a3a4..4d8e7788cad 100644 --- a/tensorflow/core/lib/io/compression.h +++ b/tensorflow/core/lib/io/compression.h @@ -23,8 +23,8 @@ namespace compression { extern const char kNone[]; extern const char kGzip[]; -} -} -} +} // namespace compression +} // namespace io +} // namespace tensorflow #endif // TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_ diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc index 403c82818ef..9cc6c4034f4 100644 --- a/tensorflow/core/lib/io/record_reader.cc +++ b/tensorflow/core/lib/io/record_reader.cc @@ -207,7 +207,7 @@ Status RecordReader::SkipNBytes(uint64 offset) { } } return Status::OK(); -} +} // namespace io SequentialRecordReader::SequentialRecordReader( RandomAccessFile* file, const RecordReaderOptions& options) diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc index 507c26a63ff..b7e51256a22 100644 --- a/tensorflow/core/lib/io/recordio_test.cc +++ b/tensorflow/core/lib/io/recordio_test.cc @@ -218,8 +218,8 @@ TEST_F(RecordioTest, RandomRead) { // Tests of all the error paths in log_reader.cc follow: static void AssertHasSubstr(StringPiece s, StringPiece expected) { - EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain " - << expected; + EXPECT_TRUE(StringPiece(s).contains(expected)) + << s << " does not contain " << expected; } TEST_F(RecordioTest, ReadError) { diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc index 354c819b090..77a3414442c 100644 --- a/tensorflow/core/lib/png/png_io.cc +++ b/tensorflow/core/lib/png/png_io.cc @@ -197,8 +197,8 @@ bool CommonInitDecode(StringPiece png_string, int 
desired_channels, int desired_channel_bits, DecodeContext* context) { CHECK(desired_channel_bits == 8 || desired_channel_bits == 16) << "desired_channel_bits = " << desired_channel_bits; - CHECK(0 <= desired_channels && desired_channels <= 4) << "desired_channels = " - << desired_channels; + CHECK(0 <= desired_channels && desired_channels <= 4) + << "desired_channels = " << desired_channels; context->error_condition = false; context->channels = desired_channels; context->png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, context, diff --git a/tensorflow/core/lib/random/philox_random_test_utils.h b/tensorflow/core/lib/random/philox_random_test_utils.h index f4bb087e107..6c29ae6b6a2 100644 --- a/tensorflow/core/lib/random/philox_random_test_utils.h +++ b/tensorflow/core/lib/random/philox_random_test_utils.h @@ -35,8 +35,8 @@ void FillRandoms(PhiloxRandom gen, typename Distribution::ResultElementType* p, int64 size) { const int granularity = Distribution::kResultElementCount; - CHECK(size % granularity == 0) << " size: " << size - << " granularity: " << granularity; + CHECK(size % granularity == 0) + << " size: " << size << " granularity: " << granularity; Distribution dist; for (int i = 0; i < size; i += granularity) { diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h index 0e281403f87..3fe1f9bc6cf 100644 --- a/tensorflow/core/lib/random/random_distributions.h +++ b/tensorflow/core/lib/random/random_distributions.h @@ -17,8 +17,8 @@ limitations under the License. #define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_ #define _USE_MATH_DEFINES -#include #include +#include #undef _USE_MATH_DEFINES #include @@ -27,7 +27,6 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/lib/random/philox_random.h" - namespace tensorflow { namespace random { diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc index 90d0dba4a77..85d68f456e1 100644 --- a/tensorflow/core/lib/random/random_distributions_test.cc +++ b/tensorflow/core/lib/random/random_distributions_test.cc @@ -45,8 +45,8 @@ void FillRandomsWithSingles(PhiloxRandom gen, int64 size) { int granularity = Distribution::kResultElementCount; - CHECK(size % granularity == 0) << " size: " << size - << " granularity: " << granularity; + CHECK(size % granularity == 0) + << " size: " << size << " granularity: " << granularity; SingleSampleAdapter single_samples(&gen); diff --git a/tensorflow/core/lib/strings/ordered_code.cc b/tensorflow/core/lib/strings/ordered_code.cc index af9a1512594..ef90050b4f6 100644 --- a/tensorflow/core/lib/strings/ordered_code.cc +++ b/tensorflow/core/lib/strings/ordered_code.cc @@ -472,7 +472,8 @@ void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) { // buf = val in network byte order, sign extended to 10 bytes const char sign_byte = val < 0 ? 
'\xff' : '\0'; char buf[10] = { - sign_byte, sign_byte, + sign_byte, + sign_byte, }; StoreBigEndian64(buf + 2, val); static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch"); diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h index 5835b0101d9..2bc14945cd0 100644 --- a/tensorflow/core/lib/strings/strcat.h +++ b/tensorflow/core/lib/strings/strcat.h @@ -126,7 +126,7 @@ class AlphaNum { : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {} AlphaNum(const Eigen::half &f); // NOLINT(runtime/explicit) - AlphaNum(Hex hex); // NOLINT(runtime/explicit) + AlphaNum(Hex hex); // NOLINT(runtime/explicit) AlphaNum(const char *c_str) : piece_(c_str) {} // NOLINT(runtime/explicit) AlphaNum(const StringPiece &pc) : piece_(pc) {} // NOLINT(runtime/explicit) diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index 279a5876f96..267ce884400 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -335,6 +335,13 @@ REGISTER_OP("Unpack") return Status::OK(); }); +REGISTER_OP("UnravelIndex") + .Input("indices: Tidx") + .Input("dims: Tidx") + .Output("output: Tidx") + .Attr("Tidx: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { return Status::OK(); }); + // -------------------------------------------------------------------------- // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph // in the N == 1 case to remove the node. @@ -701,10 +708,11 @@ REGISTER_OP("MatrixDiagPart") // -------------------------------------------------------------------------- REGISTER_OP("MatrixBandPart") .Input("input: T") - .Input("num_lower: int64") - .Input("num_upper: int64") + .Input("num_lower: Tindex") + .Input("num_upper: Tindex") .Output("band: T") .Attr("T: type") + .Attr("Tindex: {int32, int64} = DT_INT64") .SetShapeFn(shape_inference::UnchangedShape); // -------------------------------------------------------------------------- @@ -977,8 +985,8 @@ REGISTER_OP("GatherNd") if (c->Value(r_dim) > c->Rank(params)) { return errors::InvalidArgument( "indices.shape[-1] must be <= params.rank, but saw indices shape: ", - c->DebugString(indices), " and params shape: ", - c->DebugString(params)); + c->DebugString(indices), + " and params shape: ", c->DebugString(params)); } // Remove r_dim from indices to get output. @@ -1252,12 +1260,12 @@ REGISTER_OP("ReverseSequence") // Validate batch_dim and seq_dim against input. const int32 input_rank = c->Rank(input); if (batch_dim >= input_rank) { - return errors::InvalidArgument("batch_dim must be < input rank: ", - batch_dim, " vs. ", input_rank); + return errors::InvalidArgument( + "batch_dim must be < input rank: ", batch_dim, " vs. ", input_rank); } if (seq_dim >= input_rank) { - return errors::InvalidArgument("seq_dim must be < input rank: ", - seq_dim, " vs. ", input_rank); + return errors::InvalidArgument( + "seq_dim must be < input rank: ", seq_dim, " vs. 
", input_rank); } DimensionHandle batch_dim_dim = c->Dim(input, batch_dim); @@ -2638,8 +2646,9 @@ Status ScatterNdShape(InferenceContext* c) { Status s = c->Merge(prefix_indices, prefix_updates, &unused); if (!s.ok()) { return errors::InvalidArgument( - "The outer ", outer_dims, " dimensions of indices.shape=", - c->DebugString(indices_shape), " must match the outer ", outer_dims, + "The outer ", outer_dims, + " dimensions of indices.shape=", c->DebugString(indices_shape), + " must match the outer ", outer_dims, " dimensions of updates.shape=", c->DebugString(updates_shape), ": ", s.error_message()); } diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc index a182fd1c475..86d64635f4c 100644 --- a/tensorflow/core/ops/array_ops_test.cc +++ b/tensorflow/core/ops/array_ops_test.cc @@ -142,8 +142,13 @@ TEST(ArrayOpsTest, Const_ShapeFn) { TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) { for (const char* op_name : { - "CheckNumerics", "Identity", "RefIdentity", "QuantizeAndDequantize", - "StopGradient", "ZerosLike", "OnesLike", + "CheckNumerics", + "Identity", + "RefIdentity", + "QuantizeAndDequantize", + "StopGradient", + "ZerosLike", + "OnesLike", }) { ShapeInferenceTestOp op(op_name); INFER_OK(op, "?", "in0"); diff --git a/tensorflow/core/ops/candidate_sampling_ops_test.cc b/tensorflow/core/ops/candidate_sampling_ops_test.cc index c79b4439148..f3673716040 100644 --- a/tensorflow/core/ops/candidate_sampling_ops_test.cc +++ b/tensorflow/core/ops/candidate_sampling_ops_test.cc @@ -23,9 +23,12 @@ namespace tensorflow { TEST(CandidateSamplerOpsTest, CandidateSampler_ShapeFn) { for (const char* op_name : { - "AllCandidateSampler", "FixedUnigramCandidateSampler", - "LearnedUnigramCandidateSampler", "LogUniformCandidateSampler", - "ThreadUnsafeUnigramCandidateSampler", "UniformCandidateSampler", + "AllCandidateSampler", + "FixedUnigramCandidateSampler", + "LearnedUnigramCandidateSampler", + "LogUniformCandidateSampler", + "ThreadUnsafeUnigramCandidateSampler", + "UniformCandidateSampler", }) { ShapeInferenceTestOp op(op_name); TF_ASSERT_OK(NodeDefBuilder("test", op.name) diff --git a/tensorflow/core/ops/compat/backwards_compatibility_test.cc b/tensorflow/core/ops/compat/backwards_compatibility_test.cc index add05d6610a..6e05ae4be4f 100644 --- a/tensorflow/core/ops/compat/backwards_compatibility_test.cc +++ b/tensorflow/core/ops/compat/backwards_compatibility_test.cc @@ -25,8 +25,9 @@ namespace tensorflow { namespace { TEST(BackwardsCompatibilityTest, IsCompatible) { - OpCompatibilityLib compatibility( - "tensorflow/core/ops", strings::StrCat("v", TF_MAJOR_VERSION), nullptr); + OpCompatibilityLib compatibility("tensorflow/core/ops", + strings::StrCat("v", TF_MAJOR_VERSION), + nullptr); Env* env = Env::Default(); int changed_ops = 0; diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 65ab81931ad..2580eaf987b 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -17136,6 +17136,24 @@ op { type: DT_STRING } } +op { + name: "EnqueueInQueueDataset" + input_arg { + name: "queue" + type: DT_VARIANT + } + input_arg { + name: "components" + type_list_attr: "Tcomponents" + } + attr { + name: "Tcomponents" + type: "list(type)" + has_minimum: true + minimum: 1 + } + is_stateful: true +} op { name: "Enter" input_arg { @@ -24840,6 +24858,42 @@ op { type: "type" } } +op { + name: "MatrixBandPart" + input_arg { + name: "input" + type_attr: "T" + } + 
input_arg { + name: "num_lower" + type_attr: "Tindex" + } + input_arg { + name: "num_upper" + type_attr: "Tindex" + } + output_arg { + name: "band" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + attr { + name: "Tindex" + type: "type" + default_value { + type: DT_INT64 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} op { name: "MatrixDeterminant" input_arg { @@ -32096,6 +32150,48 @@ op { minimum: 1 } } +op { + name: "PrependFromQueueAndPaddedBatchDataset" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "batch_size" + type: DT_INT64 + } + input_arg { + name: "padded_shapes" + type: DT_INT64 + number_attr: "N" + } + input_arg { + name: "padding_values" + type_list_attr: "Toutput_types" + } + output_arg { + name: "handle" + type: DT_VARIANT + } + attr { + name: "Toutput_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + attr { + name: "N" + type: "int" + has_minimum: true + minimum: 1 + } +} op { name: "PreventGradient" input_arg { @@ -42820,6 +42916,36 @@ op { } is_stateful: true } +op { + name: "ResourceScatterUpdate" + input_arg { + name: "resource" + type: DT_RESOURCE + } + input_arg { + name: "indices" + type_attr: "Tindices" + } + input_arg { + name: "updates" + type_attr: "dtype" + } + attr { + name: "dtype" + type: "type" + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + is_stateful: true +} op { name: "ResourceSparseApplyAdadelta" input_arg { diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 2cae814eab1..3c8e9a8a5f2 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -491,4 +491,29 @@ REGISTER_OP("StatsAggregatorSummary") .Output("summary: string") .SetShapeFn(shape_inference::ScalarShape); +REGISTER_OP("PrependFromQueueAndPaddedBatchDataset") + .Input("input_dataset: variant") + .Input("batch_size: int64") + .Input("padded_shapes: N * int64") + .Input("padding_values: Toutput_types") + .Output("handle: variant") + .Attr("Toutput_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .Attr("N: int >= 1") + // TODO(ebrevdo): Validate that `padded_shapes` are all vectors, the lengths + // of `Toutput_types` and `output_shapes` are `N`, that the + // length of `output_types` is `N`, the `output_shapes` are + // (as far as possible to tell statically) compatible with `padded_shapes`, + // and that `padding_values` are all scalars. + .SetShapeFn(shape_inference::ScalarShape); + +REGISTER_OP("EnqueueInQueueDataset") + .Input("queue: variant") + .Input("components: Tcomponents") + .Attr("Tcomponents: list(type) >= 1") + .SetIsStateful() // To avoid CSE on multiple calls to Enqueue. + // TODO(ebrevdo): SetShapeFn to test input dtypes and shapes by + // reading from queue handle (is that even possible?). + .SetShapeFn(shape_inference::NoOutputs); + } // namespace tensorflow diff --git a/tensorflow/core/ops/functional_grad.cc b/tensorflow/core/ops/functional_grad.cc index 6df3536795c..eeccb72da65 100644 --- a/tensorflow/core/ops/functional_grad.cc +++ b/tensorflow/core/ops/functional_grad.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/framework/function.h" #include +#include "tensorflow/core/framework/function.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index ef2ac267cc9..a62e2d782b8 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -586,6 +586,17 @@ REGISTER_OP("NonMaxSuppression") .Output("selected_indices: int32") .Attr("iou_threshold: float = 0.5") .SetShapeFn([](InferenceContext* c) { + // Get inputs and validate ranks. + ShapeHandle boxes; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes)); + ShapeHandle scores; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores)); + ShapeHandle max_output_size; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size)); + // The boxes is a 2-D float Tensor of shape [num_boxes, 4]. + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused)); + c->set_output(0, c->Vector(c->UnknownDim())); return Status::OK(); }); @@ -597,6 +608,19 @@ REGISTER_OP("NonMaxSuppressionV2") .Input("iou_threshold: float") .Output("selected_indices: int32") .SetShapeFn([](InferenceContext* c) { + // Get inputs and validate ranks. + ShapeHandle boxes; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes)); + ShapeHandle scores; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores)); + ShapeHandle max_output_size; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size)); + ShapeHandle iou_threshold; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold)); + // The boxes is a 2-D float Tensor of shape [num_boxes, 4]. + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused)); + c->set_output(0, c->Vector(c->UnknownDim())); return Status::OK(); }); diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc index a67267418d6..444aa8b9544 100644 --- a/tensorflow/core/ops/lookup_ops.cc +++ b/tensorflow/core/ops/lookup_ops.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_def_builder.h" #include "tensorflow/core/framework/shape_inference.h" @@ -102,6 +103,8 @@ REGISTER_OP("LookupTableFindV2") c->set_output(0, c->UnknownShape()); return Status::OK(); }); +WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableFindV2"); +// TODO(b/72710477): Update this. REGISTER_OP("LookupTableInsert") .Input("table_handle: Ref(string)") diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc new file mode 100644 index 00000000000..95b4774fe6e --- /dev/null +++ b/tensorflow/core/ops/manip_ops.cc @@ -0,0 +1,33 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +// -------------------------------------------------------------------------- +REGISTER_OP("Roll") + .Input("input: T") + .Input("shift: Tshift") + .Input("axis: Taxis") + .Output("output: T") + .Attr("T: type") + .Attr("Tshift: {int32,int64}") + .Attr("Taxis: {int32,int64}") + .SetShapeFn(shape_inference::UnchangedShape); + +} // namespace tensorflow diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index dd484c3ee75..872ebe98c1f 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1172,12 +1172,12 @@ Status RangeSize(const Tensor* start_t, const Tensor* limit_t, T limit = limit_t->scalar()(); T delta = delta_t->scalar()(); if (start > limit && delta > 0) { - return errors::InvalidArgument("Requires start <= limit when delta > 0: ", - start, "/", limit); + return errors::InvalidArgument( + "Requires start <= limit when delta > 0: ", start, "/", limit); } if (start < limit && delta < 0) { - return errors::InvalidArgument("Requires start >= limit when delta < 0: ", - start, "/", limit); + return errors::InvalidArgument( + "Requires start >= limit when delta < 0: ", start, "/", limit); } if (delta == 0) { return errors::InvalidArgument("Requires delta != 0"); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 3f72b415699..67481fd202b 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1155,9 +1155,9 @@ Status TopKShapeFn(InferenceContext* c) { DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) && c->Value(last_dim) < c->Value(k_dim)) { - return errors::InvalidArgument("input must have last dimension >= k = ", - c->Value(k_dim), " but is ", - c->Value(last_dim)); + return errors::InvalidArgument( + "input must have last dimension >= k = ", c->Value(k_dim), " but is ", + c->Value(last_dim)); } // Replace last_dim with k_dim. 
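(Illustrative note, not part of the patch.) The new UnravelIndex kernel added earlier in this patch converts a flat index into per-dimension coordinates using an inclusive and an exclusive suffix product over `dims`. The following is a minimal standalone C++ sketch of that arithmetic for a single scalar index; the helper name, the use of `std::vector`, and fixing the index type to `int64_t` are assumptions made purely for illustration:

#include <cstdint>
#include <vector>

// out[i] = (flat_index % strides[i]) / strides_shifted[i], where
//   strides[i]         = dims[i] * dims[i+1] * ... * dims[n-1]
//   strides_shifted[i] =           dims[i+1] * ... * dims[n-1]
std::vector<int64_t> UnravelIndexSketch(int64_t flat_index,
                                        const std::vector<int64_t>& dims) {
  const int n = static_cast<int>(dims.size());
  std::vector<int64_t> strides(n), strides_shifted(n), out(n);
  int64_t prod = 1;
  for (int i = n - 1; i >= 0; --i) {
    strides_shifted[i] = prod;  // exclusive suffix product
    prod *= dims[i];
    strides[i] = prod;          // inclusive suffix product
  }
  for (int i = 0; i < n; ++i) {
    out[i] = (flat_index % strides[i]) / strides_shifted[i];
  }
  return out;
}

// For example, UnravelIndexSketch(37, {3, 4, 5}) yields {1, 3, 2},
// since 1 * 20 + 3 * 5 + 2 = 37.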
@@ -1211,9 +1211,9 @@ REGISTER_OP("NthElement") DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) && c->Value(last_dim) <= c->Value(n_dim)) { - return errors::InvalidArgument("Input must have last dimension > n = ", - c->Value(n_dim), " but is ", - c->Value(last_dim)); + return errors::InvalidArgument( + "Input must have last dimension > n = ", c->Value(n_dim), + " but is ", c->Value(last_dim)); } // Reduce last_dim for output tensor @@ -1818,7 +1818,7 @@ REGISTER_OP("_MklMaxPool") .Input("input: T") .Input("mkl_input: uint8") .Output("output: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Output("workspace: T") #else .Output("workspace: uint8") @@ -1844,7 +1844,7 @@ REGISTER_OP("_MklMaxPoolGrad") .Input("orig_input: T") .Input("orig_output: T") .Input("grad: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Input("workspace: T") #else .Input("workspace: uint8") @@ -1916,7 +1916,7 @@ REGISTER_OP("_MklLRN") .Input("input: T") .Input("mkl_input: uint8") .Output("output: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Output("workspace: T") #else .Output("workspace: uint8") @@ -1944,7 +1944,7 @@ REGISTER_OP("_MklLRNGrad") .Input("input_grads: T") .Input("input_image: T") .Input("output_image: T") -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML .Input("workspace: T") #else .Input("workspace: uint8") diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index b57206c9c4f..8df126735b5 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -7644,6 +7644,24 @@ op { type: DT_STRING } } +op { + name: "EnqueueInQueueDataset" + input_arg { + name: "queue" + type: DT_VARIANT + } + input_arg { + name: "components" + type_list_attr: "Tcomponents" + } + attr { + name: "Tcomponents" + type: "list(type)" + has_minimum: true + minimum: 1 + } + is_stateful: true +} op { name: "Enter" input_arg { @@ -12330,11 +12348,11 @@ op { } input_arg { name: "num_lower" - type: DT_INT64 + type_attr: "Tindex" } input_arg { name: "num_upper" - type: DT_INT64 + type_attr: "Tindex" } output_arg { name: "band" @@ -12344,6 +12362,19 @@ op { name: "T" type: "type" } + attr { + name: "Tindex" + type: "type" + default_value { + type: DT_INT64 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } } op { name: "MatrixDeterminant" @@ -15926,6 +15957,48 @@ op { minimum: 1 } } +op { + name: "PrependFromQueueAndPaddedBatchDataset" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "batch_size" + type: DT_INT64 + } + input_arg { + name: "padded_shapes" + type: DT_INT64 + number_attr: "N" + } + input_arg { + name: "padding_values" + type_list_attr: "Toutput_types" + } + output_arg { + name: "handle" + type: DT_VARIANT + } + attr { + name: "Toutput_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + attr { + name: "N" + type: "int" + has_minimum: true + minimum: 1 + } +} op { name: "PreventGradient" input_arg { @@ -20925,27 +20998,6 @@ op { attr { name: "dtype" type: "type" - allowed_values { - list { - type: DT_FLOAT - type: DT_DOUBLE - type: DT_INT32 - type: DT_UINT8 - type: DT_INT16 - type: DT_INT8 - type: DT_COMPLEX64 - type: DT_INT64 - type: DT_QINT8 - type: DT_QUINT8 - type: DT_QINT32 - type: DT_BFLOAT16 - type: DT_UINT16 - type: DT_COMPLEX128 - type: DT_HALF - type: DT_UINT32 - type: DT_UINT64 - } - } } attr { name: "Tindices" diff --git 
a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc index f6cfbf873a0..8dae7e1ff5f 100644 --- a/tensorflow/core/ops/resource_variable_ops.cc +++ b/tensorflow/core/ops/resource_variable_ops.cc @@ -193,7 +193,7 @@ REGISTER_OP("ResourceScatterUpdate") .Input("resource: resource") .Input("indices: Tindices") .Input("updates: dtype") - .Attr("dtype: numbertype") + .Attr("dtype: type") .Attr("Tindices: {int32, int64}") .SetShapeFn([](InferenceContext* c) { ShapeAndType handle_shape_and_type; diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc index e67d95fa8cb..4025070adb2 100644 --- a/tensorflow/core/ops/sdca_ops.cc +++ b/tensorflow/core/ops/sdca_ops.cc @@ -19,8 +19,8 @@ limitations under the License. namespace tensorflow { -using shape_inference::ShapeHandle; using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; // -------------------------------------------------------------------------- static Status ApplySdcaOptimizerShapeFn(InferenceContext* c) { diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc index 8beb28de0a2..e4c5bcfb540 100644 --- a/tensorflow/core/ops/string_ops.cc +++ b/tensorflow/core/ops/string_ops.cc @@ -137,9 +137,9 @@ REGISTER_OP("Substr") DimensionHandle pos_dim = c->Dim(pos_shape, i); DimensionHandle len_dim = c->Dim(len_shape, i); if (c->Value(pos_dim) != c->Value(len_dim)) { - return errors::InvalidArgument("pos and len shapes must match: ", - c->DebugString(pos_shape), " vs. ", - c->DebugString(len_shape)); + return errors::InvalidArgument( + "pos and len shapes must match: ", c->DebugString(pos_shape), + " vs. ", c->DebugString(len_shape)); } } // c->input(0) is the ShapeHandle to input strings diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc index de4e3cd9e70..0f309c1f4e9 100644 --- a/tensorflow/core/ops/training_ops_test.cc +++ b/tensorflow/core/ops/training_ops_test.cc @@ -24,7 +24,7 @@ static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op, string shape_spec_middle, const string& shape_spec_end = "") { auto shape_spec = [&shape_spec_middle, shape_spec_end]( - const char* var_spec, const char* grad_indices_spec) { + const char* var_spec, const char* grad_indices_spec) { return strings::StrCat(var_spec, ";", shape_spec_middle, ";", grad_indices_spec, shape_spec_end); }; diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD index 07aecf84832..9ba25dea4fb 100644 --- a/tensorflow/core/platform/cloud/BUILD +++ b/tensorflow/core/platform/cloud/BUILD @@ -57,6 +57,17 @@ cc_library( ], ) +cc_library( + name = "gcs_throttle", + srcs = ["gcs_throttle.cc"], + hdrs = ["gcs_throttle.h"], + copts = tf_copts(), + visibility = ["//tensorflow:__subpackages__"], + deps = [ + "//tensorflow/core:lib", + ], +) + cc_library( name = "gcs_file_system", srcs = ["gcs_file_system.cc"], @@ -69,6 +80,7 @@ cc_library( ":expiring_lru_cache", ":file_block_cache", ":gcs_dns_cache", + ":gcs_throttle", ":google_auth_provider", ":http_request", ":retrying_file_system", @@ -271,6 +283,19 @@ tf_cc_test( ], ) +tf_cc_test( + name = "gcs_throttle_test", + size = "small", + srcs = ["gcs_throttle_test.cc"], + linkopts = if_windows(["-DEFAULTLIB:ws2_32.lib"]), + deps = [ + ":gcs_throttle", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "curl_http_request_test", size = "small", diff --git 
a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/file_block_cache.cc index 0375af516b0..6add1142a15 100644 --- a/tensorflow/core/platform/cloud/file_block_cache.cc +++ b/tensorflow/core/platform/cloud/file_block_cache.cc @@ -131,6 +131,7 @@ Status FileBlockCache::MaybeFetch(const Key& key, block->mu.lock(); // Reacquire the lock immediately afterwards if (status.ok()) { block->data.resize(bytes_transferred, 0); + block->data.shrink_to_fit(); downloaded_block = true; block->state = FetchState::FINISHED; } else { diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.cc b/tensorflow/core/platform/cloud/gcs_dns_cache.cc index 2b0e55bf371..4d9aff4d24f 100644 --- a/tensorflow/core/platform/cloud/gcs_dns_cache.cc +++ b/tensorflow/core/platform/cloud/gcs_dns_cache.cc @@ -18,9 +18,9 @@ limitations under the License. #include #include #else +#include #include #include -#include #endif #include diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc index 520720372d9..01ca0d76bab 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system.cc @@ -50,7 +50,6 @@ limitations under the License. #endif namespace tensorflow { - namespace { constexpr char kGcsUriBase[] = "https://www.googleapis.com/storage/v1/"; @@ -59,9 +58,6 @@ constexpr char kGcsUploadUriBase[] = constexpr char kStorageHost[] = "storage.googleapis.com"; constexpr size_t kReadAppendableFileBufferSize = 1024 * 1024; // In bytes. constexpr int kGetChildrenDefaultPageSize = 1000; -// Initial delay before retrying a GCS upload. -// Subsequent delays can be larger due to exponential back-off. -constexpr uint64 kUploadRetryDelayMicros = 1000000L; // The HTTP response code "308 Resume Incomplete". constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308; // The environment variable that overrides the size of the readahead buffer. @@ -120,6 +116,15 @@ constexpr char kWriteRequestTimeout[] = "GCS_WRITE_REQUEST_TIMEOUT_SECS"; // The environment variable to configure an additional header to send with // all requests to GCS (format HEADERNAME:HEADERCONTENT) constexpr char kAdditionalRequestHeader[] = "GCS_ADDITIONAL_REQUEST_HEADER"; +// The environment variable to configure the throttle (format: ) +constexpr char kThrottleRate[] = "GCS_THROTTLE_TOKEN_RATE"; +// The environment variable to configure the token bucket size (format: ) +constexpr char kThrottleBucket[] = "GCS_THROTTLE_BUCKET_SIZE"; +// The environment variable that controls the number of tokens per request. 
+// (format: ) +constexpr char kTokensPerRequest[] = "GCS_TOKENS_PER_REQUEST"; +// The environment variable to configure the initial tokens (format: ) +constexpr char kInitialTokens[] = "GCS_INITIAL_TOKENS"; // TODO: DO NOT use a hardcoded path Status GetTmpFilename(string* filename) { @@ -725,6 +730,26 @@ GcsFileSystem::GcsFileSystem() if (GetEnvVar(kWriteRequestTimeout, strings::safe_strtou32, &timeout_value)) { timeouts_.write = timeout_value; } + + int64 token_value; + if (GetEnvVar(kThrottleRate, strings::safe_strto64, &token_value)) { + GcsThrottleConfig config; + config.enabled = true; + config.token_rate = token_value; + + if (GetEnvVar(kThrottleBucket, strings::safe_strto64, &token_value)) { + config.bucket_size = token_value; + } + + if (GetEnvVar(kTokensPerRequest, strings::safe_strto64, &token_value)) { + config.tokens_per_request = token_value; + } + + if (GetEnvVar(kInitialTokens, strings::safe_strto64, &token_value)) { + config.initial_tokens = token_value; + } + throttle_.SetConfig(config); + } } GcsFileSystem::GcsFileSystem( @@ -778,7 +803,9 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset, TF_RETURN_IF_ERROR(ParseGcsPath(filename, false, &bucket, &object)); std::unique_ptr request; - TF_RETURN_IF_ERROR(CreateHttpRequest(&request)); + TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request), + "when reading gs://", bucket, "/", object); + request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket, "/", request->EscapeString(object))); request->SetRange(offset, offset + n - 1); @@ -793,6 +820,8 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset, VLOG(1) << "Successful read of gs://" << bucket << "/" << object << " @ " << offset << " of size: " << bytes_read; + throttle_.RecordResponse(bytes_read); + if (bytes_read < block_size()) { // Check stat cache to see if we encountered an interrupted read. FileStatistics stat; @@ -930,41 +959,43 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket, "'object' must be a non-empty string. (File: %s)", fname.c_str())); } - StatCache::ComputeFunc compute_func = - [this, &bucket, &object](const string& fname, FileStatistics* stat) { - std::vector output_buffer; - std::unique_ptr request; - TF_RETURN_IF_ERROR(CreateHttpRequest(&request)); - request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/", - request->EscapeString(object), - "?fields=size%2Cupdated")); - request->SetResultBuffer(&output_buffer); - request->SetTimeouts(timeouts_.connect, timeouts_.idle, - timeouts_.metadata); + StatCache::ComputeFunc compute_func = [this, &bucket, &object]( + const string& fname, + FileStatistics* stat) { + std::vector output_buffer; + std::unique_ptr request; + TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request), + " when reading metadata of gs://", bucket, + "/", object); - TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), - " when reading metadata of gs://", - bucket, "/", object); + request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/", + request->EscapeString(object), + "?fields=size%2Cupdated")); + request->SetResultBuffer(&output_buffer); + request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata); - Json::Value root; - TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root)); + TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), + " when reading metadata of gs://", bucket, + "/", object); - // Parse file size. 
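The constants and constructor changes above wire four environment variables (GCS_THROTTLE_TOKEN_RATE, GCS_THROTTLE_BUCKET_SIZE, GCS_TOKENS_PER_REQUEST, GCS_INITIAL_TOKENS) into a GcsThrottleConfig. A minimal standalone sketch of that mapping follows; std::getenv/std::stoll stand in for GetEnvVar/safe_strto64, the struct is a stand-in for tensorflow::GcsThrottleConfig with its defaults copied from gcs_throttle.h, and this is an illustration only, not the constructor itself.

#include <cstdlib>
#include <string>

// Stand-in for tensorflow::GcsThrottleConfig; defaults mirror gcs_throttle.h.
struct ThrottleConfigSketch {
  bool enabled = false;
  long long token_rate = 100000;
  long long bucket_size = 10000000;
  long long tokens_per_request = 100;
  long long initial_tokens = 0;
};

// Throttling only turns on when GCS_THROTTLE_TOKEN_RATE is set; the other
// variables are consulted only in that case, matching the constructor above.
ThrottleConfigSketch ThrottleConfigFromEnv() {
  ThrottleConfigSketch config;
  if (const char* rate = std::getenv("GCS_THROTTLE_TOKEN_RATE")) {
    config.enabled = true;
    config.token_rate = std::stoll(rate);
    if (const char* bucket = std::getenv("GCS_THROTTLE_BUCKET_SIZE"))
      config.bucket_size = std::stoll(bucket);
    if (const char* per_request = std::getenv("GCS_TOKENS_PER_REQUEST"))
      config.tokens_per_request = std::stoll(per_request);
    if (const char* initial = std::getenv("GCS_INITIAL_TOKENS"))
      config.initial_tokens = std::stoll(initial);
  }
  return config;
}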
- TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length)); + Json::Value root; + TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root)); - // Parse file modification time. - string updated; - TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated)); - TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec))); + // Parse file size. + TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length)); - VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- " - << " length: " << stat->length - << "; mtime_nsec: " << stat->mtime_nsec - << "; updated: " << updated; + // Parse file modification time. + string updated; + TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated)); + TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec))); - stat->is_directory = false; - return Status::OK(); - }; + VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- " + << " length: " << stat->length + << "; mtime_nsec: " << stat->mtime_nsec << "; updated: " << updated; + + stat->is_directory = false; + return Status::OK(); + }; TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(fname, stat, compute_func)); if (stat->is_directory) { @@ -1442,6 +1473,10 @@ Status GcsFileSystem::CreateHttpRequest(std::unique_ptr* request) { additional_header_->second); } + if (!throttle_.AdmitRequest()) { + return errors::Unavailable("Request throttled"); + } + *request = std::move(new_request); return Status::OK(); } diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h index 2eae39608e3..e8edde8a445 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.h +++ b/tensorflow/core/platform/cloud/gcs_file_system.h @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/platform/cloud/expiring_lru_cache.h" #include "tensorflow/core/platform/cloud/file_block_cache.h" #include "tensorflow/core/platform/cloud/gcs_dns_cache.h" +#include "tensorflow/core/platform/cloud/gcs_throttle.h" #include "tensorflow/core/platform/cloud/http_request.h" #include "tensorflow/core/platform/cloud/retrying_file_system.h" #include "tensorflow/core/platform/file_system.h" @@ -194,6 +195,7 @@ class GcsFileSystem : public FileSystem { std::unique_ptr http_request_factory_; std::unique_ptr file_block_cache_; std::unique_ptr dns_cache_; + GcsThrottle throttle_; using StatCache = ExpiringLRUCache; std::unique_ptr stat_cache_; diff --git a/tensorflow/core/platform/cloud/gcs_throttle.cc b/tensorflow/core/platform/cloud/gcs_throttle.cc new file mode 100644 index 00000000000..eb5f8958a37 --- /dev/null +++ b/tensorflow/core/platform/cloud/gcs_throttle.cc @@ -0,0 +1,62 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/platform/cloud/gcs_throttle.h" + +#include + +namespace tensorflow { + +GcsThrottle::GcsThrottle(EnvTime* env_time) + : last_updated_secs_(env_time->NowSeconds()), + available_tokens_(0), + env_time_(env_time) {} + +bool GcsThrottle::AdmitRequest() { + mutex_lock l(mu_); + if (!config_.enabled) return true; + UpdateState(); + if (available_tokens_ < config_.tokens_per_request) { + return false; + } + available_tokens_ -= config_.tokens_per_request; + return true; +} + +void GcsThrottle::RecordResponse(size_t num_bytes) { + mutex_lock l(mu_); + if (!config_.enabled) return; + UpdateState(); + available_tokens_ -= request_bytes_to_tokens(num_bytes); +} + +void GcsThrottle::SetConfig(GcsThrottleConfig config) { + mutex_lock l(mu_); + config_ = config; + available_tokens_ = config.initial_tokens; + last_updated_secs_ = env_time_->NowSeconds(); +} + +void GcsThrottle::UpdateState() { + // TODO(b/72643279): Switch to a monotonic clock. + int64 now = env_time_->NowSeconds(); + uint64 delta_secs = + std::max(0LL, now - static_cast(last_updated_secs_)); + available_tokens_ += delta_secs * config_.token_rate; + available_tokens_ = std::min(available_tokens_, config_.bucket_size); + last_updated_secs_ = now; +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/gcs_throttle.h b/tensorflow/core/platform/cloud/gcs_throttle.h new file mode 100644 index 00000000000..1a89daef084 --- /dev/null +++ b/tensorflow/core/platform/cloud/gcs_throttle.h @@ -0,0 +1,156 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ +#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ + +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +/** + * GcsThrottleConfig is used to configure the GcsThrottle. + */ +struct GcsThrottleConfig { + /** + * enabled is true if GcsThrottle should throttle requests, false otherwise. + */ + bool enabled = false; + + /** + * token_rate is the number of tokens accrued every second that can be used + * for making requests to the GCS service. + */ + int64 token_rate = 100000; // Approximately 800 MBits/second bandwidth-only. + + /** + * bucket_size is the maximum number of available tokens the GcsThrottle can + * accrue. + */ + int64 bucket_size = 10000000; // 10 million tokens total + + /** + * tokens_per_request determines the number of tokens consumed for every + * request. + * + * Note: tokens are also consumed in proportion to the response size. + */ + int64 tokens_per_request = 100; + + /** + * initial_tokens determines how many tokens should be available immediately + * after the GcsThrottle is constructed. + */ + int64 initial_tokens = 0; +}; + +/** + * GcsThrottle is used to ensure fair use of the available GCS capacity. 
+ * + * GcsThrottle operates around a concept of tokens. Tokens are consumed when + * making requests to the GCS service. Tokens are consumed both based on the + * number of requests made, as well as the bandwidth consumed (response sizes). + * + * GcsThrottle is thread safe and can be used from multiple threads. + */ +class GcsThrottle { + public: + /** + * Constructs a GcsThrottle. + */ + explicit GcsThrottle(EnvTime* env_time = EnvTime::Default()); + + /** + * AdmitRequest updates the GcsThrottle to record a request will be made. + * + * AdmitRequest should be called before any request is made. AdmitRequest + * returns false if the request should be denied. If AdmitRequest + * returns false, no tokens are consumed. If true is returned, the configured + * number of tokens are consumed. + */ + bool AdmitRequest(); + + /** + * RecordResponse updates the GcsThrottle to record a request has been made. + * + * RecordResponse should be called after the response has been received. + * RecordResponse will update the internal state based on the number of bytes + * in the response. + * + * Note: we split up the request and the response in this fashion in order to + * avoid penalizing consumers who are using large readahead buffers at higher + * layers of the I/O stack. + */ + void RecordResponse(size_t num_bytes); + + /** + * SetConfig sets the configuration for GcsThrottle and re-initializes state. + * + * After calling this, the token pool will be config.initial_tokens. + */ + void SetConfig(GcsThrottleConfig config); + + /** + * available_tokens gives a snapshot of how many tokens are available. + * + * The returned value should not be used to make admission decisions. The + * purpose of this function is to make available to monitoring or other + * instrumentation the number of available tokens in the pool. + */ + inline int64 available_tokens() { + mutex_lock l(mu_); + if (!config_.enabled) return 0; + UpdateState(); + return available_tokens_; + } + + private: + /** + * UpdateState updates the available_tokens_ and last_updated_secs_ variables. + * + * UpdateState should be called in order to mark the passage of time, and + * therefore add tokens to the availble_tokens_ pool. + */ + void UpdateState() EXCLUSIVE_LOCKS_REQUIRED(mu_); + + inline uint64 request_bytes_to_tokens(size_t num_bytes) { + return num_bytes >> 10; + } + + mutex mu_; + + /** + * last_updated_secs_ records the number of seconds since the Unix epoch that + * the internal state of the GcsThrottle was updated. This is important when + * determining the number of tokens to add to the available_tokens_ pool. + */ + uint64 last_updated_secs_ GUARDED_BY(mu_) = 0; + + /** + * available_tokens_ records how many tokens are available to be consumed. + * + * Note: it is possible for available_tokens_ to become negative. If a + * response comes back that consumes more than the available tokens, the count + * will go negative, and block future requests until we have available tokens. + */ + int64 available_tokens_ GUARDED_BY(mu_) = 0; + + EnvTime* const env_time_; + GcsThrottleConfig config_ GUARDED_BY(mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_ diff --git a/tensorflow/core/platform/cloud/gcs_throttle_test.cc b/tensorflow/core/platform/cloud/gcs_throttle_test.cc new file mode 100644 index 00000000000..694756022e3 --- /dev/null +++ b/tensorflow/core/platform/cloud/gcs_throttle_test.cc @@ -0,0 +1,101 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/cloud/gcs_throttle.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +namespace { + +class TestTime : public EnvTime { + public: + uint64 NowMicros() override { return now_; } + + void SetTime(uint64 now_micros) { now_ = now_micros; } + + void AdvanceSeconds(int64 secs) { now_ += secs * 1000000L; } + + private: + uint64 now_ = 1234567890000000ULL; +}; + +class GcsThrottleTest : public ::testing::Test { + protected: + GcsThrottleTest() : throttle_(&time_) { + config_.enabled = true; + throttle_.SetConfig(config_); + } + + GcsThrottleConfig config_; + TestTime time_; + GcsThrottle throttle_; +}; + +TEST_F(GcsThrottleTest, ReplenishTokens) { + EXPECT_EQ(0, throttle_.available_tokens()); + time_.AdvanceSeconds(1); + EXPECT_EQ(100000, throttle_.available_tokens()); + time_.AdvanceSeconds(2); + EXPECT_EQ(300000, throttle_.available_tokens()); +} + +TEST_F(GcsThrottleTest, RejectRequest) { + EXPECT_EQ(0, throttle_.available_tokens()); + time_.AdvanceSeconds(1); + EXPECT_TRUE(throttle_.AdmitRequest()); + EXPECT_EQ(99900, throttle_.available_tokens()); + for (int i = 1; i < 1000; i++) { + EXPECT_TRUE(throttle_.AdmitRequest()); + } + EXPECT_FALSE(throttle_.AdmitRequest()); +} + +TEST_F(GcsThrottleTest, MarkResponses) { + time_.AdvanceSeconds(1); + EXPECT_TRUE(throttle_.AdmitRequest()); + throttle_.RecordResponse(128000000); // 128 MB response + EXPECT_EQ(-25100, throttle_.available_tokens()); + EXPECT_FALSE(throttle_.AdmitRequest()); + time_.AdvanceSeconds(1); + EXPECT_TRUE(throttle_.AdmitRequest()) + << "Available tokens: " << throttle_.available_tokens(); +} + +TEST_F(GcsThrottleTest, Skippingtime_) { + EXPECT_EQ(0, throttle_.available_tokens()); + time_.AdvanceSeconds(90); + EXPECT_EQ(9000000, throttle_.available_tokens()); +} + +TEST_F(GcsThrottleTest, BucketLimit) { + time_.AdvanceSeconds(120); + EXPECT_EQ(10000000, throttle_.available_tokens()); +} + +TEST_F(GcsThrottleTest, ReverseTime) { + time_.AdvanceSeconds(1); + EXPECT_EQ(100000, throttle_.available_tokens()); + time_.AdvanceSeconds(-3600); + EXPECT_EQ(100000, throttle_.available_tokens()); + time_.AdvanceSeconds(1); + EXPECT_EQ(200000, throttle_.available_tokens()); +} + +} // namespace + +} // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/http_request_fake.h b/tensorflow/core/platform/cloud/http_request_fake.h index 682b97f6ec6..7711eaceb29 100644 --- a/tensorflow/core/platform/cloud/http_request_fake.h +++ b/tensorflow/core/platform/cloud/http_request_fake.h @@ -38,8 +38,7 @@ class FakeHttpRequest : public CurlHttpRequest { public: /// Return the response for the given request. 
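To make the test expectations above easier to check by hand: RecordResponse charges one token per KiB of payload (num_bytes >> 10), AdmitRequest charges a flat tokens_per_request, and UpdateState adds token_rate tokens per elapsed second, capped at bucket_size and never subtracted for backwards time (hence the ReverseTime test). With the defaults from gcs_throttle.h, token_rate = 100000 tokens/s is about 100000 KiB/s, roughly 820 Mbit/s, which is where the "approximately 800 MBits/second" comment comes from. The standalone snippet below reproduces the MarkResponses and BucketLimit numbers; it is an illustration of the arithmetic, not code from the patch.

#include <cassert>
#include <cstdint>

int main() {
  // Defaults from GcsThrottleConfig.
  const int64_t token_rate = 100000;        // tokens replenished per second
  const int64_t tokens_per_request = 100;   // flat cost of AdmitRequest()
  const int64_t bucket_size = 10000000;     // cap on accumulated tokens

  int64_t available = 0;                    // initial_tokens defaults to 0
  available += 1 * token_rate;              // time_.AdvanceSeconds(1)
  available -= tokens_per_request;          // AdmitRequest()          -> 99900
  available -= int64_t{128000000} >> 10;    // RecordResponse(128 MB)   = 125000 tokens
  assert(available == -25100);              // matches the MarkResponses test

  // BucketLimit: 120 s of replenishment is clamped to bucket_size.
  int64_t refilled = 120 * token_rate;
  if (refilled > bucket_size) refilled = bucket_size;
  assert(refilled == 10000000);
  return 0;
}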
FakeHttpRequest(const string& request, const string& response) - : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) { - } + : FakeHttpRequest(request, response, Status::OK(), nullptr, {}, 200) {} /// Return the response with headers for the given request. FakeHttpRequest(const string& request, const string& response, diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc index 236259dbc16..ad569758cc6 100644 --- a/tensorflow/core/platform/cloud/oauth_client_test.cc +++ b/tensorflow/core/platform/cloud/oauth_client_test.cc @@ -160,12 +160,12 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) { ASSERT_EQ(1, EVP_DigestVerifyInit(md_ctx, nullptr, md, nullptr, key)); ASSERT_EQ(1, EVP_DigestVerifyUpdate(md_ctx, header_dot_claim.c_str(), header_dot_claim.size())); - ASSERT_EQ( - 1, - EVP_DigestVerifyFinal( - md_ctx, const_cast( - reinterpret_cast(signature.data())), - signature.size())); + ASSERT_EQ(1, + EVP_DigestVerifyFinal( + md_ctx, + const_cast( + reinterpret_cast(signature.data())), + signature.size())); EVP_MD_CTX_cleanup(md_ctx); // Free all the crypto-related resources. diff --git a/tensorflow/core/platform/cloud/retrying_file_system.cc b/tensorflow/core/platform/cloud/retrying_file_system.cc index c3b68313613..be9ebe67b18 100644 --- a/tensorflow/core/platform/cloud/retrying_file_system.cc +++ b/tensorflow/core/platform/cloud/retrying_file_system.cc @@ -25,7 +25,6 @@ namespace tensorflow { namespace { - class RetryingRandomAccessFile : public RandomAccessFile { public: RetryingRandomAccessFile(std::unique_ptr base_file, @@ -203,4 +202,6 @@ Status RetryingFileSystem::DeleteRecursively(const string& dirname, initial_delay_microseconds_); } +void RetryingFileSystem::FlushCaches() { base_file_system_->FlushCaches(); } + } // namespace tensorflow diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h index d9d8ea6b004..a262a5fd940 100644 --- a/tensorflow/core/platform/cloud/retrying_file_system.h +++ b/tensorflow/core/platform/cloud/retrying_file_system.h @@ -69,6 +69,8 @@ class RetryingFileSystem : public FileSystem { Status DeleteRecursively(const string& dirname, int64* undeleted_files, int64* undeleted_dirs) override; + void FlushCaches() override; + private: std::unique_ptr base_file_system_; const int64 initial_delay_microseconds_; diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc index 232dcb3e71a..d3f763bb3c8 100644 --- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc +++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc @@ -84,7 +84,8 @@ class MockWritableFile : public WritableFile { class MockFileSystem : public FileSystem { public: - explicit MockFileSystem(const ExpectedCalls& calls) : calls_(calls) {} + explicit MockFileSystem(const ExpectedCalls& calls, bool* flushed = nullptr) + : calls_(calls), flushed_(flushed) {} Status NewRandomAccessFile( const string& fname, std::unique_ptr* result) override { @@ -156,11 +157,18 @@ class MockFileSystem : public FileSystem { return calls_.ConsumeNextCall("DeleteRecursively"); } + void FlushCaches() override { + if (flushed_) { + *flushed_ = true; + } + } + std::unique_ptr writable_file_to_return; std::unique_ptr random_access_file_to_return; private: MockCallSequence calls_; + bool* flushed_ = nullptr; }; TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) { @@ 
-702,5 +710,14 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) { << status; } +TEST(RetryingFileSystemTest, FlushCaches) { + ExpectedCalls none; + bool flushed = false; + std::unique_ptr base_fs(new MockFileSystem(none, &flushed)); + RetryingFileSystem fs(std::move(base_fs), 0); + fs.FlushCaches(); + EXPECT_TRUE(flushed); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc index b0d7b3a67ae..7caf9d4db64 100644 --- a/tensorflow/core/platform/cpu_feature_guard.cc +++ b/tensorflow/core/platform/cpu_feature_guard.cc @@ -97,14 +97,17 @@ std::once_flag g_cpu_feature_guard_warn_once_flag; void InfoAboutUnusedCPUFeatures() { std::call_once(g_cpu_feature_guard_warn_once_flag, [] { string missing_instructions; -#ifdef PLATFORM_WINDOWS +#if defined(_MSC_VER) && !defined(__clang__) + #ifndef __AVX__ CheckIfFeatureUnused(CPUFeature::AVX, "AVX", missing_instructions); #endif // __AVX__ #ifndef __AVX2__ CheckIfFeatureUnused(CPUFeature::AVX2, "AVX2", missing_instructions); #endif // __AVX2__ -#else // ifdef platform windows + +#else // if defined(_MSC_VER) && !defined(__clang__) + #ifndef __SSE__ CheckIfFeatureUnused(CPUFeature::SSE, "SSE", missing_instructions); #endif // __SSE__ @@ -132,7 +135,7 @@ void InfoAboutUnusedCPUFeatures() { #ifndef __FMA__ CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions); #endif // __FMA__ -#endif // else of ifdef platform windows +#endif // else of if defined(_MSC_VER) && !defined(__clang__) if (!missing_instructions.empty()) { LOG(INFO) << "Your CPU supports instructions that this TensorFlow " << "binary was not compiled to use:" << missing_instructions; diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc index 639f6804ea2..2d34239a995 100644 --- a/tensorflow/core/platform/cuda_libdevice_path_test.cc +++ b/tensorflow/core/platform/cuda_libdevice_path_test.cc @@ -27,8 +27,7 @@ TEST(CudaLibdevicePathTest, LibdevicePath) { VLOG(2) << "Libdevice root = " << LibdeviceRoot(); std::vector libdevice_files; TF_EXPECT_OK(Env::Default()->GetMatchingPaths( - io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), - &libdevice_files)); + io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files)); EXPECT_LT(0, libdevice_files.size()); } #endif diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc index f4b0f16393d..8e60a7f0910 100644 --- a/tensorflow/core/platform/default/device_tracer.cc +++ b/tensorflow/core/platform/default/device_tracer.cc @@ -579,8 +579,10 @@ Status DeviceTracerImpl::Collect(StepStatsCollector *collector) { // TODO(pbar) Handle device IDs and prefix properly. 
const string prefix = ""; const int id = 0; - const string stream_device = strings::StrCat(prefix, "/device:GPU:", id, "/stream:"); - const string memcpy_device = strings::StrCat(prefix, "/device:GPU:", id, "/memcpy"); + const string stream_device = + strings::StrCat(prefix, "/device:GPU:", id, "/stream:"); + const string memcpy_device = + strings::StrCat(prefix, "/device:GPU:", id, "/memcpy"); mutex_lock l2(trace_mu_); for (const auto &rec : kernel_records_) { diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc index 82bd69f9ca4..2b874da1981 100644 --- a/tensorflow/core/platform/default/logging.cc +++ b/tensorflow/core/platform/default/logging.cc @@ -83,15 +83,14 @@ void LogMessage::GenerateLogMessage() { const size_t time_buffer_size = 30; char time_buffer[time_buffer_size]; strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S", - localtime(&now_seconds)); + localtime(&now_seconds)); // TODO(jeff,sanjay): Replace this with something that logs through the env. fprintf(stderr, "%s.%06d: %c %s:%d] %s\n", time_buffer, micros_remainder, - "IWEF"[severity_], fname_, line_, str().c_str()); + "IWEF"[severity_], fname_, line_, str().c_str()); } #endif - namespace { // Parse log level (int64) from environment variable (char*) diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h index 40c260f2366..f0efa31d557 100644 --- a/tensorflow/core/platform/default/logging.h +++ b/tensorflow/core/platform/default/logging.h @@ -19,8 +19,8 @@ limitations under the License. // IWYU pragma: private, include "third_party/tensorflow/core/platform/logging.h" // IWYU pragma: friend third_party/tensorflow/core/platform/logging.h -#include #include +#include #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -205,16 +205,18 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) { inline string* name##Impl(int v1, int v2, const char* exprtext) { \ return name##Impl(v1, v2, exprtext); \ } \ - inline string* name##Impl(const size_t v1, const int v2, const char* exprtext) { \ + inline string* name##Impl(const size_t v1, const int v2, \ + const char* exprtext) { \ if (TF_PREDICT_FALSE(v2 < 0)) { \ - return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\ + return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \ } \ const size_t uval = (size_t)((unsigned)v1); \ return name##Impl(uval, v2, exprtext); \ } \ - inline string* name##Impl(const int v1, const size_t v2, const char* exprtext) { \ - if (TF_PREDICT_FALSE(v2 >= std::numeric_limits::max())) { \ - return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\ + inline string* name##Impl(const int v1, const size_t v2, \ + const char* exprtext) { \ + if (TF_PREDICT_FALSE(v2 >= std::numeric_limits::max())) { \ + return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \ } \ const size_t uval = (size_t)((unsigned)v2); \ return name##Impl(v1, uval, exprtext); \ @@ -225,12 +227,12 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) { // This happens if, for example, those are used as token names in a // yacc grammar. TF_DEFINE_CHECK_OP_IMPL(Check_EQ, - == ) // Compilation error with CHECK_EQ(NULL, x)? -TF_DEFINE_CHECK_OP_IMPL(Check_NE, != ) // Use CHECK(x == NULL) instead. 
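The size_t/int overloads being re-wrapped here guard against the usual arithmetic conversions: comparing a size_t with a negative int would otherwise convert the int to a huge unsigned value and the check would silently hold. The standalone snippet below illustrates the failure mode and what the guard effectively does; it is an illustration, not code from the patch.

#include <cstddef>
#include <cstdio>

int main() {
  size_t n = 5;
  int limit = -1;
  // Implicit conversion: -1 becomes SIZE_MAX, so the naive comparison "holds".
  std::printf("naive   n <= limit: %d\n", n <= static_cast<size_t>(limit));  // prints 1
  // What the guarded overload above effectively does instead: a negative
  // right-hand side makes the check fail outright.
  bool check_holds = (limit < 0) ? false : (n <= static_cast<size_t>(limit));
  std::printf("guarded n <= limit: %d\n", check_holds);  // prints 0 -> CHECK_LE fails
  return 0;
}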
-TF_DEFINE_CHECK_OP_IMPL(Check_LE, <= ) -TF_DEFINE_CHECK_OP_IMPL(Check_LT, < ) -TF_DEFINE_CHECK_OP_IMPL(Check_GE, >= ) -TF_DEFINE_CHECK_OP_IMPL(Check_GT, > ) + ==) // Compilation error with CHECK_EQ(NULL, x)? +TF_DEFINE_CHECK_OP_IMPL(Check_NE, !=) // Use CHECK(x == NULL) instead. +TF_DEFINE_CHECK_OP_IMPL(Check_LE, <=) +TF_DEFINE_CHECK_OP_IMPL(Check_LT, <) +TF_DEFINE_CHECK_OP_IMPL(Check_GE, >=) +TF_DEFINE_CHECK_OP_IMPL(Check_GT, >) #undef TF_DEFINE_CHECK_OP_IMPL // In optimized mode, use CheckOpString to hint to compiler that diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc index f13b0af2a79..e00dbdb4ae5 100644 --- a/tensorflow/core/platform/denormal.cc +++ b/tensorflow/core/platform/denormal.cc @@ -41,8 +41,8 @@ namespace tensorflow { namespace port { ScopedFlushDenormal::ScopedFlushDenormal() { -// For now, we flush denormals only on SSE 3. Other architectures such as ARM -// can be added as needed. + // For now, we flush denormals only on SSE 3. Other architectures such as ARM + // can be added as needed. #ifdef DENORM_USE_INTRINSICS if (TestCPUFeature(SSE3)) { diff --git a/tensorflow/core/platform/device_tracer_test.cc b/tensorflow/core/platform/device_tracer_test.cc index c0c08dabacb..89f14e905af 100644 --- a/tensorflow/core/platform/device_tracer_test.cc +++ b/tensorflow/core/platform/device_tracer_test.cc @@ -77,7 +77,8 @@ class DeviceTracerTest : public ::testing::Test { Node* y_neg = test::graph::Unary(&graph, "Neg", i); y_neg_ = y_neg->name(); - y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); + y_neg->set_assigned_device_name( + "/job:localhost/replica:0/task:0/device:GPU:0"); test::graph::ToGraphDef(&graph, &def_); } diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h index 557bfa87e50..34aaf3f78ba 100644 --- a/tensorflow/core/platform/env.h +++ b/tensorflow/core/platform/env.h @@ -286,7 +286,7 @@ class Env { // "version" should be the version of the library or NULL // returns the name that LoadLibrary() can use virtual string FormatLibraryFileName(const string& name, - const string& version) = 0; + const string& version) = 0; private: // Returns a possible list of local temporary directories. @@ -353,6 +353,7 @@ class EnvWrapper : public Env { const string& version) override { return target_->FormatLibraryFileName(name, version); } + private: Env* target_; }; diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc index 14755891fa2..b9866cf641a 100644 --- a/tensorflow/core/platform/file_system.cc +++ b/tensorflow/core/platform/file_system.cc @@ -131,18 +131,19 @@ Status FileSystem::GetMatchingPaths(const string& pattern, if (children.empty()) continue; // This IsDirectory call can be expensive for some FS. Parallelizing it. children_dir_status.resize(children.size()); - ForEach(0, children.size(), [this, ¤t_dir, &children, &fixed_prefix, - &children_dir_status](int i) { - const string child_path = io::JoinPath(current_dir, children[i]); - // In case the child_path doesn't start with the fixed_prefix then - // we don't need to explore this path. 
- if (!StringPiece(child_path).starts_with(fixed_prefix)) { - children_dir_status[i] = - Status(tensorflow::error::CANCELLED, "Operation not needed"); - } else { - children_dir_status[i] = IsDirectory(child_path); - } - }); + ForEach(0, children.size(), + [this, ¤t_dir, &children, &fixed_prefix, + &children_dir_status](int i) { + const string child_path = io::JoinPath(current_dir, children[i]); + // In case the child_path doesn't start with the fixed_prefix then + // we don't need to explore this path. + if (!StringPiece(child_path).starts_with(fixed_prefix)) { + children_dir_status[i] = Status(tensorflow::error::CANCELLED, + "Operation not needed"); + } else { + children_dir_status[i] = IsDirectory(child_path); + } + }); for (int i = 0; i < children.size(); ++i) { const string child_path = io::JoinPath(current_dir, children[i]); // If the IsDirectory call was cancelled we bail. diff --git a/tensorflow/core/platform/gif.h b/tensorflow/core/platform/gif.h index 9c72d34ff51..ab095a35c93 100644 --- a/tensorflow/core/platform/gif.h +++ b/tensorflow/core/platform/gif.h @@ -20,7 +20,8 @@ limitations under the License. #if defined(PLATFORM_GOOGLE) #include "tensorflow/core/platform/google/build_config/gif.h" -#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID) +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \ + defined(PLATFORM_POSIX_ANDROID) #include #else #error Define the appropriate PLATFORM_ macro for this platform diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index 0baeac09841..74863293a32 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -164,8 +164,9 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) { } else { hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str()); } - // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is the build in - // environment variable of Kerberos, so KERB_TICKET_CACHE_PATH and related code are unnecessary. + // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is + // the build in environment variable of Kerberos, so KERB_TICKET_CACHE_PATH + // and related code are unnecessary. char* ticket_cache_path = getenv("KERB_TICKET_CACHE_PATH"); if (ticket_cache_path != nullptr) { hdfs_->hdfsBuilderSetKerbTicketCachePath(builder, ticket_cache_path); diff --git a/tensorflow/core/platform/jpeg.h b/tensorflow/core/platform/jpeg.h index edbcbd960a7..1b5e633f0aa 100644 --- a/tensorflow/core/platform/jpeg.h +++ b/tensorflow/core/platform/jpeg.h @@ -20,7 +20,8 @@ limitations under the License. #if defined(PLATFORM_GOOGLE) #include "tensorflow/core/platform/google/build_config/jpeg.h" -#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID) +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \ + defined(PLATFORM_POSIX_ANDROID) #include #include #include diff --git a/tensorflow/core/platform/png.h b/tensorflow/core/platform/png.h index 5b0203c343e..dad18d72195 100644 --- a/tensorflow/core/platform/png.h +++ b/tensorflow/core/platform/png.h @@ -20,7 +20,8 @@ limitations under the License. 
#if defined(PLATFORM_GOOGLE) #include "tensorflow/core/platform/google/build_config/png.h" -#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID) +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \ + defined(PLATFORM_POSIX_ANDROID) #include #else #error Define the appropriate PLATFORM_ macro for this platform diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc index cda6d7d8f9d..2bb9443fb3c 100644 --- a/tensorflow/core/platform/posix/error.cc +++ b/tensorflow/core/platform/posix/error.cc @@ -73,19 +73,19 @@ error::Code ErrnoToCode(int err_number) { case ECHILD: // No child processes case EISCONN: // Socket is connected #if !defined(_WIN32) && !defined(__HAIKU__) - case ENOTBLK: // Block device required + case ENOTBLK: // Block device required #endif - case ENOTCONN: // The socket is not connected - case EPIPE: // Broken pipe + case ENOTCONN: // The socket is not connected + case EPIPE: // Broken pipe #if !defined(_WIN32) - case ESHUTDOWN: // Cannot send after transport endpoint shutdown + case ESHUTDOWN: // Cannot send after transport endpoint shutdown #endif - case ETXTBSY: // Text file busy + case ETXTBSY: // Text file busy code = error::FAILED_PRECONDITION; break; - case ENOSPC: // No space left on device + case ENOSPC: // No space left on device #if !defined(_WIN32) - case EDQUOT: // Disk quota exceeded + case EDQUOT: // Disk quota exceeded #endif case EMFILE: // Too many open files case EMLINK: // Too many links @@ -95,7 +95,7 @@ error::Code ErrnoToCode(int err_number) { case ENOMEM: // Not enough space case ENOSR: // No STREAM resources #if !defined(_WIN32) && !defined(__HAIKU__) - case EUSERS: // Too many users + case EUSERS: // Too many users #endif code = error::RESOURCE_EXHAUSTED; break; @@ -104,17 +104,17 @@ error::Code ErrnoToCode(int err_number) { case ERANGE: // Result too large code = error::OUT_OF_RANGE; break; - case ENOSYS: // Function not implemented - case ENOTSUP: // Operation not supported - case EAFNOSUPPORT: // Address family not supported + case ENOSYS: // Function not implemented + case ENOTSUP: // Operation not supported + case EAFNOSUPPORT: // Address family not supported #if !defined(_WIN32) - case EPFNOSUPPORT: // Protocol family not supported + case EPFNOSUPPORT: // Protocol family not supported #endif case EPROTONOSUPPORT: // Protocol not supported #if !defined(_WIN32) && !defined(__HAIKU__) case ESOCKTNOSUPPORT: // Socket type not supported #endif - case EXDEV: // Improper link + case EXDEV: // Improper link code = error::UNIMPLEMENTED; break; case EAGAIN: // Resource temporarily unavailable @@ -123,7 +123,7 @@ error::Code ErrnoToCode(int err_number) { case ECONNRESET: // Connection reset case EINTR: // Interrupted function call #if !defined(_WIN32) - case EHOSTDOWN: // Host is down + case EHOSTDOWN: // Host is down #endif case EHOSTUNREACH: // Host is unreachable case ENETDOWN: // Network is down @@ -139,7 +139,7 @@ error::Code ErrnoToCode(int err_number) { break; case EDEADLK: // Resource deadlock avoided #if !defined(_WIN32) - case ESTALE: // Stale file handle + case ESTALE: // Stale file handle #endif code = error::ABORTED; break; @@ -158,7 +158,7 @@ error::Code ErrnoToCode(int err_number) { case ENOMSG: // No message of the desired type case EPROTO: // Protocol error #if !defined(_WIN32) && !defined(__HAIKU__) - case EREMOTE: // Object is remote + case EREMOTE: // Object is remote #endif code = error::UNKNOWN; break; diff --git 
a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h index 8604b01c53e..ce2069b0044 100644 --- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h +++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h @@ -58,8 +58,8 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper { TF_DISALLOW_COPY_AND_ASSIGN(AndroidArmV7ACpuUtilsHelper); }; -} // profile_utils -} // tensorflow +} // namespace profile_utils +} // namespace tensorflow #endif // defined(__ANDROID__) && (__ANDROID_API__ >= 21) && // (defined(__ARM_ARCH_7A__) || defined(__aarch64__)) diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc index d3362690d7e..02de7d1362b 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.cc +++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc @@ -28,15 +28,17 @@ namespace profile_utils { static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr; -#if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__)) - /* static */ uint64 CpuUtils::GetCycleCounterFrequency() { - static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl(); - return cpu_frequency; +#if (defined(__powerpc__) || \ + defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + (defined(__s390x__)) +/* static */ uint64 CpuUtils::GetCycleCounterFrequency() { + static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl(); + return cpu_frequency; } #else - /* static */ int64 CpuUtils::GetCycleCounterFrequency() { - static const int64 cpu_frequency = GetCycleCounterFrequencyImpl(); - return cpu_frequency; +/* static */ int64 CpuUtils::GetCycleCounterFrequency() { + static const int64 cpu_frequency = GetCycleCounterFrequencyImpl(); + return cpu_frequency; } #endif diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h index e95843b80a5..7b580c8bf60 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.h +++ b/tensorflow/core/platform/profile_utils/cpu_utils.h @@ -94,14 +94,16 @@ class CpuUtils { #endif } - // Return cycle counter frequency. - // As this method caches the cpu frequency internally, - // the first call will incur overhead, but not subsequent calls. - #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__)) - static uint64 GetCycleCounterFrequency(); - #else - static int64 GetCycleCounterFrequency(); - #endif +// Return cycle counter frequency. +// As this method caches the cpu frequency internally, +// the first call will incur overhead, but not subsequent calls. 
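The caching this comment refers to is the function-local static in cpu_utils.cc: the Impl probe runs once and every later call returns the saved value. A minimal sketch of that idiom is below; ExpensiveProbe and its 2.4 GHz return value are hypothetical stand-ins for GetCycleCounterFrequencyImpl, not part of the patch.

#include <cstdint>
#include <cstdio>

static int probe_calls = 0;
int64_t ExpensiveProbe() {        // hypothetical stand-in for the real probe
  ++probe_calls;
  return 2400000000LL;            // made-up 2.4 GHz value for illustration
}

int64_t CachedCycleCounterFrequency() {
  // The function-local static is initialized exactly once (thread-safe since
  // C++11), so only the first caller pays for the probe.
  static const int64_t cpu_frequency = ExpensiveProbe();
  return cpu_frequency;
}

int main() {
  CachedCycleCounterFrequency();
  CachedCycleCounterFrequency();
  std::printf("probe ran %d time(s)\n", probe_calls);  // prints 1
  return 0;
}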
+#if (defined(__powerpc__) || \ + defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + (defined(__s390x__)) + static uint64 GetCycleCounterFrequency(); +#else + static int64 GetCycleCounterFrequency(); +#endif // Return micro second per each clock // As this method caches the cpu frequency internally, diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc index 5b11b684dd9..eb8161fbfd5 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc +++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc @@ -53,15 +53,17 @@ TEST_F(CpuUtilsTest, CheckGetCurrentClockCycle) { } TEST_F(CpuUtilsTest, CheckCycleCounterFrequency) { - #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__)) - const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency(); - CHECK_GT(cpu_frequency, 0); - CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY)); - #else - const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency(); - CHECK_GT(cpu_frequency, 0); - CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY); - #endif +#if (defined(__powerpc__) || \ + defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + (defined(__s390x__)) + const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency(); + CHECK_GT(cpu_frequency, 0); + CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY)); +#else + const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency(); + CHECK_GT(cpu_frequency, 0); + CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY); +#endif if (DBG) { LOG(INFO) << "Cpu frequency = " << cpu_frequency; } diff --git a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h index 51c54d50d1d..11b739c0096 100644 --- a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h +++ b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h @@ -47,7 +47,7 @@ class ICpuUtilsHelper { TF_DISALLOW_COPY_AND_ASSIGN(ICpuUtilsHelper); }; -} // profile_utils -} // tensorflow +} // namespace profile_utils +} // namespace tensorflow #endif // TENSORFLOW_PLATFORM_PROFILEUTILS_I_CPU_UTILS_HELPER_H__ diff --git a/tensorflow/core/platform/protobuf_internal.h b/tensorflow/core/platform/protobuf_internal.h index 7d6e8f57a62..2f151a5aee6 100644 --- a/tensorflow/core/platform/protobuf_internal.h +++ b/tensorflow/core/platform/protobuf_internal.h @@ -45,8 +45,8 @@ Status ParseAny(const google::protobuf::Any& any, T* message, #ifdef TENSORFLOW_LITE_PROTOS if (any.type_url() != strings::StrCat("type.googleapis.com/", type_name)) { return errors::FailedPrecondition( - "Expected Any type_url for: ", type_name, ". Got: ", - string(any.type_url().data(), any.type_url().size()), "."); + "Expected Any type_url for: ", type_name, + ". 
Got: ", string(any.type_url().data(), any.type_url().size()), "."); } if (!message->ParseFromString(any.value())) { return errors::FailedPrecondition("Failed to unpack: ", diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc index fbca0acc36b..44317f1a3e4 100644 --- a/tensorflow/core/platform/s3/aws_logging.cc +++ b/tensorflow/core/platform/s3/aws_logging.cc @@ -96,7 +96,7 @@ Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() { return log_level; } -} +} // namespace static bool initialized = false; static mutex s3_logging_mutex(LINKER_INITIALIZED); diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index 1e89fa77c10..4862fd85be0 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -306,8 +306,15 @@ std::shared_ptr S3FileSystem::GetS3Client() { }; Aws::InitAPI(options); - this->s3_client_ = std::shared_ptr( - new Aws::S3::S3Client(GetDefaultClientConfig())); + // The creation of S3Client disables virtual addressing: + // S3Client(clientConfiguration, signPayloads, useVirtualAdressing = true) + // The purpose is to address the issue encountered when there is an `.` + // in the bucket name. Due to TLS hostname validation or DNS rules, + // the bucket may not be resolved. Disabling of virtual addressing + // should address the issue. See GitHub issue 16397 for details. + this->s3_client_ = std::shared_ptr(new Aws::S3::S3Client( + GetDefaultClientConfig(), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false)); } return this->s3_client_; diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h index 168b8007f3b..8177e48dba5 100644 --- a/tensorflow/core/platform/s3/s3_file_system.h +++ b/tensorflow/core/platform/s3/s3_file_system.h @@ -57,6 +57,18 @@ class S3FileSystem : public FileSystem { Status RenameFile(const string& src, const string& target) override; private: // Returns the member S3 client, initializing as-needed. + // When the client tries to access the object in S3, e.g., + // s3://bucket-name/path/to/object + // the behavior could be controlled by various environmental + // variables. + // By default S3 access regional endpoint, with region + // controlled by `AWS_REGION`. The endpoint could be overridden + // explicitly with `S3_ENDPOINT`. S3 uses HTTPS by default. + // If S3_USE_HTTPS=0 is specified, HTTP is used. Also, + // S3_VERIFY_SSL=0 could disable SSL verification in case + // HTTPS is used. + // This S3 Client does not support Virtual Hosted–Style Method + // for a bucket. std::shared_ptr GetS3Client(); std::shared_ptr s3_client_; diff --git a/tensorflow/core/platform/setround.cc b/tensorflow/core/platform/setround.cc index 0c66da09bb9..592626bfa17 100644 --- a/tensorflow/core/platform/setround.cc +++ b/tensorflow/core/platform/setround.cc @@ -15,7 +15,6 @@ limitations under the License. 
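Two points from the S3 comments above can be made concrete. Disabling virtual addressing means requests use the path-style form https://s3.<region>.amazonaws.com/bucket-name/key rather than https://bucket-name.s3.<region>.amazonaws.com/key, which is why a `.` inside the bucket name stops tripping over the wildcard TLS certificate. And the documented environment variables can be pictured as the standalone sketch below; the mapping is an assumed reading of the comment (the real interpretation lives in GetDefaultClientConfig), and the us-east-1 fallback region is hypothetical.

#include <cstdlib>
#include <string>

struct S3ClientSettingsSketch {
  std::string region = "us-east-1";  // hypothetical fallback region
  std::string endpoint_override;     // empty => use the regional endpoint
  bool use_https = true;
  bool verify_ssl = true;
};

S3ClientSettingsSketch S3SettingsFromEnv() {
  S3ClientSettingsSketch s;
  if (const char* region = std::getenv("AWS_REGION")) s.region = region;
  if (const char* endpoint = std::getenv("S3_ENDPOINT")) s.endpoint_override = endpoint;
  if (const char* https = std::getenv("S3_USE_HTTPS")) s.use_https = std::string(https) != "0";
  if (const char* verify = std::getenv("S3_VERIFY_SSL")) s.verify_ssl = std::string(verify) != "0";
  return s;
}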
#include "tensorflow/core/platform/setround.h" - namespace tensorflow { namespace port { diff --git a/tensorflow/core/platform/test_benchmark.h b/tensorflow/core/platform/test_benchmark.h index a6636225ccb..327237dba93 100644 --- a/tensorflow/core/platform/test_benchmark.h +++ b/tensorflow/core/platform/test_benchmark.h @@ -60,7 +60,7 @@ class Benchmark { private: string name_; int num_args_; - std::vector> args_; + std::vector > args_; void (*fn0_)(int) = nullptr; void (*fn1_)(int, int) = nullptr; void (*fn2_)(int, int, int) = nullptr; diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h index d6e78dbc8f9..f20939d3c0f 100644 --- a/tensorflow/core/platform/windows/cpu_info.h +++ b/tensorflow/core/platform/windows/cpu_info.h @@ -22,8 +22,10 @@ limitations under the License. // Byte order defines provided by gcc. MSVC doesn't define those so // we define them here. // We assume that all windows platform out there are little endian. +#if defined(_MSC_VER) && !defined(__clang__) #define __ORDER_LITTLE_ENDIAN__ 0x4d2 #define __ORDER_BIG_ENDIAN__ 0x10e1 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#endif #endif // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_ diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc index 788a4bf4b1a..41b26441707 100644 --- a/tensorflow/core/platform/windows/env.cc +++ b/tensorflow/core/platform/windows/env.cc @@ -24,9 +24,9 @@ limitations under the License. #undef LoadLibrary #undef ERROR +#include #include #include -#include #include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/platform/load_library.h" @@ -53,8 +53,7 @@ class StdThread : public Thread { class WindowsEnv : public Env { public: - WindowsEnv() - : GetSystemTimePreciseAsFileTime_(NULL) { + WindowsEnv() : GetSystemTimePreciseAsFileTime_(NULL) { // GetSystemTimePreciseAsFileTime function is only available in the latest // versions of Windows. 
For that reason, we try to look it up in // kernel32.dll at runtime and use an alternative option if the function @@ -72,8 +71,8 @@ class WindowsEnv : public Env { } bool MatchPath(const string& path, const string& pattern) override { - std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path)); - std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern)); + std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path)); + std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern)); return PathMatchSpecW(ws_path.c_str(), ws_pattern.c_str()) == TRUE; } @@ -122,14 +121,14 @@ class WindowsEnv : public Env { SetThreadpoolTimer(timer, &FileDueTime, 0, 0); } - Status LoadLibrary(const char *library_filename, void** handle) override { + Status LoadLibrary(const char* library_filename, void** handle) override { std::string file_name = library_filename; std::replace(file_name.begin(), file_name.end(), '/', '\\'); std::wstring ws_file_name(WindowsFileSystem::Utf8ToWideChar(file_name)); HMODULE hModule = LoadLibraryExW(ws_file_name.c_str(), NULL, - LOAD_WITH_ALTERED_SEARCH_PATH); + LOAD_WITH_ALTERED_SEARCH_PATH); if (!hModule) { return errors::NotFound(file_name + " not found"); } @@ -138,31 +137,30 @@ class WindowsEnv : public Env { } Status GetSymbolFromLibrary(void* handle, const char* symbol_name, - void** symbol) override { + void** symbol) override { FARPROC found_symbol; found_symbol = GetProcAddress((HMODULE)handle, symbol_name); if (found_symbol == NULL) { return errors::NotFound(std::string(symbol_name) + " not found"); } - *symbol = (void **)found_symbol; + *symbol = (void**)found_symbol; return Status::OK(); } - string FormatLibraryFileName(const string& name, const string& version) - override { + string FormatLibraryFileName(const string& name, + const string& version) override { string filename; if (version.size() == 0) { filename = name + ".dll"; - } - else { + } else { filename = name + version + ".dll"; } return filename; } private: - typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); + typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME); FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; }; diff --git a/tensorflow/core/platform/windows/error.cc b/tensorflow/core/platform/windows/error.cc index 39e941a3834..291fc5003fb 100644 --- a/tensorflow/core/platform/windows/error.cc +++ b/tensorflow/core/platform/windows/error.cc @@ -21,7 +21,7 @@ namespace internal { std::string GetWindowsErrorMessage(DWORD err) { LPSTR buffer = NULL; DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS; + FORMAT_MESSAGE_IGNORE_INSERTS; FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), reinterpret_cast(&buffer), 0, NULL); std::string message = buffer; diff --git a/tensorflow/core/platform/windows/error.h b/tensorflow/core/platform/windows/error.h index 026e0d5aa94..ba643a0fa8f 100644 --- a/tensorflow/core/platform/windows/error.h +++ b/tensorflow/core/platform/windows/error.h @@ -24,9 +24,7 @@ namespace tensorflow { namespace internal { std::string GetWindowsErrorMessage(DWORD err); - -} } +} // namespace tensorflow #endif // TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_ - diff --git a/tensorflow/core/platform/windows/integral_types.h b/tensorflow/core/platform/windows/integral_types.h index 4970b8ca6a1..46338a536db 100644 --- a/tensorflow/core/platform/windows/integral_types.h +++ b/tensorflow/core/platform/windows/integral_types.h @@ -1,18 +1,18 @@ - /* Copyright 
2017 The TensorFlow Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - ==============================================================================*/ - +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + #ifndef TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_ #define TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_ diff --git a/tensorflow/core/platform/windows/net.cc b/tensorflow/core/platform/windows/net.cc index 46eb072d425..2ab558ab95c 100644 --- a/tensorflow/core/platform/windows/net.cc +++ b/tensorflow/core/platform/windows/net.cc @@ -26,7 +26,7 @@ limitations under the License. #undef ERROR -#pragma comment(lib,"Ws2_32.lib") +#pragma comment(lib, "Ws2_32.lib") namespace tensorflow { namespace internal { @@ -44,8 +44,8 @@ bool IsPortAvailable(int* port, bool is_tcp) { CHECK_GE(*port, 0); CHECK_LE(*port, 65535); if (sock == INVALID_SOCKET) { - LOG(ERROR) << "socket() failed: " << - GetWindowsErrorMessage(WSAGetLastError()); + LOG(ERROR) << "socket() failed: " + << GetWindowsErrorMessage(WSAGetLastError()); return false; } @@ -54,8 +54,8 @@ bool IsPortAvailable(int* port, bool is_tcp) { int result = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&one), sizeof(one)); if (result == SOCKET_ERROR) { - LOG(ERROR) << "setsockopt() failed: " << - GetWindowsErrorMessage(WSAGetLastError()); + LOG(ERROR) << "setsockopt() failed: " + << GetWindowsErrorMessage(WSAGetLastError()); closesocket(sock); return false; } @@ -66,8 +66,8 @@ bool IsPortAvailable(int* port, bool is_tcp) { addr.sin_port = htons((uint16_t)*port); result = bind(sock, (struct sockaddr*)&addr, sizeof(addr)); if (result == SOCKET_ERROR) { - LOG(WARNING) << "bind(port=" << *port << ") failed: " << - GetWindowsErrorMessage(WSAGetLastError()); + LOG(WARNING) << "bind(port=" << *port + << ") failed: " << GetWindowsErrorMessage(WSAGetLastError()); closesocket(sock); return false; } @@ -75,8 +75,8 @@ bool IsPortAvailable(int* port, bool is_tcp) { // Get the bound port number. 
result = getsockname(sock, (struct sockaddr*)&addr, &addr_len); if (result == SOCKET_ERROR) { - LOG(WARNING) << "getsockname() failed: " << - GetWindowsErrorMessage(WSAGetLastError()); + LOG(WARNING) << "getsockname() failed: " + << GetWindowsErrorMessage(WSAGetLastError()); closesocket(sock); return false; } diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h index b65313363ed..66ec44885d5 100644 --- a/tensorflow/core/platform/windows/subprocess.h +++ b/tensorflow/core/platform/windows/subprocess.h @@ -19,8 +19,7 @@ limitations under the License. namespace tensorflow { // SubProcess is not yet implemented for Windows. -class SubProcess { -}; +class SubProcess {}; } // namespace tensorflow diff --git a/tensorflow/core/platform/windows/test.cc b/tensorflow/core/platform/windows/test.cc index 0ffd02ff148..584acad91b2 100644 --- a/tensorflow/core/platform/windows/test.cc +++ b/tensorflow/core/platform/windows/test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/platform/net.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/net.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc index 604348fe03a..b6b3722caae 100644 --- a/tensorflow/core/platform/windows/windows_file_system.cc +++ b/tensorflow/core/platform/windows/windows_file_system.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include #include #include -#include #undef StrCat #include #include @@ -75,16 +75,16 @@ SSIZE_T pread(HANDLE hfile, char* src, size_t num_bytes, uint64_t offset) { if (TRUE == read_result) { result = bytes_read; } else if ((FALSE == read_result) && - ((last_error = GetLastError()) != ERROR_IO_PENDING)) { + ((last_error = GetLastError()) != ERROR_IO_PENDING)) { result = (last_error == ERROR_HANDLE_EOF) ? 0 : -1; } else { - if (ERROR_IO_PENDING == last_error) { // Otherwise bytes_read already has the result. - BOOL overlapped_result = ::GetOverlappedResult(hfile, &overlapped, - &bytes_read, TRUE); + if (ERROR_IO_PENDING == + last_error) { // Otherwise bytes_read already has the result. + BOOL overlapped_result = + ::GetOverlappedResult(hfile, &overlapped, &bytes_read, TRUE); if (FALSE == overlapped_result) { result = (::GetLastError() == ERROR_HANDLE_EOF) ? 
0 : -1; - } - else { + } else { result = bytes_read; } } @@ -151,11 +151,11 @@ class WindowsWritableFile : public WritableFile { Status Append(const StringPiece& data) override { DWORD bytes_written = 0; DWORD data_size = static_cast(data.size()); - BOOL write_result = ::WriteFile(hfile_, data.data(), data_size, - &bytes_written, NULL); + BOOL write_result = + ::WriteFile(hfile_, data.data(), data_size, &bytes_written, NULL); if (FALSE == write_result) { - return IOErrorFromWindowsError( - "Failed to WriteFile: " + filename_, ::GetLastError()); + return IOErrorFromWindowsError("Failed to WriteFile: " + filename_, + ::GetLastError()); } assert(size_t(bytes_written) == data.size()); @@ -171,8 +171,8 @@ class WindowsWritableFile : public WritableFile { } if (FALSE == ::CloseHandle(hfile_)) { - return IOErrorFromWindowsError( - "CloseHandle failed for: " + filename_, ::GetLastError()); + return IOErrorFromWindowsError("CloseHandle failed for: " + filename_, + ::GetLastError()); } hfile_ = INVALID_HANDLE_VALUE; @@ -187,9 +187,7 @@ class WindowsWritableFile : public WritableFile { return Status::OK(); } - Status Sync() override { - return Flush(); - } + Status Sync() override { return Flush(); } }; class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { @@ -204,7 +202,10 @@ class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { public: WinReadOnlyMemoryRegion(const std::string& filename, HANDLE hfile, HANDLE hmap, const void* address, uint64 length) - : filename_(filename), hfile_(hfile), hmap_(hmap), address_(address), + : filename_(filename), + hfile_(hfile), + hmap_(hmap), + address_(address), length_(length) {} ~WinReadOnlyMemoryRegion() { @@ -238,9 +239,9 @@ Status WindowsFileSystem::NewRandomAccessFile( // almost all tests would work with a possible exception of fault_injection. 
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, - share_mode, NULL, OPEN_EXISTING, file_flags, - NULL); + HANDLE hfile = + ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL, + OPEN_EXISTING, file_flags, NULL); if (INVALID_HANDLE_VALUE == hfile) { string context = "NewRandomAccessFile failed to Create/Open: " + fname; @@ -258,9 +259,9 @@ Status WindowsFileSystem::NewWritableFile( result->reset(); DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, - share_mode, NULL, CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + HANDLE hfile = + ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode, + NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if (INVALID_HANDLE_VALUE == hfile) { string context = "Failed to create a NewWriteableFile: " + fname; @@ -278,9 +279,9 @@ Status WindowsFileSystem::NewAppendableFile( result->reset(); DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, - share_mode, NULL, OPEN_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + HANDLE hfile = + ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode, + NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if (INVALID_HANDLE_VALUE == hfile) { string context = "Failed to create a NewAppendableFile: " + fname; @@ -316,9 +317,9 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile( file_flags |= FILE_FLAG_OVERLAPPED; DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, - share_mode, NULL, OPEN_EXISTING, file_flags, - NULL); + HANDLE hfile = + ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL, + OPEN_EXISTING, file_flags, NULL); if (INVALID_HANDLE_VALUE == hfile) { return IOErrorFromWindowsError( @@ -345,28 +346,32 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile( NULL); // Mapping name if (!hmap) { - string context = "Failed to create file mapping for " - "NewReadOnlyMemoryRegionFromFile: " + fname; + string context = + "Failed to create file mapping for " + "NewReadOnlyMemoryRegionFromFile: " + + fname; return IOErrorFromWindowsError(context, ::GetLastError()); } UniqueCloseHandlePtr map_guard(hmap, CloseHandleFunc); - const void* mapped_region = ::MapViewOfFileEx( - hmap, FILE_MAP_READ, - 0, // High DWORD of access start - 0, // Low DWORD - file_size, - NULL); // Let the OS choose the mapping + const void* mapped_region = + ::MapViewOfFileEx(hmap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + file_size, + NULL); // Let the OS choose the mapping if (!mapped_region) { - string context = "Failed to MapViewOfFile for " - "NewReadOnlyMemoryRegionFromFile: " + fname; + string context = + "Failed to MapViewOfFile for " + "NewReadOnlyMemoryRegionFromFile: " + + fname; return IOErrorFromWindowsError(context, ::GetLastError()); } - result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap, - mapped_region, file_size)); + result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap, mapped_region, + file_size)); map_guard.release(); file_guard.release(); @@ -404,8 +409,8 @@ Status WindowsFileSystem::GetChildren(const string& dir, } do { - string file_name = WideCharToUtf8(find_data.cFileName); - const StringPiece basename = file_name; + string 
file_name = WideCharToUtf8(find_data.cFileName); + const StringPiece basename = file_name; if (basename != "." && basename != "..") { result->push_back(file_name); } @@ -457,8 +462,7 @@ Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) { file_size.HighPart = attrs.nFileSizeHigh; file_size.LowPart = attrs.nFileSizeLow; *size = file_size.QuadPart; - } - else { + } else { string context = "Can not get size for: " + fname; result = IOErrorFromWindowsError(context, ::GetLastError()); } @@ -472,7 +476,7 @@ Status WindowsFileSystem::RenameFile(const string& src, const string& target) { std::wstring ws_translated_src = Utf8ToWideChar(TranslateName(src)); std::wstring ws_translated_target = Utf8ToWideChar(TranslateName(target)); if (!::MoveFileExW(ws_translated_src.c_str(), ws_translated_target.c_str(), - MOVEFILE_REPLACE_EXISTING)) { + MOVEFILE_REPLACE_EXISTING)) { string context(strings::StrCat("Failed to rename: ", src, " to: ", target)); result = IOErrorFromWindowsError(context, ::GetLastError()); } diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h index 8dcc1530370..ba0302f0fd8 100644 --- a/tensorflow/core/platform/windows/windows_file_system.h +++ b/tensorflow/core/platform/windows/windows_file_system.h @@ -63,33 +63,35 @@ class WindowsFileSystem : public FileSystem { Status RenameFile(const string& src, const string& target) override; - string TranslateName(const string& name) const override { - return name; - } + string TranslateName(const string& name) const override { return name; } static std::wstring Utf8ToWideChar(const string& utf8str) { - int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), NULL, 0); - std::wstring ws_translated_str(size_required, 0); - MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), &ws_translated_str[0], size_required); - return ws_translated_str; + int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), + (int)utf8str.size(), NULL, 0); + std::wstring ws_translated_str(size_required, 0); + MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), + &ws_translated_str[0], size_required); + return ws_translated_str; } - static string WideCharToUtf8(const std::wstring &wstr) { - if (wstr.empty()) return std::string(); - int size_required = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL); - string utf8_translated_str(size_required, 0); - WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), &utf8_translated_str[0], size_required, NULL, NULL); - return utf8_translated_str; + static string WideCharToUtf8(const std::wstring& wstr) { + if (wstr.empty()) return std::string(); + int size_required = WideCharToMultiByte( + CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL); + string utf8_translated_str(size_required, 0); + WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), + &utf8_translated_str[0], size_required, NULL, NULL); + return utf8_translated_str; } }; class LocalWinFileSystem : public WindowsFileSystem { -public: - string TranslateName(const string& name) const override { - StringPiece scheme, host, path; - io::ParseURI(name, &scheme, &host, &path); - return path.ToString(); - } + public: + string TranslateName(const string& name) const override { + StringPiece scheme, host, path; + io::ParseURI(name, &scheme, &host, &path); + return path.ToString(); + } }; } // namespace tensorflow diff --git 
a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md index 9e628b10651..57d76eb4cb9 100644 --- a/tensorflow/core/profiler/README.md +++ b/tensorflow/core/profiler/README.md @@ -240,8 +240,9 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file. # can also generate memory profile using `-select bytes` tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile= -trim_name_regexes .*apply_op.* -# Use pprof to visualize the generated file. -pprof -png --nodecount=100 --sample_index=1 +# Use google-pprof, from the google-perftools package, to visualize the generated file. +# On Ubuntu you can install it with `apt-get install google-perftools`. +google-pprof --pdf --nodecount=100 ``` ![PprofGraph](g3doc/pprof.jpg) @@ -256,7 +257,7 @@ bug fix. `OpLogProto` is a good plus if it is used. #### Teams -* Xin Pan (xpan@google.com, github: panyx0718) +* Xin Pan * Chris Antaki * Yao Zhang * Jon Shlens diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc index d05143aff9b..e968b9c97e2 100644 --- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc +++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc @@ -53,10 +53,13 @@ class TFProfAdvisorTest : public ::testing::Test { NodeExecStats node_stat; node_stat.set_all_start_micros(start_miros); node_stat.set_op_end_rel_micros(end_rel_micros); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:all", + node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat); - node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:0", + node->AddStepStat(step, + "/job:localhost/replica:0/task:0/device:GPU:0:stream:all", + node_stat); + node->AddStepStat(step, + "/job:localhost/replica:0/task:0/device:GPU:0:stream:0", + node_stat); return node; } diff --git a/tensorflow/core/profiler/internal/tfprof_op.cc b/tensorflow/core/profiler/internal/tfprof_op.cc index 5a8429d4893..3dce1d85db3 100644 --- a/tensorflow/core/profiler/internal/tfprof_op.cc +++ b/tensorflow/core/profiler/internal/tfprof_op.cc @@ -113,8 +113,9 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts, root_->formatted_str = FormatNode(root_.get(), root_.get(), opts); } if (timeline) { - fprintf(stderr, "op view doesn't support timeline yet. " - "Consider graph/scope/code view.\n"); + fprintf(stderr, + "op view doesn't support timeline yet. 
" + "Consider graph/scope/code view.\n"); return root_.get(); } if (cnodes_map_.empty()) { @@ -265,9 +266,9 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const { double pct = 0.0; if (node->proto().total_parameters() > 0) { accu_pct = 100.0 * node->proto().total_parameters() / - root->proto().total_parameters(); - pct = 100.0 * node->proto().parameters() / - root->proto().total_parameters(); + root->proto().total_parameters(); + pct = + 100.0 * node->proto().parameters() / root->proto().total_parameters(); } attrs.push_back(strings::Printf( "%30s", @@ -282,9 +283,8 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const { double pct = 0.0; if (node->proto().total_float_ops() > 0) { accu_pct = 100.0 * node->proto().total_float_ops() / - root->proto().total_float_ops(); - pct = 100.0 * node->proto().float_ops() / - root->proto().total_float_ops(); + root->proto().total_float_ops(); + pct = 100.0 * node->proto().float_ops() / root->proto().total_float_ops(); } attrs.push_back(strings::Printf( diff --git a/tensorflow/core/profiler/internal/tfprof_op.h b/tensorflow/core/profiler/internal/tfprof_op.h index fe1c3b2ae82..aa22182d36c 100644 --- a/tensorflow/core/profiler/internal/tfprof_op.h +++ b/tensorflow/core/profiler/internal/tfprof_op.h @@ -41,8 +41,7 @@ namespace tfprof { // to input ops. class TFOp : public TFMultiShow { public: - explicit TFOp() - : TFMultiShow() {} + explicit TFOp() : TFMultiShow() {} ~TFOp() override {} void AddNode(TFGraphNode* node) override; @@ -51,7 +50,7 @@ class TFOp : public TFMultiShow { private: const ShowMultiNode* ShowInternal(const Options& opts, - Timeline* timeline) override; + Timeline* timeline) override; int64 SearchRoot(const std::vector nodes, const std::vector& regexes); diff --git a/tensorflow/core/profiler/internal/tfprof_show.h b/tensorflow/core/profiler/internal/tfprof_show.h index 4d6de060705..81b021549a4 100644 --- a/tensorflow/core/profiler/internal/tfprof_show.h +++ b/tensorflow/core/profiler/internal/tfprof_show.h @@ -78,40 +78,43 @@ class TFShow { return nodes; } std::vector sorted_nodes = nodes; - std::sort(sorted_nodes.begin(), sorted_nodes.end(), [&opts](const T* n1, - const T* n2) { - if (n1->name() == kTFProfRoot) return true; - if (n2->name() == kTFProfRoot) return false; - bool name_cmp = n1->name() < n2->name(); - if (opts.order_by == kOrderBy[0]) { - return name_cmp; - } else if (opts.order_by == kOrderBy[1]) { - return n1->proto().total_requested_bytes() > - n2->proto().total_requested_bytes(); - } else if (opts.order_by == kOrderBy[2]) { - return n1->proto().total_peak_bytes() > n2->proto().total_peak_bytes(); - } else if (opts.order_by == kOrderBy[3]) { - return n1->proto().total_residual_bytes() > - n2->proto().total_residual_bytes(); - } else if (opts.order_by == kOrderBy[4]) { - return n1->proto().total_output_bytes() > - n2->proto().total_output_bytes(); - } else if (opts.order_by == kOrderBy[5]) { - return n1->proto().total_exec_micros() > - n2->proto().total_exec_micros(); - } else if (opts.order_by == kOrderBy[6]) { - return n1->proto().total_accelerator_exec_micros() > - n2->proto().total_accelerator_exec_micros(); - } else if (opts.order_by == kOrderBy[7]) { - return n1->proto().total_cpu_exec_micros() > - n2->proto().total_cpu_exec_micros(); - } else if (opts.order_by == kOrderBy[8]) { - return n1->proto().total_parameters() > n2->proto().total_parameters(); - } else if (opts.order_by == kOrderBy[9]) { - return n1->proto().total_float_ops() > 
n2->proto().total_float_ops(); - } - return name_cmp; - }); + std::sort(sorted_nodes.begin(), sorted_nodes.end(), + [&opts](const T* n1, const T* n2) { + if (n1->name() == kTFProfRoot) return true; + if (n2->name() == kTFProfRoot) return false; + bool name_cmp = n1->name() < n2->name(); + if (opts.order_by == kOrderBy[0]) { + return name_cmp; + } else if (opts.order_by == kOrderBy[1]) { + return n1->proto().total_requested_bytes() > + n2->proto().total_requested_bytes(); + } else if (opts.order_by == kOrderBy[2]) { + return n1->proto().total_peak_bytes() > + n2->proto().total_peak_bytes(); + } else if (opts.order_by == kOrderBy[3]) { + return n1->proto().total_residual_bytes() > + n2->proto().total_residual_bytes(); + } else if (opts.order_by == kOrderBy[4]) { + return n1->proto().total_output_bytes() > + n2->proto().total_output_bytes(); + } else if (opts.order_by == kOrderBy[5]) { + return n1->proto().total_exec_micros() > + n2->proto().total_exec_micros(); + } else if (opts.order_by == kOrderBy[6]) { + return n1->proto().total_accelerator_exec_micros() > + n2->proto().total_accelerator_exec_micros(); + } else if (opts.order_by == kOrderBy[7]) { + return n1->proto().total_cpu_exec_micros() > + n2->proto().total_cpu_exec_micros(); + } else if (opts.order_by == kOrderBy[8]) { + return n1->proto().total_parameters() > + n2->proto().total_parameters(); + } else if (opts.order_by == kOrderBy[9]) { + return n1->proto().total_float_ops() > + n2->proto().total_float_ops(); + } + return name_cmp; + }); return sorted_nodes; } diff --git a/tensorflow/core/profiler/internal/tfprof_show_multi.h b/tensorflow/core/profiler/internal/tfprof_show_multi.h index 2a2208d8e78..711d35f9753 100644 --- a/tensorflow/core/profiler/internal/tfprof_show_multi.h +++ b/tensorflow/core/profiler/internal/tfprof_show_multi.h @@ -50,7 +50,7 @@ class TFMultiShow { protected: virtual const ShowMultiNode* ShowInternal(const Options& opts, - Timeline* timeline) = 0; + Timeline* timeline) = 0; bool LookUpCheckPoint(const string& name, std::unique_ptr* tensor); diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h index 651ad3f0c1c..baf3fb2bedb 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.h +++ b/tensorflow/core/profiler/internal/tfprof_timeline.h @@ -20,8 +20,8 @@ limitations under the License. 
#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/profiler/internal/tfprof_node_show.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { namespace tfprof { diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc index 2cc212d5898..808e3c853be 100644 --- a/tensorflow/core/profiler/profiler.cc +++ b/tensorflow/core/profiler/profiler.cc @@ -206,8 +206,12 @@ int Run(int argc, char** argv) { "graph_path,op_log_path,run_meta_path\n"); std::unique_ptr graph(new GraphDef()); if (!FLAGS_graph_path.empty()) { - TF_CHECK_OK( - ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false)); + s = ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false); + if (!s.ok()) { + fprintf(stderr, "Failed to read graph_path: %s\n", + s.ToString().c_str()); + return 1; + } } std::unique_ptr op_log(new OpLogProto()); diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 67da7bf4526..b02f899b87d 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -24,7 +24,7 @@ limitations under the License. // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") -#define TF_VERSION_SUFFIX "-rc1" +#define TF_VERSION_SUFFIX "" #define TF_STR_HELPER(x) #x #define TF_STR(x) TF_STR_HELPER(x) diff --git a/tensorflow/core/util/bcast.cc b/tensorflow/core/util/bcast.cc index 1eab7e3d024..3a5f1f83af8 100644 --- a/tensorflow/core/util/bcast.cc +++ b/tensorflow/core/util/bcast.cc @@ -69,9 +69,9 @@ BCast::BCast(const Vec& sx, const Vec& sy, const bool fewer_dims_optimization) { State curr = UNKNOWN; const int64 x_i = x[i]; // i-th dimension of x. const int64 y_i = y[i]; // i-th dimension of y. - int64 o_i; // i-th dimension of the output. - int64 bx_i; // i-th broadcast for x. - int64 by_i; // i-th broadcast for y. + int64 o_i; // i-th dimension of the output. + int64 bx_i; // i-th broadcast for x. + int64 by_i; // i-th broadcast for y. // Invariant: // o_i = x_i * bx_i = y_i * by_i if (x_i == y_i) { diff --git a/tensorflow/core/util/ctc/ctc_loss_calculator.h b/tensorflow/core/util/ctc/ctc_loss_calculator.h index be00895b0d3..dd1163310bf 100644 --- a/tensorflow/core/util/ctc/ctc_loss_calculator.h +++ b/tensorflow/core/util/ctc/ctc_loss_calculator.h @@ -130,13 +130,13 @@ Status CTCLossCalculator::CalculateLoss( for (int t = 1; t < num_time_steps; ++t) { if (inputs[t].rows() != batch_size) { return errors::InvalidArgument("Expected batch size at t: ", t, - " to be: ", batch_size, " but got: ", - inputs[t].rows()); + " to be: ", batch_size, + " but got: ", inputs[t].rows()); } if (inputs[t].cols() != num_classes) { return errors::InvalidArgument("Expected class count at t: ", t, - " to be: ", num_classes, " but got: ", - inputs[t].cols()); + " to be: ", num_classes, + " but got: ", inputs[t].cols()); } } @@ -282,8 +282,8 @@ Status CTCLossCalculator::PopulateLPrimes( LabelSequences* l_primes) const { // labels is a Label array of size batch_size if (labels.size() != batch_size) { - return errors::InvalidArgument("labels.size() != batch_size: ", - labels.size(), " vs. ", batch_size); + return errors::InvalidArgument( + "labels.size() != batch_size: ", labels.size(), " vs. ", batch_size); } *max_u_prime = 0; // keep track of longest l' modified label sequence. 
@@ -325,12 +325,13 @@ Status CTCLossCalculator::PopulateLPrimes( for (int l_i : l) { if (l_i < 0) { return errors::InvalidArgument( - "All labels must be nonnegative integers, batch: ", b, " labels: ", - str_util::Join(l, ",")); + "All labels must be nonnegative integers, batch: ", b, + " labels: ", str_util::Join(l, ",")); } else if (l_i >= num_classes) { return errors::InvalidArgument( - "No label may be greater than num_classes. ", "num_classes: ", - num_classes, ", batch: ", b, " labels: ", str_util::Join(l, ",")); + "No label may be greater than num_classes. ", + "num_classes: ", num_classes, ", batch: ", b, + " labels: ", str_util::Join(l, ",")); } } if (!ignore_longer_outputs_than_inputs) { diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h index f787687f662..525bef16a02 100644 --- a/tensorflow/core/util/cuda_device_functions.h +++ b/tensorflow/core/util/cuda_device_functions.h @@ -29,13 +29,8 @@ limitations under the License. #include #include #include "cuda/include/cuda.h" -#include "cuda/include/device_functions.h" #include "tensorflow/core/platform/types.h" -#if CUDA_VERSION >= 7050 -#include "cuda/include/cuda_fp16.h" -#endif // CUDA_VERSION >= 7050 - namespace tensorflow { namespace detail { diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc index bd4c356ea01..732ed33ede1 100644 --- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc +++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc @@ -149,27 +149,27 @@ class CudaLaunchConfigTest : public ::testing::Test { TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) { CudaLaunchConfig cfg; - // test valid inputs - #define TEST_LAUNCH_PARAMETER(work_element_count) \ - cfg = GetCudaLaunchConfig(bufsize, d); \ - SetOutbufZero<<>> \ - (cfg, outbuf); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetCudaLaunchConfig(work_element_count, d); \ - Count1D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0));\ - \ - cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - SetOutbufZero<<>> \ - (cfg, outbuf); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0); \ - Count1D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)) +// test valid inputs +#define TEST_LAUNCH_PARAMETER(work_element_count) \ + cfg = GetCudaLaunchConfig(bufsize, d); \ + SetOutbufZero<<>>( \ + cfg, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCudaLaunchConfig(work_element_count, d); \ + Count1D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \ + \ + cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + SetOutbufZero<<>>( \ + cfg, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0); \ + Count1D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)) TEST_LAUNCH_PARAMETER(128); TEST_LAUNCH_PARAMETER(129); @@ -181,7 +181,7 @@ TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) { TEST_LAUNCH_PARAMETER(8192); TEST_LAUNCH_PARAMETER(123456); TEST_LAUNCH_PARAMETER(1 << 30); - #undef TEST_LAUNCH_PARAMETER +#undef TEST_LAUNCH_PARAMETER } bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) { @@ -200,27 +200,27 @@ 
TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) { Cuda2DLaunchConfig cfg; CudaLaunchConfig cfg1d; - // test valid inputs - #define TEST_LAUNCH_PARAMETER(dimx, dimy) \ - cfg1d = GetCudaLaunchConfig(bufsize, d); \ - SetOutbufZero<<>> \ - (cfg1d, outbuf);\ - CUDA_ASSERT_SUCCESS \ - cfg = GetCuda2DLaunchConfig(dimx, dimy, d); \ - Count2D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \ - \ - cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - SetOutbufZero<<>> \ - (cfg1d, outbuf);\ - CUDA_ASSERT_SUCCESS \ - cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \ - Count2D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0)) +// test valid inputs +#define TEST_LAUNCH_PARAMETER(dimx, dimy) \ + cfg1d = GetCudaLaunchConfig(bufsize, d); \ + SetOutbufZero<<>>( \ + cfg1d, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCuda2DLaunchConfig(dimx, dimy, d); \ + Count2D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \ + \ + cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + SetOutbufZero<<>>( \ + cfg1d, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \ + Count2D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)) TEST_LAUNCH_PARAMETER(128, 128); TEST_LAUNCH_PARAMETER(129, 64); @@ -233,24 +233,24 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) { TEST_LAUNCH_PARAMETER(123456, 12); TEST_LAUNCH_PARAMETER(1, 1 << 30); TEST_LAUNCH_PARAMETER(1 << 30, 1); - #undef TEST_LAUNCH_PARAMETER +#undef TEST_LAUNCH_PARAMETER } TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) { Cuda3DLaunchConfig cfg; CudaLaunchConfig cfg1d; - // test valid inputs - #define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \ - cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - SetOutbufZero<<>> \ - (cfg1d, outbuf);\ - CUDA_ASSERT_SUCCESS \ - cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \ - Count3D<<>> ( \ - cfg, bufsize, outbuf); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx * dimy * dimz, std::accumulate(outbuf, outbuf + bufsize, 0)) +// test valid inputs +#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \ + cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + SetOutbufZero<<>>( \ + cfg1d, outbuf); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \ + Count3D<<>>( \ + cfg, bufsize, outbuf); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0)) TEST_LAUNCH_PARAMETER(128, 128, 128); TEST_LAUNCH_PARAMETER(129, 64, 1024); @@ -264,7 +264,7 @@ TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) { TEST_LAUNCH_PARAMETER(1, 1, 1 << 30); TEST_LAUNCH_PARAMETER(1, 1 << 30, 1); TEST_LAUNCH_PARAMETER(1 << 30, 1, 1); - #undef TEST_LAUNCH_PARAMETER +#undef TEST_LAUNCH_PARAMETER } TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) { diff --git a/tensorflow/core/util/example_proto_fast_parsing_test.cc b/tensorflow/core/util/example_proto_fast_parsing_test.cc index 9b6a8e12511..13e41c17f7c 100644 --- a/tensorflow/core/util/example_proto_fast_parsing_test.cc +++ b/tensorflow/core/util/example_proto_fast_parsing_test.cc @@ -57,6 +57,7 @@ void TestCorrectness(const string& serialized) { Example example; Example fast_example; 
EXPECT_TRUE(example.ParseFromString(serialized)); + example.DiscardUnknownFields(); EXPECT_TRUE(TestFastParse(serialized, &fast_example)); EXPECT_EQ(example.DebugString(), fast_example.DebugString()); if (example.DebugString() != fast_example.DebugString()) { diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc index 41f56d2daa4..e156a3bc8f0 100644 --- a/tensorflow/core/util/example_proto_helper.cc +++ b/tensorflow/core/util/example_proto_helper.cc @@ -247,8 +247,9 @@ Status SingleExampleProtoToTensors( bool types_match; TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match)); if (!types_match) { - return errors::InvalidArgument("Name: ", example_name, ", Feature: ", - key, ". Data types don't match. ", + return errors::InvalidArgument("Name: ", example_name, + ", Feature: ", key, + ". Data types don't match. ", "Expected type: ", DataTypeString(dtype), " Feature is: ", ProtoDebugString(f)); } @@ -278,8 +279,9 @@ Status SingleExampleProtoToTensors( bool types_match; TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match)); if (!types_match) { - return errors::InvalidArgument("Name: ", example_name, ", Feature: ", - key, ". Data types don't match. ", + return errors::InvalidArgument("Name: ", example_name, + ", Feature: ", key, + ". Data types don't match. ", "Expected type: ", DataTypeString(dtype), " Feature is: ", ProtoDebugString(f)); } diff --git a/tensorflow/core/util/memmapped_file_system_test.cc b/tensorflow/core/util/memmapped_file_system_test.cc index 616eb5dac32..504d2d353f8 100644 --- a/tensorflow/core/util/memmapped_file_system_test.cc +++ b/tensorflow/core/util/memmapped_file_system_test.cc @@ -144,8 +144,8 @@ TEST(MemmappedFileSystemTest, ProxyToDefault) { TF_ASSERT_OK(memmapped_env.NewAppendableFile(filename, &writable_file_temp)); // Making sure to clean up after the test finishes. const auto adh = [&memmapped_env, &filename](WritableFile* f) { - delete f; - TF_CHECK_OK(memmapped_env.DeleteFile(filename)); + delete f; + TF_CHECK_OK(memmapped_env.DeleteFile(filename)); }; std::unique_ptr writable_file( writable_file_temp.release(), adh); diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 2caf5fc56da..4467373c006 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -35,7 +35,7 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML #include "mkldnn.hpp" using mkldnn::engine; @@ -210,31 +210,32 @@ class MklShape { CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS); } -// The following methods are used for serializing and de-serializing the -// contents of the mklshape object. -// The data is serialized in this order -// isMklTensor_ -// dimension_ -// sizes_ -// strides_ -// mklLayout_ -// tfLayout_ -// tf_to_mkl_dim_map_ + // The following methods are used for serializing and de-serializing the + // contents of the mklshape object. 
+ // The data is serialized in this order + // isMklTensor_ + // dimension_ + // sizes_ + // strides_ + // mklLayout_ + // tfLayout_ + // tf_to_mkl_dim_map_ #define SIZE_OF_MKL_DNN_BUF \ (dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to // serialize dnn_layout pointer -// Size of buffer to hold the serialized object, the size is computed as follows -// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + sizeof(strides_) -// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) -// + sizeof(tf_to_mkl_dim_map_) + // Size of buffer to hold the serialized object, the size is computed as + // follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + + // sizeof(strides_) + // + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) + // + sizeof(tf_to_mkl_dim_map_) #define SIZE_OF_MKL_SERIAL_DATA(dims) \ (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF) -// First we need to define some macro for offsets into the serial buffer where -// different elements of Mklshape is written/read from + // First we need to define some macro for offsets into the serial buffer where + // different elements of Mklshape is written/read from #define IS_MKL_TENSOR_OFFSET 0 // Location from start of buffer where isMklTensor_ is serialized @@ -324,7 +325,7 @@ class MklShape { nullptr; // TF dimension corresponding to this MKL dimension }; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Forward decl TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format); @@ -388,7 +389,7 @@ class MklDnnShape { /// Equality function for MklDnnShape objects /// @return true if both are equal; false otherwise. - inline bool operator == (const MklDnnShape& input_shape) const { + inline bool operator==(const MklDnnShape& input_shape) const { if (this->IsMklTensor() != input_shape.IsMklTensor()) { return false; } @@ -406,7 +407,7 @@ class MklDnnShape { /// Equality operator for MklDnnShape and TFShape. 
/// Returns: true if TF shapes for both are the same, false otherwise - inline bool operator == (const TensorShape& input_shape) const { + inline bool operator==(const TensorShape& input_shape) const { if (!this->IsMklTensor()) { return false; } @@ -425,7 +426,7 @@ class MklDnnShape { inline size_t GetDimension(char dimension) const { int index = GetMklDnnTensorDimIndex(dimension); CHECK(index >= 0 && index < this->GetDimension()) - << "Invalid index from the dimension: " << index << ", " << dimension; + << "Invalid index from the dimension: " << index << ", " << dimension; return this->DimSize(index); } @@ -659,7 +660,7 @@ class MklDnnShape { typedef std::vector MklShapeList; -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML typedef std::vector MklDnnShapeList; #endif @@ -673,7 +674,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) { return true; } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML template inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, const MklShape& mkl_shape) { @@ -705,8 +706,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, Tensor output_tensor; TensorShape output_shape; - TF_CHECK_OK(Status(error::Code::UNIMPLEMENTED, - "Unimplemented conversion function")); + TF_CHECK_OK( + Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function")); return output_tensor; } @@ -724,7 +725,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) { sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) { mklshape->DeSerializeMklDnnShape( ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) @@ -749,7 +750,7 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name, } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, MklShapeList* mkl_shapes) { @@ -779,7 +780,7 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, #endif -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML /// Get shape of input tensor pointed by 'input_idx' in TensorShape format. /// If the input tensor is in MKL layout, then obtains TensorShape from /// MklShape. @@ -814,7 +815,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, second_tensor->flat().size() * sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Allocate the second output tensor that will contain // the MKL shape serialized inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, @@ -851,7 +852,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, second_tensor->flat().size() * sizeof(uint8)); } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML // Allocate the output tensor, create a second output tensor that will contain // the MKL shape serialized inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, @@ -875,7 +876,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, // Allocates a temp tensor and returns the data buffer for temporary storage. 
// Currently -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML template inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, const memory::primitive_desc& pd, void** buf_out) { @@ -973,8 +974,8 @@ inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) { return mkl_shape.dim_size(index); } -inline void CopyMklTensorInToOut(OpKernelContext* context, - int idx_in, int idx_out) { +inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in, + int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -994,9 +995,9 @@ inline void CopyMklTensorInToOut(OpKernelContext* context, context->set_output(idx_meta_out, meta_output); } -#ifndef INTEL_MKL_DNN -inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, - int idx_in, int idx_out, +#ifdef INTEL_MKL_ML +inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, + int idx_out, const TensorShape& shape) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); @@ -1013,8 +1014,8 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, context->set_output(idx_data_out, output); } #else -inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, - int idx_in, int idx_out, +inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, + int idx_out, const TensorShape& shape) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); @@ -1032,10 +1033,10 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, } #endif -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML -inline void ForwardTfTensorInToOut(OpKernelContext* context, - int idx_in, int idx_out) { +inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, + int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -1053,8 +1054,8 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context, #else -inline void ForwardTfTensorInToOut(OpKernelContext* context, - int idx_in, int idx_out) { +inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, + int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -1072,8 +1073,8 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context, #endif -inline void ForwardMklTensorInToOut(OpKernelContext* context, - int idx_in, int idx_out) { +inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in, + int idx_out) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -1090,10 +1091,10 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context, } } -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context, - int idx_in, int idx_out, - const MklDnnShape& mkl_shape) { + int idx_in, int idx_out, + const MklDnnShape& mkl_shape) { int num_inputs = context->num_inputs(); int num_outputs = context->num_outputs(); int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); @@ -1132,7 +1133,7 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context, AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output); } -#ifndef INTEL_MKL_DNN +#ifdef INTEL_MKL_ML // We don't need these functions in MKLDNN. 
We have defined equality operator // on MklDnnShape class directly. @@ -1216,11 +1217,11 @@ inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) { int64 H = input.dim_size(1); int64 W = input.dim_size(2); int64 C = input.dim_size(3); - int64 stride_n = H*W*C; -# pragma omp parallel for num_threads(16) + int64 stride_n = H * W * C; +#pragma omp parallel for num_threads(16) for (int64 n = 0; n < N; ++n) { - mkl_somatcopy('R', 'T', H*W, C, 1, buf_in + n*stride_n, C, - buf_out + n*stride_n, H*W); + mkl_somatcopy('R', 'T', H * W, C, 1, buf_in + n * stride_n, C, + buf_out + n * stride_n, H * W); } } @@ -1232,17 +1233,17 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) { int64 H = (*output)->dim_size(1); int64 W = (*output)->dim_size(2); int64 C = (*output)->dim_size(3); - int64 stride_n = H*W*C; -# pragma omp parallel for num_threads(16) + int64 stride_n = H * W * C; +#pragma omp parallel for num_threads(16) for (int64 n = 0; n < N; ++n) { - mkl_somatcopy('R', 'T', C, H*W, 1, buf_in + n*stride_n, H*W, - buf_out + n*stride_n, C); + mkl_somatcopy('R', 'T', C, H * W, 1, buf_in + n * stride_n, H * W, + buf_out + n * stride_n, C); } } // ------------------------------------------------------------------- -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML /// Return MKL-DNN data type (memory::data_type) for input type T /// @@ -1279,10 +1280,11 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) { /// @return: Tensorflow data format corresponding to memory::format /// Fails with an error if invalid data format. inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) { - if (format == memory::format::nhwc) return FORMAT_NHWC; - else if (format == memory::format::nchw) return FORMAT_NCHW; - TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, - "Unsupported data format")); + if (format == memory::format::nhwc) + return FORMAT_NHWC; + else if (format == memory::format::nchw) + return FORMAT_NCHW; + TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure // that we don't come here. @@ -1425,7 +1427,6 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, return memory::desc(md); } - /* * Class to represent all the resources corresponding to a tensor in TensorFlow * that are required to execute an operation (such as Convolution). @@ -1494,7 +1495,7 @@ class MklDnnData { /// @return: memory::desc object corresponding to blocked memory format /// for given dimensions and strides. static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim, - const memory::dims& strides) { + const memory::dims& strides) { return CreateBlockedMemDescHelper(dim, strides, MklDnnType()); } @@ -1563,7 +1564,6 @@ class MklDnnData { return user_memory_->get_primitive_desc(); } - /// Get function for descriptor of user memory. inline memory::desc GetUsrMemDesc() { // This is ugly. Why MKL-DNN does not provide desc() method of const type?? @@ -1634,7 +1634,8 @@ class MklDnnData { /// @return: true in case reorder of input is needed; false, otherwise. 
inline bool IsReorderNeeded(const memory::format& target_format) const { CHECK_NOTNULL(user_memory_); - return target_format != user_memory_->get_primitive_desc().desc().data.format; + return target_format != + user_memory_->get_primitive_desc().desc().data.format; } /// Function to create a reorder from memory pointed by from to memory pointed @@ -1753,7 +1754,7 @@ class MklDnnData { } }; -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc index 8b73eadb400..cd1d0713ad5 100644 --- a/tensorflow/core/util/mkl_util_test.cc +++ b/tensorflow/core/util/mkl_util_test.cc @@ -22,7 +22,7 @@ limitations under the License. namespace tensorflow { namespace { -#ifdef INTEL_MKL_DNN +#ifndef INTEL_MKL_ML TEST(MklUtilTest, MklDnnTfShape) { auto cpu_engine = engine(engine::cpu, 0); @@ -84,7 +84,7 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) { EXPECT_EQ(b_md2.data.format, mkldnn_blocked); } -#endif // INTEL_MKL_DNN +#endif // INTEL_MKL_ML } // namespace } // namespace tensorflow diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h index e7dab830f0e..f88ad2faaff 100644 --- a/tensorflow/core/util/presized_cuckoo_map.h +++ b/tensorflow/core/util/presized_cuckoo_map.h @@ -67,7 +67,7 @@ inline uint64 multiply_high_u64(uint64 x, uint64 y) { return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry; #endif } -} +} // namespace presized_cuckoo_map template class PresizedCuckooMap { diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc index 1cb07718fee..575c27d4ef7 100644 --- a/tensorflow/core/util/reporter_test.cc +++ b/tensorflow/core/util/reporter_test.cc @@ -29,8 +29,8 @@ namespace { // Tests of all the error paths in log_reader.cc follow: static void ExpectHasSubstr(StringPiece s, StringPiece expected) { - EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain " - << expected; + EXPECT_TRUE(StringPiece(s).contains(expected)) + << s << " does not contain " << expected; } TEST(TestReporter, NoLogging) { diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h index f2401a0af4e..258ee418c14 100644 --- a/tensorflow/core/util/sparse/sparse_tensor.h +++ b/tensorflow/core/util/sparse/sparse_tensor.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" @@ -31,7 +32,6 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/sparse/dim_comparator.h" #include "tensorflow/core/util/sparse/group_iterator.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace sparse { @@ -59,8 +59,8 @@ class SparseTensor { shape_(shape.begin(), shape.end()), order_(order.begin(), order.end()), dims_(GetDimsFromIx(ix)) { - CHECK_EQ(ix.dtype(), DT_INT64) << "indices must be type int64 but got: " - << ix.dtype(); + CHECK_EQ(ix.dtype(), DT_INT64) + << "indices must be type int64 but got: " << ix.dtype(); CHECK(TensorShapeUtils::IsVector(vals.shape())) << "vals must be a vec, but got: " << vals.shape().DebugString(); CHECK_EQ(ix.shape().dim_size(0), vals.shape().dim_size(0)) diff --git a/tensorflow/core/util/sparse/sparse_tensor_test.cc b/tensorflow/core/util/sparse/sparse_tensor_test.cc index efdd97fd3d6..85de0320857 100644 --- a/tensorflow/core/util/sparse/sparse_tensor_test.cc +++ b/tensorflow/core/util/sparse/sparse_tensor_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -25,7 +26,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace sparse { diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h index 6a5ddec04c9..f7767ace716 100644 --- a/tensorflow/core/util/stream_executor_util.h +++ b/tensorflow/core/util/stream_executor_util.h @@ -41,9 +41,10 @@ class StreamExecutorUtil { // This assumes that the error codes between the two implementations // match. static Status ConvertStatus(const perftools::gputools::port::Status& s) { - return s.ok() ? Status::OK() : Status(static_cast( - static_cast(s.code())), - s.error_message()); + return s.ok() ? 
Status::OK() + : Status(static_cast( + static_cast(s.code())), + s.error_message()); } }; diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc index 579b70ab514..462b420976e 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc @@ -913,8 +913,8 @@ Status BundleReader::LookupSlice(StringPiece full_tensor_key, Status BundleReader::GetSliceValue(StringPiece full_tensor_key, const BundleEntryProto& full_tensor_entry, const TensorSlice& slice_spec, Tensor* val) { - using checkpoint::TensorSliceSet; using checkpoint::RegisterTensorSlice; + using checkpoint::TensorSliceSet; DCHECK_GE(full_tensor_entry.slices_size(), 0); const TensorShape full_shape(TensorShape(full_tensor_entry.shape())); diff --git a/tensorflow/core/util/tensor_slice_reader_cache.cc b/tensorflow/core/util/tensor_slice_reader_cache.cc index 0f009d7de57..424f8098a9c 100644 --- a/tensorflow/core/util/tensor_slice_reader_cache.cc +++ b/tensorflow/core/util/tensor_slice_reader_cache.cc @@ -55,7 +55,7 @@ const TensorSliceReader* TensorSliceReaderCache::GetReader( TensorSliceReader::OpenTableFunction open_function, int preferred_shard) { mutex_lock l(mu_); -#if defined(__GXX_RTTI) || defined(_CPPRTTI) +#if defined(__GXX_RTTI) || defined(_CPPRTTI) // Get the function pointer from the open_function value. TensorSliceReaderCache::OpenFuncType* func_ptr = open_function.target(); diff --git a/tensorflow/core/util/tensor_slice_set.cc b/tensorflow/core/util/tensor_slice_set.cc index 4217df90ca1..7c1d325c0a5 100644 --- a/tensorflow/core/util/tensor_slice_set.cc +++ b/tensorflow/core/util/tensor_slice_set.cc @@ -188,9 +188,9 @@ Status RegisterTensorSlice( } if (type != tss->type()) { return errors::Internal("Incompatible tensor types detected for tensor ", - name, ": existing = ", - DataTypeString(tss->type()), ", new = ", - DataTypeString(type)); + name, + ": existing = ", DataTypeString(tss->type()), + ", new = ", DataTypeString(type)); } } // Register the tensor slices without the actual data. diff --git a/tensorflow/core/util/tensor_slice_util.h b/tensorflow/core/util/tensor_slice_util.h index c7edae66b26..8f5a6f1d935 100644 --- a/tensorflow/core/util/tensor_slice_util.h +++ b/tensorflow/core/util/tensor_slice_util.h @@ -139,9 +139,9 @@ static bool CopyDataFromTensorSliceToTensorSlice(const TensorShape& shape, const TensorSlice& slice_d, const SrcT* ptr_s, DstT* ptr_d) { - CHECK_LE(shape.dims(), kTensorSliceMaxRank) << "Only tensors of size up to " - << kTensorSliceMaxRank - << " are supported"; + CHECK_LE(shape.dims(), kTensorSliceMaxRank) + << "Only tensors of size up to " << kTensorSliceMaxRank + << " are supported"; // We need to compute the intersection of the two slices. 
TensorSlice inter; if (!slice_s.Intersect(slice_d, &inter)) { diff --git a/tensorflow/core/util/tensor_slice_writer.h b/tensorflow/core/util/tensor_slice_writer.h index bdb4921e1bb..2888c66d10f 100644 --- a/tensorflow/core/util/tensor_slice_writer.h +++ b/tensorflow/core/util/tensor_slice_writer.h @@ -101,8 +101,8 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape, // The tensor and the slice have to be compatible if (shape.dims() != slice.dims()) { return errors::Internal("Incompatible tensor shape and slice: ", "shape = ", - shape.DebugString(), ", slice = ", - slice.DebugString()); + shape.DebugString(), + ", slice = ", slice.DebugString()); } DataType dt = DataTypeToEnum::value; // We need to add an entry for "name" if there isn't an entry already. @@ -114,9 +114,9 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape, CHECK_EQ(name, ssm.name()) << ProtoShortDebugString(ssm); TensorShape ssm_shape(ssm.shape()); if (!shape.IsSameSize(ssm_shape)) { - return errors::Internal("Mismatching shapes: existing tensor = ", - ssm_shape.DebugString(), ", trying to add name ", - name, ", shape = ", shape.DebugString()); + return errors::Internal( + "Mismatching shapes: existing tensor = ", ssm_shape.DebugString(), + ", trying to add name ", name, ", shape = ", shape.DebugString()); } if (dt != ssm.type()) { return errors::Internal( diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md index c9f0c532c62..5593a3d95c4 100644 --- a/tensorflow/docs_src/about/bib.md +++ b/tensorflow/docs_src/about/bib.md @@ -60,7 +60,7 @@ author={ Lukasz~Kaiser and Manjunath~Kudlur and Josh~Levenberg and - Dan~Man\'{e} and + Dandelion~Man\'{e} and Rajat~Monga and Sherry~Moore and Derek~Murray and diff --git a/tensorflow/docs_src/api_guides/python/TPUEstimator.md b/tensorflow/docs_src/api_guides/python/TPUEstimator.md new file mode 100644 index 00000000000..d74d7f3181c --- /dev/null +++ b/tensorflow/docs_src/api_guides/python/TPUEstimator.md @@ -0,0 +1,396 @@ +# Using TPUs + +This document walks through the principal TensorFlow APIs necessary to make +effective use of a [Cloud TPU](https://cloud.google.com/tpu/), and highlights +the differences between regular TensorFlow usage, and usage on a TPU. + +This doc is aimed at users who: + +* Are familiar with TensorFlow's `Estimator` and `Dataset` APIs +* Have maybe [tried out a Cloud TPU](https://cloud.google.com/tpu/docs/quickstart) + using an existing model. +* Have, perhaps, skimmed the code of an example TPU model + [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py) + [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models). +* Are interested in porting an existing `Estimator` model to + run on Cloud TPUs + +## TPUEstimator + +@{tf.estimator.Estimator$Estimators} are TensorFlow's model-level abstraction. +Standard `Estimators` can drive models on CPU and GPUs. You must use +@{tf.contrib.tpu.TPUEstimator} to drive a model on TPUs. + +Refer to TensorFlow's Getting Started section for an introduction to the basics +of using a @{$get_started/premade_estimators$pre-made `Estimator`}, and +@{$get_started/custom_estimators$custom `Estimator`s}. + +The `TPUEstimator` class differs somewhat from the `Estimator` class. + +The simplest way to maintain a model that can be run both on CPU/GPU or on a +Cloud TPU is to define the model's inference phase (from inputs to predictions) +outside of the `model_fn`. 
Then maintain separate implementations of the +`Estimator` setup and `model_fn`, both wrapping this inference step. For an +example of this pattern, compare the `mnist.py` and `mnist_tpu.py` implementations in +[tensorflow/models](https://github.com/tensorflow/models/tree/master/official/mnist). + +### Running a `TPUEstimator` locally + +To create a standard `Estimator` you call the constructor, and pass it a +`model_fn`, for example: + +``` +my_estimator = tf.estimator.Estimator( + model_fn=my_model_fn) +``` + +The changes required to use a @{tf.contrib.tpu.TPUEstimator} on your local +machine are relatively minor. The constructor requires two additional arguments. +You should set the `use_tpu` argument to `False`, and pass a +@{tf.contrib.tpu.RunConfig} as the `config` argument, as shown below: + +``` python +my_tpu_estimator = tf.contrib.tpu.TPUEstimator( + model_fn=my_model_fn, + config=tf.contrib.tpu.RunConfig(), + use_tpu=False) +``` + +Just this simple change will allow you to run a `TPUEstimator` locally. +The majority of example TPU models can be run in this local mode, +by setting the command line flags as follows: + + +``` +$> python mnist_tpu.py --use_tpu=false --master='' +``` + +Note: This `use_tpu=False` argument is useful for trying out the `TPUEstimator` +API. It is not meant to be a complete TPU compatibility test. Successfully +running a model locally in a `TPUEstimator` does not guarantee that it will +work on a TPU. + + +### Building a `tpu.RunConfig` + +While the default `RunConfig` is sufficient for local training, these settings +cannot be ignored in real usage. + +A more typical setup for a `RunConfig`, one that can be switched to use a Cloud +TPU, might be as follows: + +``` python +import tempfile +import subprocess + +class FLAGS(object): + use_tpu=False + tpu_name=None + # Use a local temporary path for the `model_dir` + model_dir = tempfile.mkdtemp() + # Number of training steps to run on the Cloud TPU before returning control. + iterations = 50 + # A single Cloud TPU has 8 shards. + num_shards = 8 + +if FLAGS.use_tpu: + my_project_name = subprocess.check_output([ + 'gcloud','config','get-value','project']) + my_zone = subprocess.check_output([ + 'gcloud','config','get-value','compute/zone']) + cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + tpu_names=[FLAGS.tpu_name], + zone=my_zone, + project=my_project_name) + master = cluster_resolver.get_master() +else: + master = '' + +my_tpu_run_config = tf.contrib.tpu.RunConfig( + master=master, + evaluation_master=master, + model_dir=FLAGS.model_dir, + session_config=tf.ConfigProto( + allow_soft_placement=True, log_device_placement=True), + tpu_config=tf.contrib.tpu.TPUConfig(FLAGS.iterations, + FLAGS.num_shards), +) +``` + +Then you must pass the @{tf.contrib.tpu.RunConfig} to the constructor: + +``` python +my_tpu_estimator = tf.contrib.tpu.TPUEstimator( + model_fn=my_model_fn, + config = my_tpu_run_config, + use_tpu=FLAGS.use_tpu) +``` + +Typically the `FLAGS` would be set by command line arguments (see the sketch after this list). To switch from +training locally to training on a Cloud TPU you would need to: + + 1) Set `FLAGS.use_tpu` to `True` + 1) Set `FLAGS.tpu_name` so the + `tf.contrib.cluster_resolver.TPUClusterResolver` can find it + 1) Set `FLAGS.model_dir` to a Google Cloud Storage bucket URL (`gs://`). 
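For illustration only, here is a minimal sketch of defining the same fields as real command-line flags with `tf.flags`, so the switch above does not require editing code. The flag names mirror the `FLAGS` class used earlier; the defaults and the example invocation below are assumptions, not part of the original example.

``` python
# Editorial sketch: the FLAGS class above, expressed as command-line flags.
# Flag names mirror the earlier example; the defaults here are assumptions.
import tempfile
import tensorflow as tf

tf.flags.DEFINE_bool('use_tpu', False, 'Run on a Cloud TPU instead of CPU/GPU.')
tf.flags.DEFINE_string('tpu_name', None, 'Name of the Cloud TPU to connect to.')
tf.flags.DEFINE_string('model_dir', tempfile.mkdtemp(),
                       'Checkpoint directory (a gs:// bucket when using a TPU).')
tf.flags.DEFINE_integer('iterations', 50,
                        'Training steps to run on the Cloud TPU before returning control.')
tf.flags.DEFINE_integer('num_shards', 8, 'Number of TPU shards (cores).')

FLAGS = tf.flags.FLAGS
```

With flags defined this way, the same script could be launched locally with `--use_tpu=false`, or against a (hypothetical) TPU, for example: `python my_model.py --use_tpu=true --tpu_name=my-tpu --model_dir=gs://my-bucket/model`.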
+ + +## Optimizer + +When training on a cloud TPU you **must** wrap the optimizer in a +@{tf.contrib.tpu.CrossShardOptimizer}, which uses an `allreduce` to aggregate +gradients and broadcast the result to each shard (each TPU core). + +The `CrossShardOptimizer` is not compatible with local training. So, to have +the same code run both locally and on a Cloud TPU, add lines like the following: + +``` python +optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) +if FLAGS.use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) +``` + +If you prefer to avoid a global `FLAGS` variable in your model code, one +approach is to set the optimizer as one of the `Estimator`'s params, +as follows: + +``` python +my_tpu_estimator = tf.contrib.tpu.TPUEstimator( + model_fn=my_model_fn, + config = my_tpu_run_config, + use_tpu=FLAGS.use_tpu, + params={'optimizer':optimizer}) +``` + +## Model Function + +This section details the changes you must make to the model function +(`model_fn()`) to make it `TPUEstimator` compatible. + +### Static shapes + +During regular usage TensorFlow attempts to determine the shapes of each +`tf.Tensor` during graph construction. During execution any unknown shape +dimensions are determined dynamically, +see @{$programmers_guide/tensors#shape$Tensor Shapes} for more details. + +To run on Cloud TPUs TensorFlow models are compiled using @{$xla$XLA}. +XLA uses a similar system for determining shapes at compile time. XLA requires +that all tensor dimensions be statically defined at compile time. All shapes +must evaluate to a constant, and not depend on external data, or stateful +operations like variables or a random number generator. + + +### Summaries + +Remove any use of `tf.summary` from your model. + +@{$summaries_and_tensorboard$TensorBoard summaries} are a great way see inside +your model. A minimal set of basic summaries are automatically recorded by the +`TPUEstimator`, to `event` files in the `model_dir`. Custom summaries, however, +are currently unsupported when training on a Cloud TPU. So while the +`TPUEstimator` will still run locally with summaries, it will fail if used on a +TPU. + +### Metrics + +Build your evaluation metrics dictionary in a stand-alone `metric_fn`. + + + +Evaluation metrics are an essential part of training a model. These are fully +supported on Cloud TPUs, but with a slightly different syntax. + +A standard @{tf.metrics} returns two tensors. The first returns the running +average of the metric value, while the second updates the running average and +returns the value for this batch: + +``` +running_average, current_batch = tf.metrics.accuracy(labels, predictions) +``` + +In a standard `Estimator` you create a dictionary of these pairs, and return it +as part of the `EstimatorSpec`. + +```python +my_metrics = {'accuracy': tf.metrics.accuracy(labels, predictions)} + +return tf.estimator.EstimatorSpec( + ... + eval_metric_ops=my_metrics +) +``` + +In a `TPUEstimator` you instead pass a function (which returns a metrics +dictionary) and a list of argument tensors, as shown below: + +```python +def my_metric_fn(labels, predictions): + return {'accuracy': tf.metrics.accuracy(labels, predictions)} + +return tf.contrib.tpu.TPUEstimatorSpec( + ... + eval_metrics=(my_metric_fn, [labels, predictions]) +) +``` + +### Use `TPUEstimatorSpec` + +`TPUEstimatorSpec` do not support hooks, and require function wrappers for +some fields. + +An `Estimator`'s `model_fn` must return an `EstimatorSpec`. 
An `EstimatorSpec` is a simple structure of named fields containing all the `tf.Tensors` of the model that the `Estimator` may need to interact with.

`TPUEstimators` use a @{tf.contrib.tpu.TPUEstimatorSpec}. There are a few differences between it and a standard @{tf.estimator.EstimatorSpec}:

* The `eval_metric_ops` must be wrapped into a `metric_fn`; this field is renamed `eval_metrics` ([see above](#metrics)).
* The @{tf.train.SessionRunHook$hooks} are unsupported, so these fields are omitted.
* The @{tf.train.Scaffold$`scaffold`}, if used, must also be wrapped in a function. This field is renamed to `scaffold_fn`.

`Scaffold` and `Hooks` are for advanced usage, and can typically be omitted.

## Input functions

Input functions work mostly unchanged, since they run on the host computer rather than on the Cloud TPU itself. This section explains the two necessary adjustments.

### Params argument

The `input_fn` for a standard `Estimator` _can_ include a `params` argument; the `input_fn` for a `TPUEstimator` *must* include a `params` argument. This is necessary to allow the estimator to set the batch size for each replica of the input stream. So the minimum signature for an `input_fn` for a `TPUEstimator` is:

```
def my_input_fn(params):
  pass
```

Where `params['batch_size']` will contain the batch size.

### Static shapes and batch size

The input pipeline generated by your `input_fn` is run on CPU, so it is mostly free of the strict static shape requirements imposed by the XLA/TPU environment. The one requirement is that the batches of data fed from your input pipeline to the TPU have a static shape, as determined by the standard TensorFlow shape inference algorithm. Intermediate tensors are free to have dynamic shapes. If shape inference has failed but the shape is known, it is possible to impose the correct shape using `tf.Tensor.set_shape()`.

In the example below the shape inference algorithm fails, but it is corrected using `set_shape`:

```
>>> x = tf.zeros(tf.constant([1,2,3])+1)
>>> x.shape

TensorShape([Dimension(None), Dimension(None), Dimension(None)])

>>> x.set_shape([2,3,4])
```

In many cases the batch size is the only unknown dimension.

A typical input pipeline, using `tf.data`, will usually produce batches of a fixed size. The last batch of a finite `Dataset`, however, is typically smaller, containing just the remaining elements. Since a `Dataset` does not know its own length or finiteness, the standard @{tf.data.Dataset.batch$`batch`} method cannot determine on its own whether every batch will have a fixed size:

```
>>> params = {'batch_size':32}
>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
>>> ds = ds.repeat().batch(params['batch_size'])
>>> ds

<BatchDataset shapes: (?, 3), types: tf.int32>
```

The most straightforward fix is to @{tf.data.Dataset.apply$apply} @{tf.contrib.data.batch_and_drop_remainder} as follows:

```
>>> params = {'batch_size':32}
>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
>>> ds = ds.repeat().apply(
...     tf.contrib.data.batch_and_drop_remainder(params['batch_size']))
>>> ds

<_RestructuredDataset shapes: (32, 3), types: tf.int32>
```

The one downside to this approach is that, as the name implies, this batching method throws out any fractional batch at the end of the dataset. This is fine for an infinitely repeating dataset being used for training, but could be a problem if you want to train for an exact number of epochs.
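Putting the two adjustments together, the following is a minimal sketch of what a complete `input_fn` for a `TPUEstimator` might look like. The synthetic feature and label tensors are placeholders, not taken from any published example.

```python
# A minimal input_fn sketch (synthetic data): it accepts `params` and produces
# batches with a fully static shape by dropping the final partial batch.
import tensorflow as tf

def my_input_fn(params):
  batch_size = params['batch_size']  # Filled in by TPUEstimator per replica.
  features = {'pixels': tf.zeros([1000, 64], dtype=tf.float32)}
  labels = tf.zeros([1000], dtype=tf.int32)
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  dataset = dataset.shuffle(1000).repeat()
  # `batch()` alone would leave the batch dimension unknown; dropping the
  # remainder guarantees every batch has exactly `batch_size` elements.
  dataset = dataset.apply(
      tf.contrib.data.batch_and_drop_remainder(batch_size))
  return dataset
```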
+ +To do an exact 1-epoch of _evaluation_ you can work around this by manually +padding the length of the batches, and setting the padding entries to have zero +weight when creating your `tf.metrics`. + +## Datasets + +Efficient use of the `tf.data.Dataset` API is critical when using a Cloud +TPU, as it is impossible to use the Cloud TPU's unless you can feed it data +quickly enough. See @{$datasets_performance} for details on dataset performance. + +For all but the simplest experimentation (using +@{tf.data.Dataset.from_tensor_slices} or other in-graph data) you will need to +store all data files read by the `TPUEstimator`'s `Dataset` in Google Cloud +Storage Buckets. + + + +For most use-cases, we recommend converting your data into `TFRecord` +format and using a @{tf.data.TFRecordDataset} to read it. This, however, is not +a hard requirement and you can use other dataset readers +(`FixedLengthRecordDataset` or `TextLineDataset`) if you prefer. + +Small datasets can be loaded entirely into memory using +@{tf.data.Dataset.cache}. + +Regardless of the data format used, it is strongly recommended that you +@{$performance_guide#use_large_files$use large files}, on the order of +100MB. This is especially important in this networked setting as the overhead +of opening a file is significantly higher. + +It is also important, regardless of the type of reader used, to enable buffering +using the `buffer_size` argument to the constructor. This argument is specified +in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so +that data is available when needed. + +The TPU-demos repo includes +[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py) +for downloading the imagenet dataset and converting it to an appropriate format. +This together with the imagenet +[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models) +included in the repo demonstrate all of these best-practices. + + +## What Next + +For details on how to actually set up and run a Cloud TPU see: + + * [Google Cloud TPU Documentation](https://cloud.google.com/tpu/docs/) + +This document is by no means exhaustive. The best source of more detail on how +to make a Cloud TPU compatible model are the example models published in: + + * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/) + +For more information about tuning TensorFlow code for performance see: + + * The @{$performance$Performance Section.} + diff --git a/tensorflow/docs_src/api_guides/python/regression_examples.md b/tensorflow/docs_src/api_guides/python/regression_examples.md index 45cb9d829cf..7de2be05521 100644 --- a/tensorflow/docs_src/api_guides/python/regression_examples.md +++ b/tensorflow/docs_src/api_guides/python/regression_examples.md @@ -38,7 +38,7 @@ The preceding examples rely on the following data set utility: Utility Description - imports85.py + imports85.py This program provides utility functions that load the imports85 data set into formats that other TensorFlow programs (for example, linear_regression.py and @@ -229,4 +229,4 @@ passed through to the `model_fn` when the `model_fn` is called. The `model_fn` returns an @{tf.estimator.EstimatorSpec$`EstimatorSpec`} which is a simple structure indicating to the `Estimator` which operations should be run to accomplish -varions tasks. +various tasks. 
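To make that last point concrete, here is a schematic `model_fn`, not taken from the examples above, that returns a different `EstimatorSpec` depending on the mode; the layer sizes and loss are placeholders.

```python
# A schematic model_fn (illustrative only) showing one EstimatorSpec returned
# per mode: predictions for PREDICT, a loss for EVAL, and a train_op for TRAIN.
import tensorflow as tf

def my_model_fn(features, labels, mode, params):
  net = tf.layers.dense(features['x'], units=1)
  predictions = {'predicted_value': tf.squeeze(net, axis=-1)}
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
  loss = tf.losses.mean_squared_error(labels, predictions['predicted_value'])
  if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(mode, loss=loss)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
```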
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md index a3abf255075..9f6fe91b149 100644 --- a/tensorflow/docs_src/community/welcome.md +++ b/tensorflow/docs_src/community/welcome.md @@ -12,7 +12,6 @@ The source code for TensorFlow is on Before contributing to TensorFlow source code, please review the [Contribution guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md). - ### Projects developed by the TensorFlow community The TensorFlow community has created many great projects around TensorFlow, including: @@ -65,5 +64,6 @@ please read the following list carefully: [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues) on GitHub. For example, use the issue tracker to request a new operation in TensorFlow. - + * To report vulnerabilities, please follow our + [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/SECURITY.md). diff --git a/tensorflow/docs_src/get_started/checkpoints.md b/tensorflow/docs_src/get_started/checkpoints.md index 680e1c0d3f5..dfa2110e691 100644 --- a/tensorflow/docs_src/get_started/checkpoints.md +++ b/tensorflow/docs_src/get_started/checkpoints.md @@ -16,7 +16,7 @@ This document focuses on checkpoints. For details on SavedModel, see the ## Sample code This document relies on the same -[https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py](Iris classification example) detailed in @{$premade_estimators$Getting Started with TensorFlow}. +[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in @{$premade_estimators$Getting Started with TensorFlow}. To download and access the example, invoke the following two commands: ```shell diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md index 6343cc4ee45..42a246678a0 100644 --- a/tensorflow/docs_src/get_started/custom_estimators.md +++ b/tensorflow/docs_src/get_started/custom_estimators.md @@ -15,7 +15,7 @@ git clone https://github.com/tensorflow/models/ cd models/samples/core/get_started ``` -In this document we wil be looking at +In this document we will be looking at [`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py). You can run it with the following command: @@ -161,7 +161,7 @@ classifier = tf.estimator.Estimator( To implement a typical model function, you must do the following: -* (Define the model)[#define_the_model]. +* [Define the model](#define_the_model). * Specify additional calculations for each of the [three different modes](#modes): * [Predict](#predict) diff --git a/tensorflow/docs_src/get_started/datasets_quickstart.md b/tensorflow/docs_src/get_started/datasets_quickstart.md index ecfbf160f0d..a8a2ab6e561 100644 --- a/tensorflow/docs_src/get_started/datasets_quickstart.md +++ b/tensorflow/docs_src/get_started/datasets_quickstart.md @@ -169,7 +169,7 @@ the number of examples in the `Dataset` ensures that the data is completely shuffled. The Iris data set only contains 150 examples. The @{tf.data.Dataset.repeat$`repeat`} method has the `Dataset` restart when -it reaches the end. To limit the number of epochss, set the `count` argument. +it reaches the end. To limit the number of epochs, set the `count` argument. 
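As a small illustration (not part of the sample code), ten complete passes over the 150-example Iris data could be requested like this:

```python
# Illustrative only: restart the dataset for exactly 10 epochs rather than
# repeating forever. The shuffle buffer and batch size are arbitrary choices.
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(tf.zeros([150, 4]))
dataset = dataset.shuffle(buffer_size=150).repeat(count=10).batch(32)
```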
The @{tf.data.Dataset.repeat$`batch`} method collects a number of examples and stacks them, to create batches. This adds a dimension to their shape. The new @@ -282,7 +282,7 @@ produce the necessary `(features, label)` pairs. We will start by building a function to parse a single line. -The following `iris_data.parse_line` function acomplishes this taks using the +The following `iris_data.parse_line` function accomplishes this task using the @{tf.decode_csv} function, and some simple python code: We must parse each of the lines in the dataset in order to generate the diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md index e3308ed716d..ad3e1fe3e3a 100644 --- a/tensorflow/docs_src/get_started/feature_columns.md +++ b/tensorflow/docs_src/get_started/feature_columns.md @@ -461,8 +461,8 @@ permitting a richer palette of numbers for every cell, an embedding column contains far fewer cells than an indicator column. Let's look at an example comparing indicator and embedding columns. Suppose our -input examples consists of different words from a limited palette of only 81 -words. Further suppose that the data set provides provides the following input +input examples consist of different words from a limited palette of only 81 +words. Further suppose that the data set provides the following input words in 4 separate examples: * `"dog"` diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md index ea1c2fb3f47..390cc81eef7 100644 --- a/tensorflow/docs_src/get_started/get_started_for_beginners.md +++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md @@ -700,7 +700,7 @@ for pred_dict, expec in zip(predictions, expected): class_id = pred_dict['class_ids'][0] probability = pred_dict['probabilities'][class_id] - print(template.format(SPECIES[class_id], 100 * probability, expec)) + print(template.format(iris_data.SPECIES[class_id], 100 * probability, expec)) ``` Running the program yields the following output: diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md index dbc35065abf..4f01f997c33 100644 --- a/tensorflow/docs_src/get_started/premade_estimators.md +++ b/tensorflow/docs_src/get_started/premade_estimators.md @@ -2,37 +2,39 @@ # Getting Started with TensorFlow This document introduces the TensorFlow programming environment and shows you -how to write the Iris classification problem in TensorFlow. +how to solve the Iris classification problem in TensorFlow. -Prior to reading this document, do the following: +## Prerequisites + +Prior to using the sample code in this document, you'll need to do the +following: * @{$install$Install TensorFlow}. * If you installed TensorFlow with virtualenv or Anaconda, activate your TensorFlow environment. -* To keep the data import simple, our Iris example uses Pandas. You can - install Pandas with: +* Install or upgrade pandas by issuing the following command: - `pip install pandas` + pip install pandas ## Getting the sample code -Take the following steps to get the sample code for this program: +Take the following steps to get the sample code we'll be going through: -1. Clone the TensorFlow Models repository from github by entering the following +1. Clone the TensorFlow Models repository from GitHub by entering the following command: - `git clone https://github.com/tensorflow/models` + git clone https://github.com/tensorflow/models 1. 
Change directory within that branch to the location containing the examples used in this document: - `cd models/samples/core/get_started/` + cd models/samples/core/get_started/ The program described in this document is [`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py). This program uses [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py) -To fetch its training data. +to fetch its training data. ### Running the program @@ -45,7 +47,7 @@ python premade_estimator.py The program should output training logs followed by some predictions against the test set. For example, the first line in the following output shows that the model thinks there is a 99.6% chance that the first example in the test -set is a Setosa. Since the test set `expected "Setosa"`, this appears to be +set is a Setosa. Since the test set expected Setosa, this appears to be a good prediction. ``` None @@ -61,9 +63,9 @@ If the program generates errors instead of answers, ask yourself the following questions: * Did you install TensorFlow properly? -* Are you using the correct version of tensorflow? +* Are you using the correct version of TensorFlow? * Did you activate the environment you installed TensorFlow in? (This is - only relevant in certain installation environments.) + only relevant in certain installation mechanisms.) ## The programming stack @@ -74,18 +76,15 @@ provides a programming stack consisting of multiple API layers:
-
-The TensorFlow Programming Environment -
We strongly recommend writing TensorFlow programs with the following APIs: -* @{tf.estimator$Estimators}, which represent a complete model. +* @{$programmers_guide/estimators$Estimators}, which represent a complete model. The Estimator API provides methods to train the model, to judge the model's accuracy, and to generate predictions. * @{$get_started/datasets_quickstart$Datasets}, which build a data input pipeline. The Dataset API has methods to load and manipulate data, and feed - it into your model. The Datasets API meshes well with the Estimators API. + it into your model. The Dataset API meshes well with the Estimators API. ## Classifying irises: an overview @@ -120,7 +119,7 @@ individual Iris flowers: * petal length * petal width -Our model will represent these features as float32 numerical data. +Our model will represent these features as `float32` numerical data. The label identifies the Iris species, which must be one of the following: @@ -154,9 +153,6 @@ The following figure illustrates the features, hidden layers, and predictions alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs" src="../images/custom_estimators/full_network.png">
-
-The Model. -
### Inference @@ -174,12 +170,12 @@ example is an Iris Versicolor. ## Overview of programming with Estimators -An Estimator is TensorFlow's high level representation of a complete model. It +An Estimator is TensorFlow's high-level representation of a complete model. It handles the details of initialization, logging, saving and restoring, and many other features so you can concentrate on your model. For more details see @{$programmers_guide/estimators}. -An "Estimator" is any class derived from @{tf.estimator.Estimator}. TensorFlow +An Estimator is any class derived from @{tf.estimator.Estimator}. TensorFlow provides a collection of [pre-made Estimators](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator) (for example, `LinearRegressor`) to implement common ML algorithms. Beyond @@ -199,7 +195,7 @@ following tasks: * Call one or more methods on the Estimator object, passing the appropriate input function as the source of the data. -Let's see how those tasks are implemented in Iris. +Let's see how those tasks are implemented for Iris classification. ## Create input functions @@ -209,17 +205,30 @@ evaluating, and prediction. An **input function** is a function that returns a @{tf.data.Dataset} object which outputs the following two-element tuple: -* "features" - A Python dictionary in which: +* [`features`](https://developers.google.com/machine-learning/glossary/#feature) - A Python dictionary in which: * Each key is the name of a feature. * Each value is an array containing all of that feature's values. -* "label" - An array containing the values of the +* `label` - An array containing the values of the [label](https://developers.google.com/machine-learning/glossary/#label) for every example. -Your input function may generate the "features" dictionary and "label" list any -way you like. However, we recommend using TensorFlow's @{tf.data.Dataset} API, -which can deftly parse all sorts of data. At a high-level, -the @{tf.data.Dataset} API consists of the following classes: +Just to demonstrate the format of the input function, here's a simple +implementation: + +```python +def input_evaluation_set(): + features = {'SepalLength': np.array([6.4, 5.0]), + 'SepalWidth': np.array([2.8, 2.3]), + 'PetalLength': np.array([5.6, 3.3]), + 'PetalWidth': np.array([2.2, 1.0])} + labels = np.array([2, 1]) + return features, labels +``` + +Your input function may generate the `features` dictionary and `label` list any +way you like. However, we recommend using TensorFlow's Dataset API, which can +parse all sorts of data. At a high level, the Dataset API consists of the +following classes:
+Where the individual members are: -Where: - -* Dataset: Base class containing methods to create and transform datasets. Also - allows you to initialize a dataset from data in memory, or from a Python - generator. -* TextLineDataset: Reads lines from text files. -* TFRecordDataset: Reads records from TFRecord files. -* FixedLengthRecordDataset: Reads fixed size records from binary files. -* Iterator: Provides a way to access one data set element at a time. +* `Dataset` - Base class containing methods to create and transform + datasets. Also allows you to initialize a dataset from data in memory, or from + a Python generator. +* `TextLineDataset` - Reads lines from text files. +* `TFRecordDataset` - Reads records from TFRecord files. +* `FixedLengthRecordDataset` - Reads fixed size records from binary files. +* `Iterator` - Provides a way to access one data set element at a time. The Dataset API can handle a lot of common cases for you. For example, using the Dataset API, you can easily read in records from a large collection of files in parallel and join them into a single stream. -To keep things simple in this example we are going to load the data with pandas, -and build our input pipeline from this in-memory data. +To keep things simple in this example we are going to load the data with +[pandas](https://pandas.pydata.org/), and build our input pipeline from this +in-memory data. Here is the input function used for training in this program, which is available in [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py): @@ -258,9 +267,9 @@ def train_input_fn(features, labels, batch_size): return dataset.shuffle(1000).repeat().batch(batch_size) ``` -## Define the Feature Columns +## Define the feature columns -A [**Feature Column**](https://developers.google.com/machine-learning/glossary/#feature_columns) +A [**feature column**](https://developers.google.com/machine-learning/glossary/#feature_columns) is an object describing how the model should use raw input data from the features dictionary. When you build an Estimator model, you pass it a list of feature columns that describes each of the features you want the model to use. @@ -270,7 +279,7 @@ to the model. For Iris, the 4 raw features are numeric values, so we'll build a list of feature columns to tell the Estimator model to represent each of the four features as 32-bit floating-point values. Therefore, the code to create the -Feature Column is simply: +feature column is: ```python # Feature columns describe how to use the input. @@ -279,29 +288,29 @@ for key in train_x.keys(): my_feature_columns.append(tf.feature_column.numeric_column(key=key)) ``` -Feature Columns can be far more sophisticated than those we're showing here. -We detail feature columns @{$get_started/feature_columns$later on} in -getting started. +Feature columns can be far more sophisticated than those we're showing here. We +detail feature columns @{$get_started/feature_columns$later on} in our Getting +Started guide. Now that we have the description of how we want the model to represent the raw features, we can build the estimator. -## Instantiate an Estimator +## Instantiate an estimator The Iris problem is a classic classification problem. Fortunately, TensorFlow provides several pre-made classifier Estimators, including: -* @{tf.estimator.DNNClassifier}—for deep models that perform multi-class +* @{tf.estimator.DNNClassifier} for deep models that perform multi-class classification. 
-* @{tf.estimator.DNNLinearCombinedClassifier}—for wide-n-deep models. -* @{tf.estimator.LinearClassifier}— for classifiers based on linear models. +* @{tf.estimator.DNNLinearCombinedClassifier} for wide & deep models. +* @{tf.estimator.LinearClassifier} for classifiers based on linear models. For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice. Here's how we instantiated this Estimator: ```python -# Build 2 hidden layer DNN with 10, 10 units respectively. +# Build a DNN with 2 hidden layers and 10 nodes in each hidden layer. classifier = tf.estimator.DNNClassifier( feature_columns=my_feature_columns, # Two hidden layers of 10 nodes each. @@ -363,7 +372,7 @@ Test set accuracy: 0.967 We now have a trained model that produces good evaluation results. We can now use the trained model to predict the species of an Iris flower -based on some unlabeled measurments. As with training and evaluation, we make +based on some unlabeled measurements. As with training and evaluation, we make predictions using a single function call: ```python diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md index ba1a4118aec..14add7c77e6 100644 --- a/tensorflow/docs_src/install/install_c.md +++ b/tensorflow/docs_src/install/install_c.md @@ -38,7 +38,7 @@ enable TensorFlow for C: OS="linux" # Change to "darwin" for macOS TARGET_DIRECTORY="/usr/local" curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md index 87cc647317a..d2af9d98434 100644 --- a/tensorflow/docs_src/install/install_go.md +++ b/tensorflow/docs_src/install/install_go.md @@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go: TF_TYPE="cpu" # Change to "gpu" for GPU support TARGET_DIRECTORY='/usr/local' curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0-rc1.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.5.0.tar.gz" | sudo tar -C $TARGET_DIRECTORY -xz The `tar` command extracts the TensorFlow C library into the `lib` diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 37e109a6e4b..e5388c4b1e0 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs: org.tensorflow tensorflow - 1.5.0-rc1 + 1.5.0 ``` @@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow: org.tensorflow tensorflow - 1.5.0-rc1 + 1.5.0 @@ -123,12 +123,12 @@ instead: org.tensorflow libtensorflow - 1.5.0-rc1 + 1.5.0 org.tensorflow libtensorflow_jni_gpu - 1.5.0-rc1 + 1.5.0 ``` @@ -147,7 +147,7 @@ refer to the simpler instructions above instead. Take the following steps to install TensorFlow for Java on Linux or macOS: 1. 
Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar), which is the TensorFlow Java Archive (JAR). 2. Decide whether you will run TensorFlow for Java on CPU(s) only or with @@ -166,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: OS=$(uname -s | tr '[:upper:]' '[:lower:]') mkdir -p ./jni curl -L \ - "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0-rc1.tar.gz" | + "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.5.0.tar.gz" | tar -xz -C ./jni ### Install on Windows @@ -174,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS: Take the following steps to install TensorFlow for Java on Windows: 1. Download - [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0-rc1.jar), + [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.5.0.jar), which is the TensorFlow Java Archive (JAR). 2. Download the following Java Native Interface (JNI) file appropriate for - [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0-rc1.zip). + [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.5.0.zip). 3. Extract this .zip file. @@ -225,7 +225,7 @@ must be part of your `classpath`. For example, you can include the downloaded `.jar` in your `classpath` by using the `-cp` compilation flag as follows: -
javac -cp libtensorflow-1.5.0-rc1.jar HelloTF.java
+
javac -cp libtensorflow-1.5.0.jar HelloTF.java
### Running @@ -239,11 +239,11 @@ two files are available to the JVM: For example, the following command line executes the `HelloTF` program on Linux and macOS X: -
java -cp libtensorflow-1.5.0-rc1.jar:. -Djava.library.path=./jni HelloTF
+
java -cp libtensorflow-1.5.0.jar:. -Djava.library.path=./jni HelloTF
And the following command line executes the `HelloTF` program on Windows: -
java -cp libtensorflow-1.5.0-rc1.jar;. -Djava.library.path=jni HelloTF
+
java -cp libtensorflow-1.5.0.jar;. -Djava.library.path=jni HelloTF
If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 03f12dff08c..cd8c14599fb 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -31,13 +31,13 @@ If you are installing TensorFlow with GPU support using one of the mechanisms described in this guide, then the following NVIDIA software must be installed on your system: - * CUDA® Toolkit 8.0. For details, see + * CUDA® Toolkit 9.0. For details, see [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A). Ensure that you append the relevant Cuda pathnames to the `LD_LIBRARY_PATH` environment variable as described in the NVIDIA documentation. - * The NVIDIA drivers associated with CUDA Toolkit 8.0. - * cuDNN v6.0. For details, see + * The NVIDIA drivers associated with CUDA Toolkit 9.0. + * cuDNN v7.0. For details, see [NVIDIA's documentation](https://developer.nvidia.com/cudnn). Ensure that you create the `CUDA_HOME` environment variable as described in the NVIDIA documentation. @@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv: Virtualenv environment:
(tensorflow)$ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl If you encounter installation problems, see [Common Installation Problems](#common_installation_problems). @@ -293,7 +293,7 @@ take the following steps:
      $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
      
If this step fails, see @@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      (tensorflow)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl @@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations. CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp27-none-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp27-none-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp34-cp34m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp34-cp34m-linux_x86_64.whl
 
Note that GPU support requires the NVIDIA hardware and software described in @@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp35-cp35m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in CPU only:
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl
 
GPU support:
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp36-cp36m-linux_x86_64.whl
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md index 555a6837d8b..f49d3a2f085 100644 --- a/tensorflow/docs_src/install/install_mac.md +++ b/tensorflow/docs_src/install/install_mac.md @@ -115,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv: TensorFlow in the active Virtualenv is as follows:
 $ pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl If you encounter installation problems, see [Common Installation Problems](#common-installation-problems). @@ -238,7 +238,7 @@ take the following steps: issue the following command:
 $ sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl 
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl If the preceding command fails, see [installation problems](#common-installation-problems). @@ -347,7 +347,7 @@ Take the following steps to install TensorFlow in an Anaconda environment: TensorFlow for Python 2.7:
 (targetDirectory)$ pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl @@ -520,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py2-none-any.whl
 
@@ -528,5 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py2-none-a
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.5.0-py3-none-any.whl
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md index f494cc7a7c0..ccf62a169f3 100644 --- a/tensorflow/docs_src/install/install_sources.md +++ b/tensorflow/docs_src/install/install_sources.md @@ -133,7 +133,7 @@ The following NVIDIA hardware must be installed on your system: The following NVIDIA software must be installed on your system: - * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 8.0. + * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 9.0. For details, see [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A). Ensure that you append the relevant Cuda pathnames to the @@ -221,7 +221,7 @@ problem, do either of the following: * Download Xcode 7.2 and select it as your default by issuing the following command: -
 $ sudo xcode-select -s /Application/Xcode-7.2/Xcode.app
+
 $ sudo xcode-select -s /Applications/Xcode-7.2/Xcode.app
**NOTE:** Your system must fulfill the NVIDIA software requirements described in one of the following documents: @@ -272,8 +272,6 @@ Found possible Python library paths: Please input the desired Python library path to use. Default is [/usr/lib/python2.7/dist-packages] Using python library path: /usr/local/lib/python2.7/dist-packages -Do you wish to build TensorFlow with MKL support? [y/N] -No MKL support will be enabled for TensorFlow Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]: Do you wish to use jemalloc as the malloc implementation? [Y/n] jemalloc enabled @@ -291,11 +289,11 @@ Do you wish to build TensorFlow with CUDA support? [y/N] Y CUDA support will be enabled for TensorFlow Do you want to use clang as CUDA compiler? [y/N] nvcc will be used as CUDA compiler -Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: 8.0 -Please specify the location where CUDA 8.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]: +Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: 9.0 +Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]: Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]: -Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: 6 -Please specify the location where cuDNN 6 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]: +Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: 7 +Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]: Please specify a list of comma-separated Cuda compute capabilities you want to build with. You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus. Please note that each additional compute capability significantly increases your build time and binary size. @@ -361,10 +359,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl` file depends on your platform. For example, the following command will install the pip package -for TensorFlow 1.5.0rc1 on Linux: +for TensorFlow 1.5.0 on Linux:
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0rc1-py2-none-any.whl
+$ sudo pip install /tmp/tensorflow_pkg/tensorflow-1.5.0-py2-none-any.whl
 
## Validate your installation @@ -463,8 +461,8 @@ Stack Overflow and specify the `tensorflow` tag. - - + + @@ -480,7 +478,7 @@ Stack Overflow and specify the `tensorflow` tag. **Mac**
Version:CPU/GPU:Python Version:Compiler:Build Tools:cuDNN:CUDA:
tensorflow-1.5.0-rc1CPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.0N/AN/A
tensorflow_gpu-1.5.0-rc1GPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.079
tensorflow-1.5.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.0N/AN/A
tensorflow_gpu-1.5.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.8.079
tensorflow-1.4.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.5.4N/AN/A
tensorflow_gpu-1.4.0GPU2.7, 3.3-3.6GCC 4.8Bazel 0.5.468
tensorflow-1.3.0CPU2.7, 3.3-3.6GCC 4.8Bazel 0.4.5N/AN/A
- + @@ -493,8 +491,8 @@ Stack Overflow and specify the `tensorflow` tag. **Windows**
Version:CPU/GPU:Python Version:Compiler:Build Tools:cuDNN:CUDA:
tensorflow-1.5.0-rc1CPU2.7, 3.3-3.6Clang from xcodeBazel 0.8.1N/AN/A
tensorflow-1.5.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.8.1N/AN/A
tensorflow-1.4.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.5.4N/AN/A
tensorflow-1.3.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.5N/AN/A
tensorflow-1.2.0CPU2.7, 3.3-3.6Clang from xcodeBazel 0.4.5N/AN/A
- - + + diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md index 8d0eb7966fd..86a111c2ec1 100644 --- a/tensorflow/docs_src/install/install_windows.md +++ b/tensorflow/docs_src/install/install_windows.md @@ -30,13 +30,13 @@ If you are installing TensorFlow with GPU support using one of the mechanisms described in this guide, then the following NVIDIA software must be installed on your system: - * CUDA® Toolkit 8.0. For details, see + * CUDA® Toolkit 9.0. For details, see [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/) Ensure that you append the relevant Cuda pathnames to the `%PATH%` environment variable as described in the NVIDIA documentation. - * The NVIDIA drivers associated with CUDA Toolkit 8.0. - * cuDNN v6.0. For details, see + * The NVIDIA drivers associated with CUDA Toolkit 9.0. + * cuDNN v7.0. For details, see [NVIDIA's documentation](https://developer.nvidia.com/cudnn). Note that cuDNN is typically installed in a different location from the other CUDA DLLs. Ensure that you add the directory where you installed diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md index 10e7ad7ada5..cd47fc2803b 100644 --- a/tensorflow/docs_src/performance/performance_guide.md +++ b/tensorflow/docs_src/performance/performance_guide.md @@ -498,7 +498,7 @@ For TensorFlow source versions after 1.3.0: ```bash ./configure # Pick the desired options -bazel build --config=mkl -c opt //tensorflow/tools/pip_package:build_pip_package +bazel build --config=mkl --config=opt //tensorflow/tools/pip_package:build_pip_package ``` diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md index 9eaee270282..c1a90dee0a6 100644 --- a/tensorflow/docs_src/programmers_guide/debugger.md +++ b/tensorflow/docs_src/programmers_guide/debugger.md @@ -214,7 +214,7 @@ navigate between these screens by clicking the `<--` and ### Other Features of the tfdbg CLI In addition to the commands listed above, the tfdbg CLI provides the following -addditional features: +additional features: * To navigate through previous tfdbg commands, type in a few characters followed by the Up or Down arrow keys. tfdbg will show you the history of diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md index 2b4896c3810..9049a5a9f3d 100644 --- a/tensorflow/docs_src/programmers_guide/graphs.md +++ b/tensorflow/docs_src/programmers_guide/graphs.md @@ -125,14 +125,14 @@ an operation: @{tf.Tensor} accepts an optional `name` argument. For example, `tf.constant(42.0, name="answer")` creates a new @{tf.Operation} named `"answer"` and returns a @{tf.Tensor} named `"answer:0"`. If the default graph - already contained an operation named `"answer"`, the TensorFlow would append + already contains an operation named `"answer"`, then TensorFlow would append `"_1"`, `"_2"`, and so on to the name, in order to make it unique. * The @{tf.name_scope} function makes it possible to add a **name scope** prefix to all operations created in a particular context. The current name scope prefix is a `"/"`-delimited list of the names of all active @{tf.name_scope} context managers. If a name scope has already been used in the current - context, TensorFlow appens `"_1"`, `"_2"`, and so on. For example: + context, TensorFlow appends `"_1"`, `"_2"`, and so on. 
For example: ```python c_0 = tf.constant(0, name="c") # => operation named "c" diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md index d45e666ce7b..7a5e90081d9 100644 --- a/tensorflow/docs_src/programmers_guide/index.md +++ b/tensorflow/docs_src/programmers_guide/index.md @@ -13,7 +13,7 @@ works. The units are as follows: ## Low Level APIs * @{$programmers_guide/low_level_intro}, which introduces the - basics of how you can to use TensorFlow outside of the high Level APIs. + basics of how you can use TensorFlow outside of the high Level APIs. * @{$programmers_guide/tensors}, which explains how to create, manipulate, and access Tensors--the fundamental object in TensorFlow. * @{$programmers_guide/variables}, which details how diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md index 9f50be5b31c..f27a658342b 100644 --- a/tensorflow/docs_src/programmers_guide/saved_model.md +++ b/tensorflow/docs_src/programmers_guide/saved_model.md @@ -285,7 +285,7 @@ with tf.Session(graph=tf.Graph()) as sess: ``` -### Loading a Savedmodel in C++ +### Loading a SavedModel in C++ The C++ version of the SavedModel [loader](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/loader.h) @@ -303,6 +303,30 @@ LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagTrain}, &bundle); ``` +### Loading and Serving a SavedModel in TensorFlow Serving + +You can easily load and serve a SavedModel with the TensorFlow Serving Model +Server binary. See [instructions](https://www.tensorflow.org/serving/setup#installing_using_apt-get) +on how to install the server, or build it if you wish. + +Once you have the Model Server, run it with: +``` +tensorflow_model_server --port=port-numbers --model_name=your-model-name --model_base_path=your_model_base_path +``` +Set the port and model_name flags to values of your choosing. The +model_base_path flag expects to be to a base directory, with each version of +your model residing in a numerically named subdirectory. If you only have a +single version of your model, simply place it in a subdirectory like so: +* Place the model in /tmp/model/0001 +* Set model_base_path to /tmp/model + +Store different versions of your model in numerically named subdirectories of a +common base directory. For example, suppose the base directory is `/tmp/model`. +If you have only one version of your model, store it in `/tmp/model/0001`. If +you have two versions of your model, store the second version in +`/tmp/model/0002`, and so on. Set the `--model-base_path` flag to the base +directory (`/tmp/model`, in this example). TensorFlow Model Server will serve +the model in the highest numbered subdirectory of that base directory. ### Standard constants diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle index f7bdf8b816a..0767726aa9a 100644 --- a/tensorflow/examples/android/build.gradle +++ b/tensorflow/examples/android/build.gradle @@ -56,10 +56,12 @@ def nativeOutDir = 'libs/' + cpuType def nativeBuildRule = 'buildNativeBazel' def demoLibPath = '../../../bazel-bin/tensorflow/examples/android/libtensorflow_demo.so' def inferenceLibPath = '../../../bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so' + +// Override for Makefile builds. 
if (nativeBuildSystem == 'makefile') { nativeBuildRule = 'buildNativeMake' - demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_demo.so' - inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_inference.so' + demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_demo.so' + inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_inference.so' } // If building with Bazel, this is the location of the bazel binary. @@ -154,7 +156,8 @@ task buildNativeMake(type: Exec) { '-s', \ 'tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in', \ '-t', \ - 'libtensorflow_inference.so libtensorflow_demo.so' \ + 'libtensorflow_inference.so libtensorflow_demo.so all' \ + , '-a', cpuType \ //, '-T' // Uncomment to skip protobuf and speed up subsequent builds. } diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java index 2fe2ba539ed..af6af2bc8f5 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java @@ -199,7 +199,7 @@ public class MultiBoxTracker { final int w, final int h, final int rowStride, - final int sensorOrienation, + final int sensorOrientation, final byte[] frame, final long timestamp) { if (objectTracker == null && !initialized) { @@ -209,7 +209,7 @@ public class MultiBoxTracker { objectTracker = ObjectTracker.getInstance(w, h, rowStride, true); frameWidth = w; frameHeight = h; - this.sensorOrientation = sensorOrienation; + this.sensorOrientation = sensorOrientation; initialized = true; if (objectTracker == null) { diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py index ec22684eaf6..58c5f87884e 100644 --- a/tensorflow/examples/image_retraining/retrain.py +++ b/tensorflow/examples/image_retraining/retrain.py @@ -344,8 +344,8 @@ def maybe_download_and_extract(data_url): filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress) print() statinfo = os.stat(filepath) - tf.logging.info('Successfully downloaded', filename, statinfo.st_size, - 'bytes.') + tf.logging.info('Successfully downloaded %s %d bytes.', + filename, statinfo.st_size) print('Extracting file from ', filepath) tarfile.open(filepath, 'r:gz').extractall(dest_directory) else: diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py index eb117c39a12..e4e61862b02 100644 --- a/tensorflow/examples/learn/text_classification.py +++ b/tensorflow/examples/learn/text_classification.py @@ -34,8 +34,7 @@ MAX_LABEL = 15 WORDS_FEATURE = 'words' # Name of the input words feature. 
-def estimator_spec_for_softmax_classification( - logits, labels, mode): +def estimator_spec_for_softmax_classification(logits, labels, mode): """Returns EstimatorSpec instance for softmax classification.""" predicted_classes = tf.argmax(logits, 1) if mode == tf.estimator.ModeKeys.PREDICT: @@ -53,8 +52,8 @@ def estimator_spec_for_softmax_classification( return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op) eval_metric_ops = { - 'accuracy': tf.metrics.accuracy( - labels=labels, predictions=predicted_classes) + 'accuracy': + tf.metrics.accuracy(labels=labels, predictions=predicted_classes) } return tf.estimator.EstimatorSpec( mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) @@ -67,8 +66,7 @@ def bag_of_words_model(features, labels, mode): bow_embedding_column = tf.feature_column.embedding_column( bow_column, dimension=EMBEDDING_SIZE) bow = tf.feature_column.input_layer( - features, - feature_columns=[bow_embedding_column]) + features, feature_columns=[bow_embedding_column]) logits = tf.layers.dense(bow, MAX_LABEL, activation=None) return estimator_spec_for_softmax_classification( @@ -110,9 +108,9 @@ def main(unused_argv): # Prepare training and testing data dbpedia = tf.contrib.learn.datasets.load_dataset( 'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data) - x_train = pandas.Series(dbpedia.train.data[:,1]) + x_train = pandas.Series(dbpedia.train.data[:, 1]) y_train = pandas.Series(dbpedia.train.target) - x_test = pandas.Series(dbpedia.test.data[:,1]) + x_test = pandas.Series(dbpedia.test.data[:, 1]) y_test = pandas.Series(dbpedia.test.target) # Process vocabulary @@ -152,10 +150,7 @@ def main(unused_argv): # Predict. test_input_fn = tf.estimator.inputs.numpy_input_fn( - x={WORDS_FEATURE: x_test}, - y=y_test, - num_epochs=1, - shuffle=False) + x={WORDS_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False) predictions = classifier.predict(input_fn=test_input_fn) y_predicted = np.array(list(p['class'] for p in predictions)) y_predicted = y_predicted.reshape(np.array(y_test).shape) diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py index fb3ac942039..47dd6a19478 100644 --- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py +++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """A very simple MNIST classifier. 
See extensive documentation at @@ -67,12 +66,19 @@ def main(_): # Test trained model correct_prediction = tf.equal(tf.argmax(y, 1), y_) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) - print(sess.run(accuracy, feed_dict={x: mnist.test.images, - y_: mnist.test.labels})) + print(sess.run( + accuracy, feed_dict={ + x: mnist.test.images, + y_: mnist.test.labels + })) + if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data', - help='Directory for storing input data') + parser.add_argument( + '--data_dir', + type=str, + default='/tmp/tensorflow/mnist/input_data', + help='Directory for storing input data') FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py index d055d157454..f6906b0f79b 100644 --- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py +++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py @@ -270,12 +270,6 @@ with tf.Session(graph=graph) as session: run_metadata=run_metadata) average_loss += loss_val - # Add returned summaries to writer in each step. - writer.add_summary(summary, step) - # Add metadata to visualize the graph for the last run. - if step == (num_steps - 1): - writer.add_run_metadata(run_metadata, 'step%d' % step) - # Add returned summaries to writer in each step. writer.add_summary(summary, step) # Add metadata to visualize the graph for the last run. diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go index fc087d9d995..08943a527cb 100644 --- a/tensorflow/go/graph.go +++ b/tensorflow/go/graph.go @@ -173,7 +173,11 @@ type OpSpec struct { // operation. Attrs map[string]interface{} - // Other possible fields: Device, ColocateWith, ControlInputs. + // Operations that must be executed before executing the operation + // being added. + ControlDependencies []*Operation + + // Other possible fields: Device, ColocateWith. } // AddOperation adds an operation to g. @@ -204,6 +208,9 @@ func (g *Graph) AddOperation(args OpSpec) (*Operation, error) { } } } + for _, in := range args.ControlDependencies { + C.TF_AddControlInput(cdesc, in.c) + } status := newStatus() for name, value := range args.Attrs { if err := setAttr(cdesc, status, name, value); err != nil { diff --git a/tensorflow/go/op/scope.go b/tensorflow/go/op/scope.go index a9ec79463a0..13de4294dc2 100644 --- a/tensorflow/go/op/scope.go +++ b/tensorflow/go/op/scope.go @@ -33,10 +33,11 @@ import ( // A Scope object and all its derivates (e.g., obtained from Scope.SubScope) // are not safe for concurrent use by multiple goroutines. type Scope struct { - graph *tf.Graph - namemap map[string]int - namespace string - err *scopeErr + graph *tf.Graph + namemap map[string]int + namespace string + controlDependencies []*tf.Operation + err *scopeErr } // scopeErr is used to share errors between all derivatives of a root scope. @@ -80,6 +81,7 @@ func (s *Scope) AddOperation(args tf.OpSpec) *tf.Operation { if s.namespace != "" { args.Name = s.namespace + "/" + args.Name } + args.ControlDependencies = append(args.ControlDependencies, s.controlDependencies...) 
op, err := s.graph.AddOperation(args) if err != nil { s.UpdateErr(args.Type, err) @@ -103,6 +105,28 @@ func (s *Scope) SubScope(namespace string) *Scope { } } +// WithControlDependencies returns a new Scope which will cause all operations +// added to the graph to execute only after all the provided operations have +// executed first (in addition to any other control dependencies in s). +func (s *Scope) WithControlDependencies(ops ...*tf.Operation) *Scope { + // Force a copy of the control dependencies into a new underlying array on + // every call. We cannot alias the same underlying array as `ops`, otherwise + // the user could modify that array after calling s.WithControlDependencies, + // which would be confusing. We cannot alias the same underlying array as the + // original `s.controlDependencies`, since Scopes form a logical tree, and + // other calls to s.WithControlDependencies could stomp on each other. + deps := make([]*tf.Operation, 0, len(s.controlDependencies)+len(ops)) + deps = append(deps, s.controlDependencies...) + deps = append(deps, ops...) + return &Scope{ + graph: s.graph, + namemap: s.namemap, + namespace: s.namespace, + controlDependencies: deps, + err: s.err, + } +} + // Err returns the error, if any, encountered during the construction // of the Graph managed by s. // diff --git a/tensorflow/go/op/scope_test.go b/tensorflow/go/op/scope_test.go index 6fb5d32e503..b58a61de98b 100644 --- a/tensorflow/go/op/scope_test.go +++ b/tensorflow/go/op/scope_test.go @@ -69,6 +69,49 @@ func TestScopeSubScopeErrors(t *testing.T) { } } +func TestControlDependencies(t *testing.T) { + var ( + s = NewScope() + zero = Const(s.SubScope("zero"), int32(0)) + one = Const(s.SubScope("one"), int32(1)) + variable = VarHandleOp(s, tf.Int32, tf.ScalarShape()) + init = AssignVariableOp(s, variable, zero) + update = AssignAddVariableOp(s, variable, one) + readDeps = []*tf.Operation{update} + ) + // We intend for `read` to have a control dependency on `update`. + s = s.WithControlDependencies(readDeps...) + // Ensure that Scope.WithControlDependencies makes a copy of the underlying + // array, rather than just holding a slice reference to the same user-supplied + // underlying array. If the copy is correctly performed, overwriting + // readDeps[0] should have no effect on control dependencies for `read`. + readDeps[0] = init + read := ReadVariableOp(s, variable, tf.Int32) + + graph, err := s.Finalize() + if err != nil { + t.Fatal(err) + } + sess, err := tf.NewSession(graph, nil) + if err != nil { + t.Fatal(err) + } + if _, err = sess.Run(nil, nil, []*tf.Operation{init}); err != nil { + t.Fatal(err) + } + // Without the control dependency, the read operation may not see the + // update. + for i := int32(0); i < 10; i++ { + out, err := sess.Run(nil, []tf.Output{read}, nil) + if err != nil { + t.Fatal(err) + } + if got, want := out[0].Value().(int32), i+1; got != want { + t.Errorf("Got %d, want %d", got, want) + } + } +} + func TestScopeFinalize(t *testing.T) { var ( root = NewScope() diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 5b19c90238e..cb47651d7b3 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -8729,31 +8729,6 @@ func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out return op.Output(0) } -// Compute the pairwise cross product. -// -// `a` and `b` must be the same shape; they can either be simple 3-element vectors, -// or any shape where the innermost dimension is 3. 
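The Go `Scope.WithControlDependencies` added above mirrors the ordering guarantee that graph-mode Python already exposes through `tf.control_dependencies`. A rough Python sketch of the same read-after-update pattern the Go test checks (the variable and loop bounds are illustrative only):

```python
import tensorflow as tf

v = tf.Variable(0, dtype=tf.int32)
update = tf.assign_add(v, 1)

with tf.control_dependencies([update]):
  # Created inside the block, so it runs only after `update` has executed.
  read = v.read_value()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for expected in range(1, 4):
    assert sess.run(read) == expected  # observes 1, 2, 3 as in the Go test
```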
In the latter case, each pair -// of corresponding 3-element vectors is cross-multiplied independently. -// -// Arguments: -// a: A tensor containing 3-element vectors. -// b: Another tensor, of same type and shape as `a`. -// -// Returns Pairwise cross product of the vectors in `a` and `b`. -func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) { - if scope.Err() != nil { - return - } - opspec := tf.OpSpec{ - Type: "Cross", - Input: []tf.Input{ - a, b, - }, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Transforms a vector of brain.Example protos (as strings) into typed tensors. // // Arguments: @@ -21290,6 +21265,31 @@ func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output return op.Output(0) } +// Compute the pairwise cross product. +// +// `a` and `b` must be the same shape; they can either be simple 3-element vectors, +// or any shape where the innermost dimension is 3. In the latter case, each pair +// of corresponding 3-element vectors is cross-multiplied independently. +// +// Arguments: +// a: A tensor containing 3-element vectors. +// b: Another tensor, of same type and shape as `a`. +// +// Returns Pairwise cross product of the vectors in `a` and `b`. +func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "Cross", + Input: []tf.Input{ + a, b, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Performs a padding as a preprocess during a convolution. // // Similar to FusedResizeAndPadConv2d, this op allows for an optimized diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml index 6285ee0483d..a9ce5372aeb 100644 --- a/tensorflow/java/maven/libtensorflow/pom.xml +++ b/tensorflow/java/maven/libtensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ libtensorflow diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml index b0e5c44fecc..fe34ca83ff3 100644 --- a/tensorflow/java/maven/libtensorflow_jni/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ libtensorflow_jni diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml index 02c5dca13f4..390152808eb 100644 --- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml +++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ libtensorflow_jni_gpu diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml index 949597ca7f1..524ec45f48b 100644 --- a/tensorflow/java/maven/pom.xml +++ b/tensorflow/java/maven/pom.xml @@ -6,7 +6,7 @@ 4.0.0org.tensorflowparentpom - 1.5.0-rc1 + 1.5.0pomhttps://www.tensorflow.org diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml index 9f0ebcf84c9..9cf3217f51f 100644 --- a/tensorflow/java/maven/proto/pom.xml +++ b/tensorflow/java/maven/proto/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ proto diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml index 88d897362ad..d619f986a9a 100644 --- a/tensorflow/java/maven/tensorflow/pom.xml +++ b/tensorflow/java/maven/tensorflow/pom.xml @@ -6,7 +6,7 @@ org.tensorflow parentpom - 1.5.0-rc1 + 1.5.0 ../ tensorflow diff --git 
a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java index 499757e8cf4..cf773e1686d 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java +++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java @@ -88,7 +88,7 @@ final class NativeLibrary { // Deletions are in the reverse order of requests, so we need to request that the directory be // deleted first, so that it is empty when the request is fulfilled. tempPath.deleteOnExit(); - final String tempDirectory = tempPath.toString(); + final String tempDirectory = tempPath.getCanonicalPath(); if (frameworkResource != null) { extractResource(frameworkResource, frameworkLibName, tempDirectory); } else { diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 01b3e92d2d9..059bf729b07 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -76,6 +76,7 @@ py_library( ":layers", ":lib", ":list_ops", + ":manip_ops", ":math_ops", ":metrics", ":nn", @@ -298,6 +299,7 @@ cc_library( ":safe_ptr", "//tensorflow/c:tf_status_helper", "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -1394,6 +1396,14 @@ tf_gen_op_wrapper_private_py( ], ) +tf_gen_op_wrapper_private_py( + name = "manip_ops_gen", + visibility = [ + "//learning/brain/python/ops:__pkg__", + "//tensorflow/python/kernel_tests:__pkg__", + ], +) + tf_gen_op_wrapper_private_py( name = "math_ops_gen", visibility = [ @@ -1726,6 +1736,8 @@ py_library( ":linalg_grad", ":linalg_ops", ":logging_ops", + ":manip_grad", + ":manip_ops", ":math_grad", ":math_ops", ":platform", @@ -1848,6 +1860,29 @@ py_library( ], ) +py_library( + name = "manip_grad", + srcs = ["ops/manip_grad.py"], + srcs_version = "PY2AND3", + deps = [ + ":control_flow_ops", + ":framework_for_generated_wrappers", + ":manip_ops", + ], +) + +py_library( + name = "manip_ops", + srcs = ["ops/manip_ops.py"], + srcs_version = "PY2AND3", + deps = [ + ":dtypes", + ":framework_ops", + ":manip_ops_gen", + "//third_party/py/numpy", + ], +) + py_library( name = "logging_ops", srcs = ["ops/logging_ops.py"], @@ -2310,6 +2345,8 @@ py_library( ":linalg_ops", ":logging_ops", ":lookup_ops", + ":manip_grad", + ":manip_ops", ":math_grad", ":math_ops", ":numerics", @@ -2668,6 +2705,7 @@ cuda_py_test( ":nn_ops_gen", "//third_party/py/numpy", ], + shard_count = 4, tags = ["no_windows"], ) @@ -4228,12 +4266,6 @@ filegroup( visibility = ["//tensorflow:__subpackages__"], ) -filegroup( - name = "hidden_ops", - srcs = ["ops/hidden_ops.txt"], - visibility = ["//tensorflow:__subpackages__"], -) - cuda_py_test( name = "accumulate_n_benchmark", size = "large", diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index bc9ddec2a54..ea7604d30f1 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -84,6 +84,7 @@ from tensorflow.python.feature_column import feature_column_lib as feature_colum from tensorflow.python.layers import layers from tensorflow.python.ops import bitwise_ops as bitwise from tensorflow.python.ops import image_ops as image +from tensorflow.python.ops import manip_ops as manip from tensorflow.python.ops import metrics from tensorflow.python.ops import nn from tensorflow.python.ops import sets @@ -241,6 +242,7 @@ _allowed_symbols.extend([ 'linalg', 'logging', 'losses', + 'manip', 'metrics', 'newaxis', 'nn', diff --git a/tensorflow/python/build_defs.bzl 
b/tensorflow/python/build_defs.bzl index 7f29adc06fc..b9056f86e6d 100644 --- a/tensorflow/python/build_defs.bzl +++ b/tensorflow/python/build_defs.bzl @@ -22,7 +22,6 @@ def tf_gen_op_wrapper_private_py(name, out=None, deps=[], bare_op_name = name[:-4] # Strip off the _gen tf_gen_op_wrapper_py(name=bare_op_name, out=out, - hidden_file="ops/hidden_ops.txt", visibility=visibility, deps=deps, require_shape_functions=require_shape_functions, diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py index 7bba10efacf..aaf41626ab0 100644 --- a/tensorflow/python/client/device_lib_test.py +++ b/tensorflow/python/client/device_lib_test.py @@ -34,7 +34,8 @@ class DeviceLibTest(test_util.TensorFlowTestCase): # GPU test if test.is_gpu_available(): self.assertGreater(len(devices), 1) - self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices]) + self.assertTrue("GPU" in [d.device_type for d in devices] or + "SYCL" in [d.device_type for d in devices]) if __name__ == "__main__": diff --git a/tensorflow/python/client/notebook.py b/tensorflow/python/client/notebook.py index 8babe35b323..4b6a0f71ae6 100644 --- a/tensorflow/python/client/notebook.py +++ b/tensorflow/python/client/notebook.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Notebook front-end to TensorFlow. When you run this binary, you'll see something like below, which indicates @@ -43,10 +42,8 @@ from tensorflow.python.platform import app os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp" os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION"] = "2" - FLAGS = None - ORIG_ARGV = sys.argv # Main notebook process calls itself with argv[1]="kernel" to start kernel # subprocesses. @@ -73,8 +70,8 @@ def main(unused_argv): notebookapp.ip = "0.0.0.0" notebookapp.password = passwd(FLAGS.password) else: - print ("\nNo password specified; Notebook server will only be available" - " on the local machine.\n") + print("\nNo password specified; Notebook server will only be available" + " on the local machine.\n") notebookapp.initialize(argv=["--notebook-dir", FLAGS.notebook_dir]) if notebookapp.ip == "0.0.0.0": @@ -125,8 +122,8 @@ if __name__ == "__main__": # kernel app. if IS_KERNEL: # Drop everything except --flagfile. - sys.argv = ([sys.argv[0]] + - [x for x in sys.argv[1:] if x.startswith("--flagfile")]) + sys.argv = ( + [sys.argv[0]] + [x for x in sys.argv[1:] if x.startswith("--flagfile")]) FLAGS, unparsed = parser.parse_known_args() app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index e6f94396b85..6befeb846d8 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import session_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat from tensorflow.python.util import nest +from tensorflow.python.util.tf_export import tf_export class SessionInterface(object): @@ -1441,6 +1442,7 @@ class BaseSession(SessionInterface): return handles +@tf_export('Session') class Session(BaseSession): """A class for running TensorFlow operations. 
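The `manip_ops` wiring above adds a `tf.manip` namespace to the top-level API. A hedged usage sketch, assuming `roll` is among the ops the new `ops/manip_ops.py` module exposes (the wrapper itself is outside these hunks):

```python
import tensorflow as tf

x = tf.constant([0, 1, 2, 3, 4])
# Roll elements forward by two positions along axis 0.
rolled = tf.manip.roll(x, shift=2, axis=0)

with tf.Session() as sess:
  print(sess.run(rolled))  # [3 4 0 1 2]
```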
@@ -1581,6 +1583,7 @@ class Session(BaseSession): tf_session.TF_Reset(target, containers, config) +@tf_export('InteractiveSession') class InteractiveSession(BaseSession): """A TensorFlow `Session` for use in interactive contexts, such as a shell. diff --git a/tensorflow/python/client/session_benchmark.py b/tensorflow/python/client/session_benchmark.py index 721bca91b71..06e9a099267 100644 --- a/tensorflow/python/client/session_benchmark.py +++ b/tensorflow/python/client/session_benchmark.py @@ -22,6 +22,7 @@ import time import numpy as np +from six.moves import xrange from tensorflow.python.client import session from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index 768a5db88aa..f12c0055115 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -46,6 +46,7 @@ from tensorflow.python.framework import versions from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import data_flow_ops +from tensorflow.python.ops import gen_control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops # Import resource_variable_ops for the variables-to-tensor implicit conversion. @@ -1745,8 +1746,10 @@ class SessionTest(test_util.TensorFlowTestCase): def runTestBuildGraphError(self, sess): # Ensure that errors from building the graph get propagated. data = array_ops.placeholder(dtypes.float32, shape=[]) - enter_1 = control_flow_ops.enter(data, 'foo_1', False) - enter_2 = control_flow_ops.enter(data, 'foo_2', False) + # pylint: disable=protected-access + enter_1 = gen_control_flow_ops._enter(data, 'foo_1', False) + enter_2 = gen_control_flow_ops._enter(data, 'foo_2', False) + # pylint: enable=protected-access res = math_ops.add(enter_1, enter_2) with self.assertRaisesOpError('has inputs from different frames'): sess.run(res, feed_dict={data: 1.0}) diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD index 43cbde69d9d..8b8adefa65a 100644 --- a/tensorflow/python/data/kernel_tests/BUILD +++ b/tensorflow/python/data/kernel_tests/BUILD @@ -357,6 +357,9 @@ tf_py_test( "//tensorflow/python:session", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/data/ops:iterator_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:string_ops", + "//tensorflow/python:lookup_ops", ], grpc_enabled = True, tags = [ diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py index 45dfa13720b..2c65c49ebdc 100644 --- a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py +++ b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py @@ -21,6 +21,7 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import function @@ -28,6 +29,8 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import functional_ops +from tensorflow.python.ops 
import lookup_ops +from tensorflow.python.ops import string_ops from tensorflow.python.platform import test @@ -103,6 +106,40 @@ class IteratorClusterTest(test.TestCase): "/job:worker/replica:0/task:1/cpu:0", workers[0].target) + def testCaptureHashTableInSharedIterator(self): + worker, _ = test_util.create_local_cluster(1, 1) + + # NOTE(mrry): We must use the V2 variants of `HashTable` + # etc. because these produce a `tf.resource`-typed output that is + # compatible with the in-graph function implementation. + default_val = -1 + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([0, 1, 2], dtypes.int64) + table = lookup_ops.HashTable( + lookup_ops.KeyValueTensorInitializer(keys, values), + default_val, + shared_name="shared_table") + + input_sentences = dataset_ops.Dataset.from_tensor_slices( + ["brain brain tank salad surgery", "surgery brain"]) + + iterator = ( + input_sentences.map(lambda x: string_ops.string_split([x]).values).map( + table.lookup) + .make_initializable_iterator(shared_name="shared_iterator")) + init_op = iterator.initializer + get_next = iterator.get_next() + + with session.Session(worker[0].target) as sess: + sess.run(table.init) + sess.run(init_op) + self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next)) + + with session.Session(worker[0].target) as sess: + self.assertAllEqual([2, 0], sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index c1ba67e4744..c4b7e4919bb 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -41,8 +41,10 @@ from tensorflow.python.ops import gen_io_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops from tensorflow.python.util import deprecation +from tensorflow.python.util.tf_export import tf_export +@tf_export("data.Dataset") class Dataset(object): """Represents a potentially large set of elements. @@ -556,6 +558,8 @@ class Dataset(object): - /path/to/dir/b.py - /path/to/dir/c.py + NOTE: The order of the file names returned can be non-deterministic. + Args: file_pattern: A string or scalar string `tf.Tensor`, representing the filename pattern that will be matched. @@ -899,10 +903,11 @@ class Dataset(object): Args: transformation_func: A function that takes one `Dataset` argument and - returns a `Dataset`. + returns a `Dataset`. Returns: - Dataset: The `Dataset` returned by applying `transformation_func` to this dataset. + Dataset: The `Dataset` returned by applying `transformation_func` to this + dataset. 
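The clarified `apply` docstring above boils down to: the transformation function receives one `Dataset` and must return a `Dataset`. A small sketch of that contract (the transformation itself is made up for illustration):

```python
import tensorflow as tf

def double_and_batch(dataset):
  # Takes a Dataset, returns a Dataset -- the contract `apply` enforces.
  return dataset.map(lambda x: x * 2).batch(2)

dataset = tf.data.Dataset.range(6).apply(double_and_batch)
next_element = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
  print(sess.run(next_element))  # [0 2]
  print(sess.run(next_element))  # [4 6]
```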
""" dataset = transformation_func(self) if not isinstance(dataset, Dataset): @@ -1454,6 +1459,19 @@ def _padding_value_to_tensor(value, output_type): return value +def _default_padding(input_dataset): + + def make_zero(t): + if t.base_dtype == dtypes.string: + return "" + elif t.base_dtype == dtypes.variant: + raise TypeError("Unable to create padding for field of type 'variant'") + else: + return np.zeros_like(t.as_numpy_dtype()) + + return nest.map_structure(make_zero, input_dataset.output_types) + + class PaddedBatchDataset(Dataset): """A `Dataset` that batches and pads contiguous elements from its input.""" @@ -1469,23 +1487,13 @@ class PaddedBatchDataset(Dataset): batch_size, dtype=dtypes.int64, name="batch_size") padding_values = ( padding_values - if padding_values is not None else self._default_padding(input_dataset)) + if padding_values is not None else _default_padding(input_dataset)) self._padded_shapes = nest.map_structure_up_to( input_dataset.output_shapes, _partial_shape_to_tensor, padded_shapes) self._padding_values = nest.map_structure_up_to( input_dataset.output_shapes, _padding_value_to_tensor, padding_values, input_dataset.output_types) - def _default_padding(self, input_dataset): - - def make_zero(t): - if t.base_dtype == dtypes.string: - return "" - else: - return np.zeros_like(t.as_numpy_dtype()) - - return nest.map_structure(make_zero, input_dataset.output_types) - def _as_variant_tensor(self): return gen_dataset_ops.padded_batch_dataset( self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py index 0cbdb3ab19d..e573fe01928 100644 --- a/tensorflow/python/data/ops/iterator_ops.py +++ b/tensorflow/python/data/ops/iterator_ops.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.util.tf_export import tf_export # NOTE(mrry): It is legitimate to call `Iterator.get_next()` multiple @@ -47,6 +48,7 @@ GET_NEXT_CALL_WARNING_MESSAGE = ( "`next_element` inside the loop.") +@tf_export("data.Iterator") class Iterator(object): """Represents the state of iterating through a `Dataset`.""" @@ -165,8 +167,10 @@ class Iterator(object): iterator_resource = gen_dataset_ops.iterator( container="", shared_name=shared_name, - output_types=nest.flatten(output_types), - output_shapes=nest.flatten(output_shapes)) + output_types=nest.flatten( + sparse.as_dense_types(output_types, output_classes)), + output_shapes=nest.flatten( + sparse.as_dense_shapes(output_shapes, output_classes))) return Iterator(iterator_resource, None, output_types, output_shapes, output_classes) @@ -232,8 +236,10 @@ class Iterator(object): string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string) iterator_resource = gen_dataset_ops.iterator_from_string_handle( string_handle, - output_types=nest.flatten(output_types), - output_shapes=nest.flatten(output_shapes)) + output_types=nest.flatten( + sparse.as_dense_types(output_types, output_classes)), + output_shapes=nest.flatten( + sparse.as_dense_shapes(output_shapes, output_classes))) return Iterator(iterator_resource, None, output_types, output_shapes, output_classes) diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index 830dc5cec4a..fa7601741b1 100644 --- a/tensorflow/python/data/ops/readers.py +++ 
b/tensorflow/python/data/ops/readers.py @@ -23,12 +23,14 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.util.tf_export import tf_export # TODO(b/64974358): Increase default buffer size to 256 MB. _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024 # 256 KB +@tf_export("data.TextLineDataset") class TextLineDataset(Dataset): """A `Dataset` comprising lines from one or more text files.""" @@ -71,6 +73,7 @@ class TextLineDataset(Dataset): return dtypes.string +@tf_export("data.TFRecordDataset") class TFRecordDataset(Dataset): """A `Dataset` comprising records from one or more TFRecord files.""" @@ -115,6 +118,7 @@ class TFRecordDataset(Dataset): return dtypes.string +@tf_export("data.FixedLengthRecordDataset") class FixedLengthRecordDataset(Dataset): """A `Dataset` of fixed-length records from one or more binary files.""" diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py index 2455395635c..e90ce3fb40a 100644 --- a/tensorflow/python/data/util/nest.py +++ b/tensorflow/python/data/util/nest.py @@ -266,7 +266,7 @@ def map_structure(func, *structure, **check_types_dict): and the return value will contain the results in the same structure. Args: - func: A callable that acceps as many arguments are there are structures. + func: A callable that accepts as many arguments are there are structures. *structure: scalar, or tuple or list of constructed scalars and/or other tuples/lists, or scalars. Note: numpy arrays are considered scalars. **check_types_dict: only valid keyword argument is `check_types`. If set to @@ -383,8 +383,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True): "structure has keys %s, while shallow structure has keys %s." % (list(_six.iterkeys(input_tree)), list(_six.iterkeys(shallow_tree)))) - input_tree = list(_six.iteritems(input_tree)) - shallow_tree = list(_six.iteritems(shallow_tree)) + input_tree = list(sorted(_six.iteritems(input_tree))) + shallow_tree = list(sorted(_six.iteritems(shallow_tree))) for shallow_branch, input_branch in zip(shallow_tree, input_tree): assert_shallow_structure(shallow_branch, input_branch, @@ -479,8 +479,8 @@ def map_structure_up_to(shallow_tree, func, *inputs): The `inputs`, can be thought of as having the same structure as `shallow_tree`, but with leaf nodes that are themselves tree structures. - This function therefore will return something with the same base structure as - `shallow_tree`. + This function, therefore, will return something with the same base structure + as `shallow_tree`. 
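The `_default_padding` helper moved to module level above is what `padded_batch` falls back on when `padding_values` is omitted: zeros for numeric components and the empty string for string components. A minimal sketch of that default behavior (the toy pipeline is illustrative only):

```python
import tensorflow as tf

# Variable-length elements: [1], [1, 2], [1, 2, 3].
dataset = tf.data.Dataset.range(1, 4).map(lambda n: tf.range(1, n + 1))
# No padding_values given, so the int64 components are padded with zeros.
padded = dataset.padded_batch(3, padded_shapes=tf.TensorShape([None]))

with tf.Session() as sess:
  print(sess.run(padded.make_one_shot_iterator().get_next()))
  # [[1 0 0]
  #  [1 2 0]
  #  [1 2 3]]
```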
Examples: diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py index 90dd7dfe777..ff380815a4a 100644 --- a/tensorflow/python/data/util/nest_test.py +++ b/tensorflow/python/data/util/nest_test.py @@ -277,6 +277,10 @@ class NestTest(test.TestCase): with self.assertRaisesRegexp(ValueError, expected_message): nest.assert_shallow_structure(inp_ab2, inp_ab1) + inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))]) + inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)]) + nest.assert_shallow_structure(inp_ab, inp_ba) + def testFlattenUpTo(self): input_tree = (((2, 2), (3, 3)), ((4, 9), (5, 5))) shallow_tree = ((True, True), (False, True)) diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py index 5ebcb4ea81b..5e6d2247097 100644 --- a/tensorflow/python/data/util/sparse.py +++ b/tensorflow/python/data/util/sparse.py @@ -141,7 +141,7 @@ def serialize_sparse_tensors(tensors): tensors: a tensor structure to serialize. Returns: - `tensors` with any sparse tensors replaced by the their serialized version. + `tensors` with any sparse tensors replaced by their serialized version. """ ret = nest.pack_sequence_as(tensors, [ diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py index d4aea76d652..e0759a8bc1a 100644 --- a/tensorflow/python/debug/cli/tensor_format.py +++ b/tensorflow/python/debug/cli/tensor_format.py @@ -535,7 +535,7 @@ def numeric_summary(tensor): if not isinstance(tensor, np.ndarray) or not np.size(tensor): return debugger_cli_common.RichTextLines([ "No numeric summary available due to empty tensor."]) - elif (np.issubdtype(tensor.dtype, np.float) or + elif (np.issubdtype(tensor.dtype, np.floating) or np.issubdtype(tensor.dtype, np.complex) or np.issubdtype(tensor.dtype, np.integer)): counts = [ diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py index 704dbda357d..3821b393ec6 100644 --- a/tensorflow/python/debug/examples/debug_fibonacci.py +++ b/tensorflow/python/debug/examples/debug_fibonacci.py @@ -44,6 +44,10 @@ def main(_): sess.run(tf.global_variables_initializer()) # Wrap the TensorFlow Session object for debugging. + if FLAGS.debug and FLAGS.tensorboard_debug_address: + raise ValueError( + "The --debug and --tensorboard_debug_address flags are mutually " + "exclusive.") if FLAGS.debug: sess = tf_debug.LocalCLIDebugWrapperSession(sess) @@ -52,6 +56,9 @@ def main(_): sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) sess.add_tensor_filter("has_negative", has_negative) + elif FLAGS.tensorboard_debug_address: + sess = tf_debug.TensorBoardDebugWrapperSession( + sess, FLAGS.tensorboard_debug_address) print("Fibonacci number at position %d:\n%s" % (FLAGS.length, sess.run(n1))) @@ -82,7 +89,15 @@ if __name__ == "__main__": "--debug", dest="debug", action="store_true", - help="Use TensorFlow Debugger (tfdbg).") + help="Use TensorFlow Debugger (tfdbg). Mutually exclusive with the " + "--tensorboard_debug_address flag.") + parser.add_argument( + "--tensorboard_debug_address", + type=str, + default=None, + help="Connect to the TensorBoard Debugger Plugin backend specified by " + "the gRPC address (e.g., localhost:1234). 
Mutually exclusive with the " + "--debug flag.") FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py index 0a6dbf311d8..ab1c90371cd 100644 --- a/tensorflow/python/debug/examples/debug_mnist.py +++ b/tensorflow/python/debug/examples/debug_mnist.py @@ -120,8 +120,15 @@ def main(_): sess.run(tf.global_variables_initializer()) + if FLAGS.debug and FLAGS.tensorboard_debug_address: + raise ValueError( + "The --debug and --tensorboard_debug_address flags are mutually " + "exclusive.") if FLAGS.debug: sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type) + elif FLAGS.tensorboard_debug_address: + sess = tf_debug.TensorBoardDebugWrapperSession( + sess, FLAGS.tensorboard_debug_address) # Add this point, sess is a debug wrapper around the actual Session if # FLAGS.debug is true. In that case, calling run() will launch the CLI. @@ -173,6 +180,14 @@ if __name__ == "__main__": nargs="?", const=True, default=False, - help="Use debugger to track down bad values during training") + help="Use debugger to track down bad values during training. " + "Mutually exclusive with the --tensorboard_debug_address flag.") + parser.add_argument( + "--tensorboard_debug_address", + type=str, + default=None, + help="Connect to the TensorBoard Debugger Plugin backend specified by " + "the gRPC address (e.g., localhost:1234). Mutually exclusive with the " + "--debug flag.") FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py index 92314d8dd9f..4f4666ee4fa 100644 --- a/tensorflow/python/debug/examples/debug_tflearn_iris.py +++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py @@ -110,10 +110,16 @@ def main(_): model_dir=model_dir) hooks = None + if FLAGS.debug and FLAGS.tensorboard_debug_address: + raise ValueError( + "The --debug and --tensorboard_debug_address flags are mutually " + "exclusive.") if FLAGS.debug: debug_hook = tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type, dump_root=FLAGS.dump_root) - hooks = [debug_hook] + elif FLAGS.tensorboard_debug_address: + debug_hook = tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address) + hooks = [debug_hook] if not FLAGS.use_experiment: # Fit model. @@ -185,11 +191,19 @@ if __name__ == "__main__": nargs="?", const=True, default=False, - help="Use debugger to track down bad values during training") + help="Use debugger to track down bad values during training. " + "Mutually exclusive with the --tensorboard_debug_address flag.") parser.add_argument( "--dump_root", type=str, default="", help="Optional custom root directory for temporary debug dump data") + parser.add_argument( + "--tensorboard_debug_address", + type=str, + default=None, + help="Connect to the TensorBoard Debugger Plugin backend specified by " + "the gRPC address (e.g., localhost:1234). 
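The flag pair added to these debugger examples selects between two mutually exclusive session wrappers. A condensed sketch of the selection logic outside the argparse scaffolding (the address value is a hypothetical placeholder):

```python
import tensorflow as tf
from tensorflow.python import debug as tf_debug

sess = tf.Session()

use_cli_debugger = False                       # stands in for --debug
tensorboard_debug_address = 'localhost:6064'   # stands in for the new flag

if use_cli_debugger and tensorboard_debug_address:
  raise ValueError('The two debugger modes are mutually exclusive.')
if use_cli_debugger:
  sess = tf_debug.LocalCLIDebugWrapperSession(sess)
elif tensorboard_debug_address:
  # Streams debug data to the TensorBoard Debugger Plugin backend over gRPC.
  sess = tf_debug.TensorBoardDebugWrapperSession(sess,
                                                 tensorboard_debug_address)
```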
Mutually exclusive with the " + "--debug flag.") FLAGS, unparsed = parser.parse_known_args() tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py index c4b13a1045d..8d355aa27f6 100644 --- a/tensorflow/python/debug/lib/debug_data.py +++ b/tensorflow/python/debug/lib/debug_data.py @@ -222,7 +222,7 @@ def has_inf_or_nan(datum, tensor): # Also return False for data types that cannot be represented as numpy # arrays. return False - elif (np.issubdtype(tensor.dtype, np.float) or + elif (np.issubdtype(tensor.dtype, np.floating) or np.issubdtype(tensor.dtype, np.complex) or np.issubdtype(tensor.dtype, np.integer)): return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor)) diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py index b6c7280a415..c1e9869d978 100644 --- a/tensorflow/python/debug/lib/debug_gradients_test.py +++ b/tensorflow/python/debug/lib/debug_gradients_test.py @@ -22,6 +22,7 @@ import shutil import tempfile from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session from tensorflow.python.debug.lib import debug_data from tensorflow.python.debug.lib import debug_gradients @@ -38,7 +39,11 @@ from tensorflow.python.training import gradient_descent class IdentifyGradientTest(test_util.TensorFlowTestCase): def setUp(self): - self.sess = session.Session() + rewriter_config = rewriter_config_pb2.RewriterConfig( + dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF) + graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config) + config = config_pb2.ConfigProto(graph_options=graph_options) + self.sess = session.Session(config=config) with self.sess.as_default(): self.u = variables.Variable(2.0, name="u") self.v = variables.Variable(3.0, name="v") diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py index 367b3535450..b623ee31c5d 100644 --- a/tensorflow/python/debug/lib/session_debug_grpc_test.py +++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py @@ -54,7 +54,8 @@ from tensorflow.python.training import monitored_session def no_rewrite_session_config(): rewriter_config = rewriter_config_pb2.RewriterConfig( disable_model_pruning=True, - arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF) + arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF, + dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF) graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config) return config_pb2.ConfigProto(graph_options=graph_options) diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py index acea9433e22..254201c3937 100644 --- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py +++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py @@ -389,6 +389,11 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase): r"mode\."): sess.invoke_node_stepper(node_stepper) + def testDumpingWrapperWithEmptyFetchWorks(self): + sess = dumping_wrapper.DumpingDebugWrapperSession( + self.sess, session_root=self.session_root, log_usage=False) + sess.run([]) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py index 
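The `np.float` to `np.floating` change repeated in these dtype checks matters because `np.float` is merely an alias for Python's built-in `float`, while `np.floating` is the abstract parent of all NumPy floating dtypes. A short illustration:

```python
import numpy as np

print(np.issubdtype(np.float32, np.floating))  # True
print(np.issubdtype(np.float64, np.floating))  # True
print(np.issubdtype(np.int32, np.floating))    # False
```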
909150eb6aa..c530204bbf6 100644 --- a/tensorflow/python/debug/wrappers/framework.py +++ b/tensorflow/python/debug/wrappers/framework.py @@ -121,7 +121,9 @@ from tensorflow.python.debug.lib import debug_utils from tensorflow.python.debug.lib import stepper from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.platform import tf_logging from tensorflow.python.training import monitored_session +from tensorflow.python.util import nest # Helper function. @@ -439,7 +441,12 @@ class BaseDebugWrapperSession(session.SessionInterface): "callable_runner and fetches/feed_dict are mutually exclusive, but " "are used simultaneously.") - if self._is_disabled_thread(): + empty_fetches = not nest.flatten(fetches) + if empty_fetches: + tf_logging.info( + "Due to empty fetches, tfdbg Session wrapper is letting a " + "Session.run pass through without any debugging actions.") + if self._is_disabled_thread() or empty_fetches: if callable_runner: return callable_runner(*callable_runner_args) else: diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py index 989ad801e53..0204254ccab 100644 --- a/tensorflow/python/debug/wrappers/hooks.py +++ b/tensorflow/python/debug/wrappers/hooks.py @@ -35,10 +35,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): `tf.contrib.learn`'s `Estimator`s and `Experiment`s. """ - def __init__(self, - ui_type="curses", - dump_root=None, - thread_name_filter=None): + def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None): """Create a local debugger command-line interface (CLI) hook. Args: @@ -62,7 +59,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): """Add a tensor filter. See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details. - Override default behavior to accommodate the possibility of this method being + Override default behavior to accommodate the possibility of this method + being called prior to the initialization of the underlying `LocalCLIDebugWrapperSession` object. @@ -137,9 +135,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): # pylint: enable=protected-access with stepper.NodeStepper( - run_context.session, - run_context.original_args. - fetches, + run_context.session, run_context.original_args.fetches, run_context.original_args.feed_dict) as node_stepper: self._session_wrapper.invoke_node_stepper( node_stepper, restore_variable_values_on_exit=True) @@ -149,8 +145,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): def after_run(self, run_context, run_values): # Adapt run_context and run_values to OnRunEndRequest and invoke superclass # on_run_end() - on_run_end_request = framework.OnRunEndRequest( - self._performed_action, run_values.run_metadata) + on_run_end_request = framework.OnRunEndRequest(self._performed_action, + run_values.run_metadata) self._session_wrapper.on_run_end(on_run_end_request) @@ -260,8 +256,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook): self._thread_name_filter = thread_name_filter self._grpc_debug_server_addresses = ( grpc_debug_server_addresses - if isinstance(grpc_debug_server_addresses, list) - else [grpc_debug_server_addresses]) + if isinstance(grpc_debug_server_addresses, list) else + [grpc_debug_server_addresses]) self._watch_fn = watch_fn self._log_usage = log_usage @@ -334,6 +330,7 @@ class TensorBoardDebugHook(GrpcDebugHook): log_usage: Whether the usage of this class is to be logged (if applicable). 
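The empty-fetch pass-through added to `BaseDebugWrapperSession` above hinges on `nest.flatten` returning an empty list for structures with no leaves. A quick sketch of that check using the same `nest` module the wrapper imports:

```python
from tensorflow.python.util import nest

# A structure with no leaf tensors flattens to [], so `not nest.flatten(f)`
# is a convenient "nothing to fetch" test.
print(nest.flatten([]))                                # []
print(nest.flatten({'foo': {'baz': []}, 'bar': ()}))   # []
print(nest.flatten({'foo': [1], 'bar': (2,)}))         # [2, 1]
```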
""" + def _gated_grpc_watch_fn(fetches, feeds): del fetches, feeds # Unused. return framework.WatchOptions( diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py index 770a496aa9d..490812c96d8 100644 --- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py +++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py @@ -664,6 +664,20 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase): [["run"], ["run"]], monitored_sess) self.assertFalse(wrapped_monitored_sess.should_stop()) + def testRunsWithEmptyFetchWorks(self): + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["run"]], self.sess, dump_root="") + + run_output = wrapped_sess.run([]) + self.assertEqual([], run_output) + + def testRunsWithEmptyNestedFetchWorks(self): + wrapped_sess = LocalCLIDebuggerWrapperSessionForTest( + [["run"]], self.sess, dump_root="") + + run_output = wrapped_sess.run({"foo": {"baz": []}, "bar": ()}) + self.assertEqual({"foo": {"baz": []}, "bar": ()}, run_output) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index 9e3382d4f30..ab81d401484 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -206,29 +206,6 @@ cc_library( ], ) -cc_library( - name = "python_eager_op_gen_main", - srcs = [ - "python_eager_op_gen_main.cc", - ], - visibility = ["//visibility:public"], - deps = [ - ":python_eager_op_gen", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:op_gen_lib", - "//tensorflow/core:protos_all_cc", - ], -) - -tf_cc_binary( - name = "python_eager_op_gen_demo", - deps = [ - ":python_eager_op_gen_main", - "//tensorflow/core:ops", - ], -) - py_library( name = "custom_gradient", srcs = ["custom_gradient.py"], diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index a2a3e230bbb..d79d1fc0a64 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -734,7 +734,7 @@ def _num_elements(grad): raise ValueError("`grad` not a Tensor or IndexedSlices.") -_last_shape_dtype = [None, None] +_last_zero_shape_dtype = [None, None] _last_zero = [None] @@ -748,13 +748,15 @@ def _zeros(shape, dtype): # TODO(apassos): need to save enough information about variant tensors to do # a zeros return None - if [shape, dtype] != _last_shape_dtype: - _last_shape_dtype[:] = [shape, dtype] + if [shape, dtype] != _last_zero_shape_dtype: + _last_zero_shape_dtype[:] = [shape, dtype] _last_zero[0] = _fast_fill(0, shape, dtype) return _last_zero[0] def _ones(shape, dtype): + if shape == (): # pylint: disable=g-explicit-bool-comparison + return constant_op.constant(1, dtype=dtype) return _fast_fill(1, shape, dtype) diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py index 2f1654dda49..988442c971f 100644 --- a/tensorflow/python/eager/execution_callbacks.py +++ b/tensorflow/python/eager/execution_callbacks.py @@ -153,7 +153,7 @@ def inf_nan_callback(op_type, continue numpy_dtype = output.dtype.as_numpy_dtype - if (np.issubdtype(numpy_dtype, np.float) or + if (np.issubdtype(numpy_dtype, np.floating) or np.issubdtype(numpy_dtype, np.complex) or np.issubdtype(numpy_dtype, np.integer)): try: diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 81b1f6f12a1..246df9afefb 100644 --- a/tensorflow/python/eager/function.py +++ 
b/tensorflow/python/eager/function.py @@ -292,6 +292,22 @@ def _map_sequence_obj_to_idx(sequence): return {id(x): i for i, x in enumerate(sequence)} +def _flatten(sequence): + """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`.""" + # TODO(akshayka): Support `SparseTensor` in a similar fashion. + flat_sequence = nest.flatten(sequence) + outputs = [] + for item in flat_sequence: + if isinstance(item, ops.IndexedSlices): + if item.dense_shape is not None: + outputs.extend([item.values, item.indices, item.dense_shape]) + else: + outputs.extend([item.values, item.indices]) + else: + outputs.append(item) + return outputs + + class GraphModeFunction(object): """Callable object representing a graph-mode function. @@ -333,14 +349,14 @@ class GraphModeFunction(object): self._input_placeholders = input_placeholders self._extra_inputs = list(extra_inputs) self._graph = graph - self._has_backprop = False + self._backward_function = None self._func_name = name self._function_def = defined_function self._num_outputs = len(defined_function.signature.output_arg) self._ops = operations self._func_outputs = func_outputs self._returns = [func_outputs] if isinstance( - func_outputs, (ops.Tensor, type(None))) else list(func_outputs) + func_outputs, (ops.Tensor, type(None))) else _flatten(func_outputs) self._output_shapes = output_shapes self._variables = variables if variables is not None else [] @@ -348,9 +364,8 @@ class GraphModeFunction(object): def variables(self): return self._variables - def _compute_backprop(self): - """Computes the backprop function object for this function.""" - self._has_backprop = True + def _construct_backprop_function(self): + """Constructs the backprop function object for this function.""" with self._graph.as_default(), context.graph_mode(): c = _CapturingContext() with c: @@ -361,13 +376,16 @@ class GraphModeFunction(object): filtered_outputs, self._input_placeholders, grad_ys=self._out_grad_placeholders) - shapes = tuple(x.shape for x in in_gradients if x is not None) + + backward_outputs = tuple( + grad for grad in _flatten(in_gradients) if grad is not None) + output_shapes = tuple(grad.shape for grad in backward_outputs) + captures = list(sorted(c.captured_tensors, key=lambda x: x.name)) forward_name = _forward_name(self._func_name) self._forward_fdef = _EagerDefinedFunction( forward_name, self._graph, self._ops, self._input_placeholders, filtered_outputs + captures) - backward_outputs = tuple(x for x in in_gradients if x is not None) all_inputs = self._out_grad_placeholders + captures # Excluding input ops from the body as we do not intend to execute these # operations when the function is executed. @@ -381,7 +399,7 @@ class GraphModeFunction(object): bname = _backward_name(self._func_name) self._backward_function = GraphModeFunction( bname, all_inputs, [], self._graph, function_def_ops, - backward_outputs, in_gradients, shapes) + backward_outputs, in_gradients, output_shapes) def _backprop_call(self, args): """Calls the wrapped function and records the result on a tape.""" @@ -426,9 +444,24 @@ class GraphModeFunction(object): @property def output_shapes(self): + """The function's output shapes.""" # TODO(ebrevdo): Should we only keep the output shapes associated # with len(self._returns) outputs? 
- return nest.pack_sequence_as(self._func_outputs, self._output_shapes) + outputs_list = nest.flatten(self._func_outputs) + j = 0 + for i, o in enumerate(outputs_list): + if o is not None: + if isinstance(o, ops.IndexedSlices): + # Extract the shape of the `IndexedSlices` object's `values` field. + outputs_list[i] = self._output_shapes[j] # the `values` shape + if o.dense_shape is not None: + j += 3 # skip over shapes for `values`, `indices`, `dense_shape` + else: + j += 2 # skip over shapes for `values`, `indices` + else: + outputs_list[i] = self._output_shapes[j] + j += 1 + return nest.pack_sequence_as(self._func_outputs, outputs_list) @property def output_dtypes(self): @@ -457,12 +490,11 @@ class GraphModeFunction(object): if v._trainable: # pylint: disable=protected-access tape.watch_variable(v) - tensor_inputs = [x for x in nest.flatten(args) - if isinstance(x, ops.Tensor)] + tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)] if tape.should_record(tensor_inputs) or tape.should_record( self._extra_inputs): - if not self._has_backprop: - self._compute_backprop() + if self._backward_function is None: + self._construct_backprop_function() return self._backprop_call(tensor_inputs) ctx = context.context() @@ -503,13 +535,30 @@ class GraphModeFunction(object): """ if self._func_outputs is None: return None + # Use `nest.flatten` instead of `_flatten` in order to preserve any + # IndexedSlices in `self._func_outputs`. outputs_list = nest.flatten(self._func_outputs) j = 0 for i, o in enumerate(outputs_list): if o is not None: - outputs_list[i] = result[j] - j += 1 - return nest.pack_sequence_as(self._func_outputs, outputs_list) + if isinstance(o, ops.IndexedSlices): + # Repack Tensors for IndexedSlices. + if o.dense_shape is not None: + outputs_list[i] = ops.IndexedSlices( + values=result[j], + indices=result[j + 1], + dense_shape=result[j + 2]) + j += 3 + else: + outputs_list[i] = ops.IndexedSlices( + values=result[j], + indices=result[j + 1]) + j += 2 + else: + outputs_list[i] = result[j] + j += 1 + ret = nest.pack_sequence_as(self._func_outputs, outputs_list) + return ret def _get_defun_inputs(args): @@ -526,15 +575,13 @@ def _get_defun_inputs(args): def _defun_internal(name, func, args, kwds): """Defines and returns graph-mode version of func.""" - container_prefix = ops.get_default_graph()._container_prefix # pylint: disable=protected-access + graph_key = ops.get_default_graph()._graph_key # pylint: disable=protected-access with context.graph_mode(): captures = {} tmp_graph = CapturingGraph(captures) - # Inherit the container prefix, since this is used for error checking when - # isolating eager execution (the container prefix at creation must match the - # container prefix when used, and variables accessed in the defun will be - # used in the outside context). - tmp_graph._container_prefix = container_prefix # pylint: disable=protected-access + # Inherit the graph key, since this is used for matching variables in + # optimizers. + tmp_graph._graph_key = graph_key # pylint: disable=protected-access # Copy the graph collections to ensure summaries and other things work. This # lets the function access (but not mutate) collections of the containing # graph, such as the global step and the summary writer collections. @@ -555,7 +602,7 @@ def _defun_internal(name, func, args, kwds): # Returning a closed-over tensor as an output does not trigger a # call to convert_to_tensor, so we manually capture all such tensors. 
- outputs_list = nest.flatten(func_outputs) + outputs_list = _flatten(func_outputs) func_def_outputs = [ _convert_to_graph_tensor(x) for x in outputs_list if x is not None ] @@ -600,6 +647,18 @@ def _cache_key(x): """Cache key for tfe functions.""" if isinstance(x, ops.Tensor): return _TensorDtype(x.dtype, x._shape_tuple()) # pylint: disable=protected-access + if isinstance(x, ops.IndexedSlices): + if x.dense_shape is not None: + return tuple([ + _TensorDtype(x.values.dtype, x.values._shape_tuple()), # pylint: disable=protected-access + _TensorDtype(x.indices.dtype, x.indices._shape_tuple()), # pylint: disable=protected-access + _TensorDtype(x.dense_shape.dtype, x.dense_shape._shape_tuple()) # pylint: disable=protected-access + ]) + else: + return tuple([ + _TensorDtype(x.values.dtype, x.values._shape_tuple()), # pylint: disable=protected-access + _TensorDtype(x.indices.dtype, x.indices._shape_tuple()) # pylint: disable=protected-access + ]) if isinstance(x, np.ndarray): return ("array", x.shape, tuple(x.reshape(-1))) if isinstance(x, (list, tuple)): diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 0babc29f17b..3e8e67ac7e2 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -374,6 +374,78 @@ class FunctionTest(test.TestCase): self.assertAllEqual(f(constant_op.constant(1.0)), 2.0) + def testGradientOfGatherWithDefun(self): + + v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0]) + + def sum_gather(): + return math_ops.reduce_sum(array_ops.gather(v, [1, 2])) + + grad_fn = backprop.implicit_grad(sum_gather) + gradient = grad_fn() + defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather)) + defun_gradient = defun_grad_fn() + self.assertEqual(len(gradient), len(defun_gradient)) + + gradient = gradient[0][0] + defun_gradient = defun_gradient[0][0] + self.assertAllEqual(gradient.values, defun_gradient.values) + self.assertAllEqual(gradient.indices, defun_gradient.indices) + self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape) + + def testReturningIndexedSlicesWithDefun(self): + + def validate(indexed_slice): + def f(): + return indexed_slice + + output = function.defun(f)() + self.assertTrue(isinstance(output, ops.IndexedSlices)) + self.assertAllEqual(indexed_slice.values, output.values) + self.assertAllEqual(indexed_slice.indices, output.indices) + self.assertAllEqual(indexed_slice.dense_shape, output.dense_shape) + + self.assertEqual( + function.make_defun_op(f).output_shapes, indexed_slice.values.shape) + + arg = ops.IndexedSlices( + values=constant_op.constant([1, 2]), + indices=constant_op.constant([0, 1]), + dense_shape=constant_op.constant([2])) + validate(arg) + + arg = ops.IndexedSlices( + values=constant_op.constant([1, 2]), + indices=constant_op.constant([0, 1]), + dense_shape=None) + validate(arg) + + def testIndexedSliceAsArgumentWithDefun(self): + + @function.defun + def f(indexed_slice): + return indexed_slice + + def validate(arg): + output = f(arg) + self.assertTrue(isinstance(output, ops.IndexedSlices)) + self.assertAllEqual(arg.values, output.values) + self.assertAllEqual(arg.indices, output.indices) + self.assertAllEqual(arg.dense_shape, output.dense_shape) + + indexed_slice = ops.IndexedSlices( + values=constant_op.constant([1]), + indices=constant_op.constant([0]), + dense_shape=constant_op.constant([1])) + validate(indexed_slice) + + # Test that `f` works even when `dense_shape` is None. 
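The `_flatten` helper and the repacking logic above follow a simple convention: an `IndexedSlices` crosses the function boundary as its component tensors and is rebuilt on the way out. A sketch of that round trip with a hand-made value (the numbers are illustrative):

```python
import tensorflow as tf

slices = tf.IndexedSlices(
    values=tf.constant([[1., 2.], [3., 4.]]),
    indices=tf.constant([0, 2]),
    dense_shape=tf.constant([4, 2]))

# Flatten: three component tensors when dense_shape is set, two otherwise.
components = [slices.values, slices.indices, slices.dense_shape]

# Repack: positional order is (values, indices, dense_shape).
rebuilt = tf.IndexedSlices(*components)
```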
+ indexed_slice = ops.IndexedSlices( + values=constant_op.constant([1]), + indices=constant_op.constant([0]), + dense_shape=None) + validate(indexed_slice) + def testFunctionOnDevice(self): if not context.context().num_gpus(): self.skipTest('No GPUs found') @@ -504,6 +576,24 @@ class FunctionTest(test.TestCase): self.assertAllEqual(ret[0][2], 10) self.assertAllEqual(ret[1], 15) + def testVariableNamesRespectNameScopesWithDefun(self): + @function.defun + def create_variable(): + with ops.name_scope('foo'): + v = resource_variable_ops.ResourceVariable(0.0, name='bar') + self.assertEqual(v.name, 'foo/bar:0') + create_variable() + + def testVariableNamesRespectNameScopesWithDefunInGraph(self): + with context.graph_mode(): + @function.defun + def create_variable(): + with ops.name_scope('foo'): + v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar') + self.assertEqual(v.name, 'foo/bar:0') + with ops.get_default_graph().as_default(): + create_variable() + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/eager/gen_op.bzl b/tensorflow/python/eager/gen_op.bzl deleted file mode 100644 index 8bc1d6c10a6..00000000000 --- a/tensorflow/python/eager/gen_op.bzl +++ /dev/null @@ -1,65 +0,0 @@ -"""For eager-mode Python.""" - -load("//tensorflow:tensorflow.bzl", - "clean_dep", - "tf_binary_additional_srcs", - "tf_copts", - "tf_cc_binary") - -def tfe_gen_op_wrapper_py(name, - out=None, - visibility=None, - deps=[], - generated_target_name=None, - # ApiDefs will be loaded in the order specified in this list. - api_def_srcs=[]): - """Generate an eager-mode Python op wrapper for an op library.""" - # Construct a cc_binary containing the specified ops. - tool_name = "gen_" + name + "_py_wrappers_cc" - if not deps: - deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))] - tf_cc_binary( - name=tool_name, - linkopts=["-lm"], - copts=tf_copts(), - linkstatic=1, - deps=([ - clean_dep("//tensorflow/python/eager:python_eager_op_gen_main") - ] + deps), - visibility=[clean_dep("//visibility:public")],) - - # Invoke the previous cc_binary to generate a python file. - if not out: - out = "gen_" + name + ".py" - - if not api_def_srcs: - api_def_args_str = "," - else: - api_def_args = [] - for api_def_src in api_def_srcs: - # Add directory of the first ApiDef source to args. - # We are assuming all ApiDefs in a single api_def_src are in the - # same directory. - api_def_args.append( - "$$(dirname $$(echo $(locations " + api_def_src + - ") | cut -d\" \" -f1))") - api_def_args_str = ",".join(api_def_args) - - native.genrule( - name=name + "_pygenrule", - outs=[out], - srcs=api_def_srcs, - tools=[tool_name] + tf_binary_additional_srcs(), - cmd=("$(location " + tool_name + ") " + api_def_args_str + " > $@")) - - # Make a py_library out of the generated python file. - if not generated_target_name: - generated_target_name = name - native.py_library( - name=generated_target_name, - srcs=[out], - srcs_version="PY2AND3", - visibility=visibility, - deps=[ - clean_dep("//tensorflow/python/eager:framework_for_generated_wrappers"), - ],) diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py index 5c13ea89081..62106bf0e28 100644 --- a/tensorflow/python/eager/graph_callable.py +++ b/tensorflow/python/eager/graph_callable.py @@ -252,21 +252,17 @@ def _graph_callable_internal(func, shape_and_dtypes): Callable graph object. 
""" container = tf_ops.get_default_graph()._container # pylint: disable=protected-access - container_prefix = tf_ops.get_default_graph()._container_prefix # pylint: disable=protected-access + graph_key = tf_ops.get_default_graph()._graph_key # pylint: disable=protected-access with context.graph_mode(): # This graph will store both the initialization and the call version of the # wrapped function. It will later be used by the backprop code to build the # backprop graph, if necessary. captures = {} tmp_graph = function.CapturingGraph(captures) - # Inherit the container from the original graph to create resources at user - # expected containers. Also inherits the container prefix, since this is - # used for error checking when isolating Eager execution (the container - # prefix at creation must match the container prefix when used, and - # variables returned from the graph callable will be used in the outside - # context). + # Inherit the graph key from the original graph to ensure optimizers don't + # misbehave. tmp_graph._container = container # pylint: disable=protected-access - tmp_graph._container_prefix = container_prefix # pylint: disable=protected-access + tmp_graph._graph_key = graph_key # pylint: disable=protected-access with tmp_graph.as_default(): # Placeholders for the non-variable inputs. func_inputs = _get_graph_callable_inputs(shape_and_dtypes) diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc index 90a8779ff84..0f18f28c955 100644 --- a/tensorflow/python/eager/python_eager_op_gen.cc +++ b/tensorflow/python/eager/python_eager_op_gen.cc @@ -756,11 +756,21 @@ from tensorflow.python.util.tf_export import tf_export auto out = cleaned_ops.mutable_op(); out->Reserve(ops.op_size()); for (const auto& op_def : ops.op()) { - bool is_hidden = false; - for (const string& hidden : hidden_ops) { - if (op_def.name() == hidden) { - is_hidden = true; - break; + const auto* api_def = api_defs.GetApiDef(op_def.name()); + + if (api_def->visibility() == ApiDef::SKIP) { + continue; + } + + // An op is hidden if either its ApiDef visibility is HIDDEN + // or it is in the hidden_ops list. + bool is_hidden = api_def->visibility() == ApiDef::HIDDEN; + if (!is_hidden) { + for (const string& hidden : hidden_ops) { + if (op_def.name() == hidden) { + is_hidden = true; + break; + } } } @@ -777,7 +787,6 @@ from tensorflow.python.util.tf_export import tf_export continue; } - const auto* api_def = api_defs.GetApiDef(op_def.name()); strings::StrAppend(&result, GetEagerPythonOp(op_def, *api_def, function_name)); diff --git a/tensorflow/python/eager/python_eager_op_gen_main.cc b/tensorflow/python/eager/python_eager_op_gen_main.cc deleted file mode 100644 index 05351bd8b11..00000000000 --- a/tensorflow/python/eager/python_eager_op_gen_main.cc +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#include "tensorflow/python/eager/python_eager_op_gen.h" - -#include -#include -#include - -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_def.pb.h" -#include "tensorflow/core/framework/op_gen_lib.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/init_main.h" - -namespace tensorflow { -namespace { - -void PrintAllPythonOps(const std::vector& hidden_ops, - const std::vector& api_def_dirs) { - OpList ops; - OpRegistry::Global()->Export(false, &ops); - - ApiDefMap api_def_map(ops); - if (!api_def_dirs.empty()) { - Env* env = Env::Default(); - - for (const auto& api_def_dir : api_def_dirs) { - std::vector api_files; - TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"), - &api_files)); - TF_CHECK_OK(api_def_map.LoadFileList(env, api_files)); - } - api_def_map.UpdateDocs(); - } - - PrintEagerPythonOps(ops, api_def_map, hidden_ops, true /* require_shapes */); -} - -} // namespace -} // namespace tensorflow - -int main(int argc, char* argv[]) { - tensorflow::port::InitMain(argv[0], &argc, &argv); - - // Usage: - // python_eager_op_gen_main api_def_dir1,api_def_dir2,... - if (argc == 1) { - tensorflow::PrintAllPythonOps({}, {}); - } else if (argc == 2) { - const std::vector api_def_dirs = - tensorflow::str_util::Split(argv[1], ",", - tensorflow::str_util::SkipEmpty()); - tensorflow::PrintAllPythonOps({}, api_def_dirs); - } else { - return -1; - } - return 0; -} diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index 647f03351d9..d927f3abedb 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -86,30 +86,6 @@ bool ParseBoolValue(const string& key, PyObject* py_value, TF_Status* status, return true; } -const char* ParseProtoValue(const string& key, const char* proto_name, - PyObject* py_value, size_t* size, - TF_Status* status) { - char* output = nullptr; - Py_ssize_t py_size; - if (PyBytes_Check(py_value) && - PyBytes_AsStringAndSize(py_value, &output, &py_size) >= 0) { - *size = static_cast(py_size); - return output; - } -#if PY_MAJOR_VERSION >= 3 - if (PyUnicode_Check(py_value) && - (output = PyUnicode_AsUTF8AndSize(py_value, &py_size)) != nullptr) { - *size = static_cast(py_size); - return output; - } -#endif - TF_SetStatus(status, TF_INVALID_ARGUMENT, - tensorflow::strings::StrCat("Expecting a string (serialized ", - proto_name, ") value for attr ", key) - .c_str()); - return nullptr; -} - bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list, TF_AttrType type, TF_Status* status) { if (!PySequence_Check(py_list)) { @@ -329,8 +305,9 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index, tensorflow::mutex exception_class_mutex(tensorflow::LINKER_INITIALIZED); PyObject* exception_class GUARDED_BY(exception_class_mutex) = nullptr; -static tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED); -static tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0; +tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED); +tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0; + } // namespace void TFE_Py_Execute(TFE_Context* ctx, const char* device_name, @@ -551,6 +528,34 @@ tensorflow::gtl::CompactPointerSet* GetTapeSet() { return tape_set; } +// A safe copy of the current tapeset. 
Does not get affected by other python +// threads changing the set of active tapes. +class SafeTapeSet { + public: + SafeTapeSet() : tape_set_(*GetTapeSet()) { + for (auto* tape : tape_set_) { + Py_INCREF(tape); + } + } + + ~SafeTapeSet() { + for (auto* tape : tape_set_) { + Py_DECREF(tape); + } + } + + tensorflow::gtl::CompactPointerSet::const_iterator begin() { + return tape_set_.begin(); + } + + tensorflow::gtl::CompactPointerSet::const_iterator end() { + return tape_set_.end(); + } + + private: + tensorflow::gtl::CompactPointerSet tape_set_; +}; + // xcode 7 doesn't define thread_local, so for compatibility we implement our // own. TODO(apassos) remove once we can deprecate xcode 7. #ifndef __APPLE__ @@ -741,10 +746,7 @@ void TFE_Py_TapeSetWatchVariable(PyObject* variable) { if (*ThreadTapeIsStopped()) { return; } - // Note: making a copy because watching a variable can trigger a change to the - // set of tapes by allowing python's garbage collector to run. - auto tape_set = *GetTapeSet(); - for (TFE_Py_Tape* tape : tape_set) { + for (TFE_Py_Tape* tape : SafeTapeSet()) { tape->tape->WatchVariable(variable); } } @@ -800,8 +802,7 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors, return; } - auto set = *GetTapeSet(); - for (TFE_Py_Tape* tape : set) { + for (TFE_Py_Tape* tape : SafeTapeSet()) { Py_INCREF(backward_function); tape->tape->RecordOperation( op_type_str, output_info, input_ids, backward_function, @@ -810,10 +811,7 @@ void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors, } void TFE_Py_TapeSetDeleteTrace(tensorflow::int64 tensor_id) { - // Note: making a copy because deleting the trace can trigger a change to the - // set of tapes by allowing python's garbage collector to run. - auto tape_set = *GetTapeSet(); - for (TFE_Py_Tape* tape : tape_set) { + for (TFE_Py_Tape* tape : SafeTapeSet()) { tape->tape->DeleteTrace(tensor_id); } } diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py index 96e4ecd29fb..138152ac1c6 100644 --- a/tensorflow/python/estimator/canned/baseline.py +++ b/tensorflow/python/estimator/canned/baseline.py @@ -57,6 +57,7 @@ from tensorflow.python.ops import check_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops.losses import losses from tensorflow.python.training import training_util # The default learning rate of 0.3 is a historical artifact of the initial @@ -220,7 +221,8 @@ class BaselineClassifier(estimator.Estimator): weight_column=None, label_vocabulary=None, optimizer='Ftrl', - config=None): + config=None, + loss_reduction=losses.Reduction.SUM): """Initializes a BaselineClassifier instance. Args: @@ -240,6 +242,8 @@ class BaselineClassifier(estimator.Estimator): optimizer to use for training. If not specified, will use `FtrlOptimizer` with a default learning rate of 0.3. config: `RunConfig` object to configure the runtime settings. + loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how + to reduce training loss over batch. Defaults to `SUM`. Returns: A `BaselineClassifier` estimator. 
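A short sketch of the new `loss_reduction` argument, written against the internal module path this file lives under; `SUM` remains the default, and any `tf.losses.Reduction` value other than `NONE` should be accepted (the specific value below is an illustrative choice, not mandated by the patch):

```python
from tensorflow.python.estimator.canned.baseline import BaselineClassifier
from tensorflow.python.ops.losses import losses

# Average the training loss over the batch instead of summing it.
classifier = BaselineClassifier(
    n_classes=3,
    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
```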
@@ -249,11 +253,13 @@ class BaselineClassifier(estimator.Estimator): if n_classes == 2: head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss( # pylint: disable=protected-access weight_column=weight_column, - label_vocabulary=label_vocabulary) + label_vocabulary=label_vocabulary, + loss_reduction=loss_reduction) else: head = head_lib._multi_class_head_with_softmax_cross_entropy_loss( # pylint: disable=protected-access n_classes, weight_column=weight_column, - label_vocabulary=label_vocabulary) + label_vocabulary=label_vocabulary, + loss_reduction=loss_reduction) def _model_fn(features, labels, mode, config): return _baseline_model_fn( features=features, @@ -311,7 +317,8 @@ class BaselineRegressor(estimator.Estimator): label_dimension=1, weight_column=None, optimizer='Ftrl', - config=None): + config=None, + loss_reduction=losses.Reduction.SUM): """Initializes a BaselineRegressor instance. Args: @@ -328,13 +335,16 @@ class BaselineRegressor(estimator.Estimator): optimizer to use for training. If not specified, will use `FtrlOptimizer` with a default learning rate of 0.3. config: `RunConfig` object to configure the runtime settings. + loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how + to reduce training loss over batch. Defaults to `SUM`. Returns: A `BaselineRegressor` estimator. """ head = head_lib._regression_head_with_mean_squared_error_loss( # pylint: disable=protected-access label_dimension=label_dimension, - weight_column=weight_column) + weight_column=weight_column, + loss_reduction=loss_reduction) def _model_fn(features, labels, mode, config): return _baseline_model_fn( features=features, diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index 96555b5e03c..78d74b63d3e 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -478,13 +478,16 @@ class Estimator(object): estimator_spec = self._call_model_fn( features, None, model_fn_lib.ModeKeys.PREDICT, self.config) predictions = self._extract_keys(estimator_spec.predictions, predict_keys) + all_hooks = list(input_hooks) + all_hooks.extend(hooks) + all_hooks.extend(list(estimator_spec.prediction_hooks or [])) with training.MonitoredSession( session_creator=training.ChiefSessionCreator( checkpoint_filename_with_path=checkpoint_path, master=self._config.master, scaffold=estimator_spec.scaffold, config=self._session_config), - hooks=input_hooks + hooks) as mon_sess: + hooks=all_hooks) as mon_sess: while not mon_sess.should_stop(): preds_evaluated = mon_sess.run(predictions) if not isinstance(predictions, dict): diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py index 833f3dcac3b..39a5b998ebd 100644 --- a/tensorflow/python/estimator/estimator_test.py +++ b/tensorflow/python/estimator/estimator_test.py @@ -1355,6 +1355,25 @@ class EstimatorPredictTest(test.TestCase): est.train(dummy_input_fn, steps=1) self.assertEqual(10., next(est.predict(dummy_input_fn))) + def test_predictionhooks_are_used(self): + hook = test.mock.MagicMock( + wraps=training.SessionRunHook(), spec=training.SessionRunHook) + + def _model_fn_hooks(features, labels, mode): + _, _ = features, labels + return model_fn_lib.EstimatorSpec( + mode=mode, + loss=constant_op.constant(0.), + train_op=state_ops.assign_add(training.get_global_step(), 1), + predictions=constant_op.constant([[10.]]), + prediction_hooks=[hook]) + + est = estimator.Estimator(model_fn=_model_fn_hooks) + 
est.train(dummy_input_fn, steps=1) + self.assertFalse(hook.begin.called) + next(est.predict(dummy_input_fn)) + self.assertTrue(hook.begin.called) + def test_warn_if_no_queue_runner(self): def _model_fn(features, labels, mode): diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py index 51075731ddc..83251c79fc5 100644 --- a/tensorflow/python/estimator/export/export.py +++ b/tensorflow/python/estimator/export/export.py @@ -36,12 +36,14 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import signature_def_utils from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export _SINGLE_FEATURE_DEFAULT_NAME = 'feature' _SINGLE_RECEIVER_DEFAULT_NAME = 'input' +@tf_export('estimator.export.ServingInputReceiver') class ServingInputReceiver(collections.namedtuple( 'ServingInputReceiver', ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])): @@ -118,6 +120,7 @@ class ServingInputReceiver(collections.namedtuple( receiver_tensors_alternatives=receiver_tensors_alternatives) +@tf_export('estimator.export.build_parsing_serving_input_receiver_fn') def build_parsing_serving_input_receiver_fn(feature_spec, default_batch_size=None): """Build a serving_input_receiver_fn expecting fed tf.Examples. @@ -146,6 +149,7 @@ def build_parsing_serving_input_receiver_fn(feature_spec, return serving_input_receiver_fn +@tf_export('estimator.export.build_raw_serving_input_receiver_fn') def build_raw_serving_input_receiver_fn(features, default_batch_size=None): """Build a serving_input_receiver_fn expecting feature Tensors. diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py index 863af6d41d9..87b964be371 100644 --- a/tensorflow/python/estimator/export/export_output.py +++ b/tensorflow/python/estimator/export/export_output.py @@ -26,8 +26,10 @@ import six from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.saved_model import signature_def_utils +from tensorflow.python.util.tf_export import tf_export +@tf_export('estimator.export.ExportOutput') class ExportOutput(object): """Represents an output of a model that can be served. @@ -50,6 +52,7 @@ class ExportOutput(object): pass +@tf_export('estimator.export.ClassificationOutput') class ClassificationOutput(ExportOutput): """Represents the output of a classification head. @@ -118,6 +121,7 @@ class ClassificationOutput(ExportOutput): examples, self.classes, self.scores) +@tf_export('estimator.export.RegressionOutput') class RegressionOutput(ExportOutput): """Represents the output of a regression head.""" @@ -153,6 +157,7 @@ class RegressionOutput(ExportOutput): _SINGLE_OUTPUT_DEFAULT_NAME = 'output' +@tf_export('estimator.export.PredictOutput') class PredictOutput(ExportOutput): """Represents the output of a generic prediction head. diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py index c4c2e30e877..a6f47129100 100644 --- a/tensorflow/python/estimator/inputs/numpy_io.py +++ b/tensorflow/python/estimator/inputs/numpy_io.py @@ -24,6 +24,7 @@ import numpy as np from six import string_types from tensorflow.python.estimator.inputs.queues import feeding_functions +from tensorflow.python.util.tf_export import tf_export # Key name to pack the target into dict of `features`. 
See # `_get_unique_target_key` for details. @@ -86,6 +87,7 @@ def _validate_and_convert_features(x): return ordered_dict_data +@tf_export('estimator.inputs.numpy_input_fn') def numpy_input_fn(x, y=None, batch_size=128, diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py index 90d6145377d..bd06843021f 100644 --- a/tensorflow/python/estimator/inputs/pandas_io.py +++ b/tensorflow/python/estimator/inputs/pandas_io.py @@ -21,6 +21,7 @@ from __future__ import print_function import numpy as np from tensorflow.python.estimator.inputs.queues import feeding_functions +from tensorflow.python.util.tf_export import tf_export try: # pylint: disable=g-import-not-at-top @@ -34,6 +35,7 @@ except ImportError: HAS_PANDAS = False +@tf_export('estimator.inputs.pandas_input_fn') def pandas_input_fn(x, y=None, batch_size=128, diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py index da202408c36..b08f83fc569 100644 --- a/tensorflow/python/estimator/model_fn.py +++ b/tensorflow/python/estimator/model_fn.py @@ -56,7 +56,7 @@ class EstimatorSpec( collections.namedtuple('EstimatorSpec', [ 'mode', 'predictions', 'loss', 'train_op', 'eval_metric_ops', 'export_outputs', 'training_chief_hooks', 'training_hooks', 'scaffold', - 'evaluation_hooks' + 'evaluation_hooks', 'prediction_hooks' ])): """Ops and objects returned from a `model_fn` and passed to an `Estimator`. @@ -73,7 +73,8 @@ class EstimatorSpec( training_chief_hooks=None, training_hooks=None, scaffold=None, - evaluation_hooks=None): + evaluation_hooks=None, + prediction_hooks=None): """Creates a validated `EstimatorSpec` instance. Depending on the value of `mode`, different arguments are required. Namely @@ -154,6 +155,8 @@ class EstimatorSpec( initialization, saver, and more to be used in training. evaluation_hooks: Iterable of `tf.train.SessionRunHook` objects to run during evaluation. + prediction_hooks: Iterable of `tf.train.SessionRunHook` objects to + run during predictions. Returns: A validated `EstimatorSpec` object. 
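A minimal sketch of a `model_fn` using the new `prediction_hooks` field; the hook and the constant prediction are toy placeholders, and a real `model_fn` would also populate `loss`/`train_op` for the TRAIN and EVAL modes:

```python
from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.framework import constant_op
from tensorflow.python.training import session_run_hook

class AnnouncePredictionsHook(session_run_hook.SessionRunHook):
    """Toy hook: `begin` fires when `Estimator.predict` builds its session."""
    def begin(self):
        print('starting a prediction session')

def my_model_fn(features, labels, mode):
    del features, labels  # unused in this sketch
    return model_fn_lib.EstimatorSpec(
        mode=mode,
        predictions=constant_op.constant([[10.0]]),
        prediction_hooks=[AnnouncePredictionsHook()])
```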
@@ -282,7 +285,10 @@ class EstimatorSpec( training_chief_hooks = tuple(training_chief_hooks or []) training_hooks = tuple(training_hooks or []) evaluation_hooks = tuple(evaluation_hooks or []) - for hook in training_hooks + training_chief_hooks + evaluation_hooks: + prediction_hooks = tuple(prediction_hooks or []) + + for hook in (training_hooks + training_chief_hooks + evaluation_hooks + + prediction_hooks): if not isinstance(hook, session_run_hook.SessionRunHook): raise TypeError( 'All hooks must be SessionRunHook instances, given: {}'.format( @@ -305,7 +311,8 @@ class EstimatorSpec( training_chief_hooks=training_chief_hooks, training_hooks=training_hooks, scaffold=scaffold, - evaluation_hooks=evaluation_hooks) + evaluation_hooks=evaluation_hooks, + prediction_hooks=prediction_hooks) def _replace(self, **kwds): """Return a new EstimatorSpec replacing specified fields with new values.""" diff --git a/tensorflow/python/estimator/model_fn_test.py b/tensorflow/python/estimator/model_fn_test.py index d67c4b71616..b7eeeb437cb 100644 --- a/tensorflow/python/estimator/model_fn_test.py +++ b/tensorflow/python/estimator/model_fn_test.py @@ -72,7 +72,8 @@ class EstimatorSpecTrainTest(test.TestCase): training_chief_hooks=[_FakeHook()], training_hooks=[_FakeHook()], scaffold=monitored_session.Scaffold(), - evaluation_hooks=[_FakeHook()]) + evaluation_hooks=[_FakeHook()], + prediction_hooks=[_FakeHook()]) def testLossNumber(self): """Tests that error is raised when loss is a number (not Tensor).""" @@ -465,7 +466,17 @@ class EstimatorSpecInferTest(test.TestCase): training_chief_hooks=[_FakeHook()], training_hooks=[_FakeHook()], scaffold=monitored_session.Scaffold(), - evaluation_hooks=[_FakeHook()]) + evaluation_hooks=[_FakeHook()], + prediction_hooks=[_FakeHook()]) + + def testPredictionHookInvalid(self): + with ops.Graph().as_default(), self.test_session(): + with self.assertRaisesRegexp( + TypeError, 'All hooks must be SessionRunHook instances'): + model_fn.EstimatorSpec( + mode=model_fn.ModeKeys.PREDICT, + predictions=constant_op.constant(1.), + prediction_hooks=[_InvalidHook()]) def testPredictionsMissing(self): with ops.Graph().as_default(), self.test_session(): diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py index 52fb1d39ae2..2e84c5014f6 100644 --- a/tensorflow/python/estimator/training.py +++ b/tensorflow/python/estimator/training.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Classes and functions related to train_and_evaluate.""" from __future__ import absolute_import @@ -37,7 +36,6 @@ from tensorflow.python.training import server_lib from tensorflow.python.training import session_run_hook from tensorflow.python.util import compat - _MAX_DELAY_SECS = 60 _DELAY_SECS_PER_WORKER = 5 _TF_CONFIG_ENV = 'TF_CONFIG' @@ -50,8 +48,7 @@ _TRAINER_JOBS = (run_config_lib.TaskType.CHIEF, run_config_lib.TaskType.MASTER, def _validate_input_fn(input_fn): """Validates the `input_fn`.""" if not callable(input_fn): - raise TypeError( - '`input_fn` must be callable, given: {}'.format(input_fn)) + raise TypeError('`input_fn` must be callable, given: {}'.format(input_fn)) def _validate_hooks(hooks): @@ -125,10 +122,7 @@ class TrainSpec( duration. Optional hooks run at various stages of training. 
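For context on the `train_and_evaluate` machinery being reformatted here, a hedged sketch of the spec objects involved; the lambda `input_fn`s are placeholders and the `estimator` in the commented-out call is assumed to exist:

```python
from tensorflow.python.estimator import training

# Placeholder input_fns; real ones would build an input pipeline.
train_spec = training.TrainSpec(
    input_fn=lambda: ({'x': [[1.0]]}, [[0.0]]), max_steps=1000)
eval_spec = training.EvalSpec(
    input_fn=lambda: ({'x': [[1.0]]}, [[0.0]]), throttle_secs=60)
# training.train_and_evaluate(estimator, train_spec, eval_spec)  # `estimator` assumed
```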
""" - def __new__(cls, - input_fn, - max_steps=None, - hooks=None): + def __new__(cls, input_fn, max_steps=None, hooks=None): """Creates a validated `TrainSpec` instance. Args: @@ -161,16 +155,13 @@ class TrainSpec( hooks = _validate_hooks(hooks) return super(TrainSpec, cls).__new__( - cls, - input_fn=input_fn, - max_steps=max_steps, - hooks=hooks) + cls, input_fn=input_fn, max_steps=max_steps, hooks=hooks) class EvalSpec( collections.namedtuple('EvalSpec', [ - 'input_fn', 'steps', 'name', 'hooks', 'exporters', - 'start_delay_secs', 'throttle_secs' + 'input_fn', 'steps', 'name', 'hooks', 'exporters', 'start_delay_secs', + 'throttle_secs' ])): """Configuration for the "eval" part for the `train_and_evaluate` call. @@ -417,8 +408,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec): Raises: ValueError: if environment variable `TF_CONFIG` is incorrectly set. """ - executor = _TrainingExecutor(estimator=estimator, train_spec=train_spec, - eval_spec=eval_spec) + executor = _TrainingExecutor( + estimator=estimator, train_spec=train_spec, eval_spec=eval_spec) config = estimator.config if (config.task_type == run_config_lib.TaskType.EVALUATOR and @@ -561,9 +552,8 @@ class _TrainingExecutor(object): self._timer.update_last_triggered_step(global_step_value) self._evaluator.evaluate_and_export() else: - logging.info( - 'Skip the current checkpoint eval due to throttle secs ' - '({} secs).'.format(self._eval_throttle_secs)) + logging.info('Skip the current checkpoint eval due to throttle secs ' + '({} secs).'.format(self._eval_throttle_secs)) # Final export signal: For any eval result with global_step >= train # max_steps, the evaluator will send the final export signal. There is a @@ -576,8 +566,8 @@ class _TrainingExecutor(object): # # But here, throttle_secs will skip the next intermediate checkpoint and, # so, the double final export chance is very small. - evaluator = _TrainingExecutor._Evaluator( - self._estimator, self._eval_spec, self._train_spec.max_steps) + evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec, + self._train_spec.max_steps) # When the underlying `Estimator` object saves a new checkpoint, we would # like this callback to be called so that evaluation and export can trigger. @@ -617,8 +607,7 @@ class _TrainingExecutor(object): raise ValueError('eval_spec.throttle_secs should be positive, given: {}.' 'It is used do determine how long each training ' 'iteration should go when train and evaluate ' - 'locally.'.format( - self._eval_spec.throttle_secs)) + 'locally.'.format(self._eval_spec.throttle_secs)) stop_hook = _StopAtSecsHook(self._eval_spec.throttle_secs) train_hooks = ( @@ -663,8 +652,9 @@ class _TrainingExecutor(object): if not config.master: jobs = config.cluster_spec.jobs - if (len(jobs) == 1 and len(config.cluster_spec.job_tasks(jobs[0])) == 1 - and config.task_type in _TRAINER_JOBS): + if (len(jobs) == 1 and + len(config.cluster_spec.job_tasks(jobs[0])) == 1 and + config.task_type in _TRAINER_JOBS): # For distributed training, config.master is empty if and only if it has # a single node in the cluster spec. In this case, we should not start # the server. 
@@ -679,9 +669,9 @@ class _TrainingExecutor(object): logging.info('Start Tensorflow server.') if config.session_config is None: - session_config=config_pb2.ConfigProto(log_device_placement=False) + session_config = config_pb2.ConfigProto(log_device_placement=False) else: - session_config=config_pb2.ConfigProto( + session_config = config_pb2.ConfigProto( log_device_placement=False, gpu_options=config.session_config.gpu_options) @@ -744,8 +734,7 @@ class _TrainingExecutor(object): global_step >= self._train_spec.max_steps): logging.info( 'Exiting evaluation, global_step=%s >= train max_steps=%s', - global_step, - self._train_spec.max_steps) + global_step, self._train_spec.max_steps) return latest_eval_result, should_early_stop = self._execute_evaluator_once( @@ -781,10 +770,9 @@ class _TrainingExecutor(object): # Throttle if necessary. elapsed_time = time.time() - start - difference = throttle_secs - elapsed_time + difference = throttle_secs - elapsed_time if difference > 0: - logging.info('Waiting %f secs before starting next eval run.', - difference) + logging.info('Waiting %f secs before starting next eval run.', difference) time.sleep(difference) return (eval_result, should_early_stop) @@ -929,8 +917,8 @@ class _EvalResult( if checkpoint_path: raise ValueError( 'checkpoint must be `None` if status is not {}; got status {}, ' - 'checkpoint_path {}'.format( - _EvalStatus.EVALUATED, status, checkpoint_path)) + 'checkpoint_path {}'.format(_EvalStatus.EVALUATED, status, + checkpoint_path)) return super(_EvalResult, cls).__new__(cls, status, metrics, checkpoint_path) diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py index ad95c71234f..48110ef57fc 100644 --- a/tensorflow/python/estimator/warm_starting_util.py +++ b/tensorflow/python/estimator/warm_starting_util.py @@ -415,8 +415,8 @@ def _warm_start(warm_start_settings): a stronger check for variable configuration than relying on users to examine the logs. """ - logging.info("Warm-starting from: ", - warm_start_settings.ckpt_to_initialize_from) + logging.info("Warm-starting from: %s", + (warm_start_settings.ckpt_to_initialize_from,)) # We have to deal with partitioned variables, since get_collection flattens # out the list. grouped_variables = {} diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 7feb209cc49..5947d8f6e23 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -157,6 +157,7 @@ from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpoint_utils from tensorflow.python.util import nest +from tensorflow.python.util.tf_export import tf_export def _internal_input_layer(features, @@ -209,6 +210,7 @@ def _internal_input_layer(features, return array_ops.concat(output_tensors, 1) +@tf_export('feature_column.input_layer') def input_layer(features, feature_columns, weight_collections=None, @@ -329,6 +331,7 @@ class InputLayer(object): return self._input_layer_template.weights +@tf_export('feature_column.linear_model') def linear_model(features, feature_columns, units=1, @@ -498,6 +501,7 @@ def _transform_features(features, feature_columns): return outputs +@tf_export('feature_column.make_parse_example_spec') def make_parse_example_spec(feature_columns): """Creates parsing spec dictionary from input feature_columns. 
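The `tf_export` decorators added in this hunk and the next expose the feature-column constructors as a public API. A small sketch of how a few of them compose, written against the internal module path:

```python
from tensorflow.python.feature_column import feature_column as fc

price = fc.numeric_column('price')
color = fc.categorical_column_with_vocabulary_list(
    'color', vocabulary_list=['red', 'green', 'blue'])

# Parsing spec for tf.parse_example: 'price' becomes a FixedLenFeature and
# 'color' a VarLenFeature.
spec = fc.make_parse_example_spec([price, fc.indicator_column(color)])
```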
@@ -557,6 +561,7 @@ def make_parse_example_spec(feature_columns): return result +@tf_export('feature_column.embedding_column') def embedding_column( categorical_column, dimension, combiner='mean', initializer=None, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, @@ -807,6 +812,7 @@ def shared_embedding_columns( return result +@tf_export('feature_column.numeric_column') def numeric_column(key, shape=(1,), default_value=None, @@ -881,6 +887,7 @@ def numeric_column(key, normalizer_fn=normalizer_fn) +@tf_export('feature_column.bucketized_column') def bucketized_column(source_column, boundaries): """Represents discretized dense input. @@ -970,6 +977,7 @@ def _assert_string_or_int(dtype, prefix): '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype)) +@tf_export('feature_column.categorical_column_with_hash_bucket') def categorical_column_with_hash_bucket(key, hash_bucket_size, dtype=dtypes.string): @@ -1026,6 +1034,7 @@ def categorical_column_with_hash_bucket(key, return _HashedCategoricalColumn(key, hash_bucket_size, dtype) +@tf_export('feature_column.categorical_column_with_vocabulary_file') def categorical_column_with_vocabulary_file(key, vocabulary_file, vocabulary_size=None, @@ -1145,6 +1154,7 @@ def categorical_column_with_vocabulary_file(key, dtype=dtype) +@tf_export('feature_column.categorical_column_with_vocabulary_list') def categorical_column_with_vocabulary_list( key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0): """A `_CategoricalColumn` with in-memory vocabulary. @@ -1255,6 +1265,7 @@ def categorical_column_with_vocabulary_list( default_value=default_value, num_oov_buckets=num_oov_buckets) +@tf_export('feature_column.categorical_column_with_identity') def categorical_column_with_identity(key, num_buckets, default_value=None): """A `_CategoricalColumn` that returns identity values. @@ -1322,6 +1333,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None): key=key, num_buckets=num_buckets, default_value=default_value) +@tf_export('feature_column.indicator_column') def indicator_column(categorical_column): """Represents multi-hot representation of given categorical column. @@ -1350,6 +1362,7 @@ def indicator_column(categorical_column): return _IndicatorColumn(categorical_column) +@tf_export('feature_column.weighted_categorical_column') def weighted_categorical_column( categorical_column, weight_feature_key, dtype=dtypes.float32): """Applies weight values to a `_CategoricalColumn`. @@ -1424,6 +1437,7 @@ def weighted_categorical_column( dtype=dtype) +@tf_export('feature_column.crossed_column') def crossed_column(keys, hash_bucket_size, hash_key=None): """Returns a column for performing crosses of categorical features. diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py index 67ccf990d6a..c8251144836 100644 --- a/tensorflow/python/framework/dtypes.py +++ b/tensorflow/python/framework/dtypes.py @@ -12,20 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """Library of dtypes (Tensor element types).""" from __future__ import absolute_import from __future__ import division from __future__ import print_function - import numpy as np from tensorflow.core.framework import types_pb2 from tensorflow.python import pywrap_tensorflow from tensorflow.python.util.tf_export import tf_export - _np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type() @@ -83,8 +80,8 @@ class DType(object): # TODO(mrry): Make the necessary changes (using __new__) to ensure # that calling this returns one of the interned values. type_enum = int(type_enum) - if (type_enum not in types_pb2.DataType.values() - or type_enum == types_pb2.DT_INVALID): + if (type_enum not in types_pb2.DataType.values() or + type_enum == types_pb2.DT_INVALID): raise TypeError( "type_enum is not a valid types_pb2.DataType: %s" % type_enum) self._type_enum = type_enum @@ -123,10 +120,10 @@ class DType(object): @property def is_numpy_compatible(self): - numpy_incompatible = [types_pb2.DT_VARIANT, - types_pb2.DT_VARIANT_REF, - types_pb2.DT_RESOURCE, - types_pb2.DT_RESOURCE_REF] + numpy_incompatible = [ + types_pb2.DT_VARIANT, types_pb2.DT_VARIANT_REF, types_pb2.DT_RESOURCE, + types_pb2.DT_RESOURCE_REF + ] return self._type_enum not in numpy_incompatible @property @@ -153,9 +150,9 @@ class DType(object): @property def is_floating(self): """Returns whether this is a (non-quantized, real) floating point type.""" - return ((self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype, - np.floating)) - or self.base_dtype == bfloat16) + return ((self.is_numpy_compatible and + np.issubdtype(self.as_numpy_dtype, np.floating)) or + self.base_dtype == bfloat16) @property def is_complex(self): @@ -190,8 +187,8 @@ class DType(object): TypeError: if this is a non-numeric, unordered, or quantized type. """ - if (self.is_quantized or self.base_dtype in - (bool, string, complex64, complex128)): + if (self.is_quantized or + self.base_dtype in (bool, string, complex64, complex128)): raise TypeError("Cannot find minimum value of %s." % self) # there is no simple way to get the min value of a dtype, we have to check @@ -214,8 +211,8 @@ class DType(object): TypeError: if this is a non-numeric, unordered, or quantized type. """ - if (self.is_quantized or self.base_dtype in - (bool, string, complex64, complex128)): + if (self.is_quantized or + self.base_dtype in (bool, string, complex64, complex128)): raise TypeError("Cannot find maximum value of %s." % self) # there is no simple way to get the max value of a dtype, we have to check @@ -266,8 +263,8 @@ class DType(object): this `DType`. 
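The `dtypes.py` edits here are formatting-only; the behaviour of the `DType` helpers is unchanged. A few illustrative checks (the numeric limits follow from the numpy dtypes these properties delegate to):

```python
from tensorflow.python.framework import dtypes

assert dtypes.float32.is_floating
assert dtypes.bfloat16.is_floating                # special-cased in is_floating
assert dtypes.int32.min == -2**31
assert dtypes.int32.max == 2**31 - 1
assert dtypes.float64.is_compatible_with(dtypes.float64_ref)
```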
""" other = as_dtype(other) - return self._type_enum in ( - other.as_datatype_enum, other.base_dtype.as_datatype_enum) + return self._type_enum in (other.as_datatype_enum, + other.base_dtype.as_datatype_enum) def __eq__(self, other): """Returns True iff this DType refers to the same type as `other`.""" @@ -307,19 +304,22 @@ class DType(object): return 1 return np.dtype(self.as_numpy_dtype).itemsize + # Define data type range of numpy dtype -dtype_range = {np.bool_: (False, True), - np.bool8: (False, True), - np.uint8: (0, 255), - np.uint16: (0, 65535), - np.int8: (-128, 127), - np.int16: (-32768, 32767), - np.int64: (-2**63, 2**63 - 1), - np.uint64: (0, 2**64 - 1), - np.int32: (-2**31, 2**31 - 1), - np.uint32: (0, 2**32 - 1), - np.float32: (-1, 1), - np.float64: (-1, 1)} +dtype_range = { + np.bool_: (False, True), + np.bool8: (False, True), + np.uint8: (0, 255), + np.uint16: (0, 65535), + np.int8: (-128, 127), + np.int16: (-32768, 32767), + np.int64: (-2**63, 2**63 - 1), + np.uint64: (0, 2**64 - 1), + np.int32: (-2**31, 2**31 - 1), + np.uint32: (0, 2**32 - 1), + np.float32: (-1, 1), + np.float64: (-1, 1) +} # Define standard wrappers for the types_pb2.DataType enum. resource = DType(types_pb2.DT_RESOURCE) @@ -396,7 +396,6 @@ quint16_ref = DType(types_pb2.DT_QUINT16_REF) qint32_ref = DType(types_pb2.DT_QINT32_REF) bfloat16_ref = DType(types_pb2.DT_BFLOAT16_REF) - # Maintain an intern table so that we don't have to create a large # number of small objects. _INTERN_TABLE = { @@ -448,7 +447,6 @@ _INTERN_TABLE = { types_pb2.DT_VARIANT_REF: variant_ref, } - # Standard mappings between types_pb2.DataType values and string names. _TYPE_TO_STRING = { types_pb2.DT_HALF: "float16", @@ -498,8 +496,10 @@ _TYPE_TO_STRING = { types_pb2.DT_RESOURCE_REF: "resource_ref", types_pb2.DT_VARIANT_REF: "variant_ref", } -_STRING_TO_TF = {value: _INTERN_TABLE[key] - for key, value in _TYPE_TO_STRING.items()} +_STRING_TO_TF = { + value: _INTERN_TABLE[key] + for key, value in _TYPE_TO_STRING.items() +} # Add non-canonical aliases. _STRING_TO_TF["half"] = float16 _STRING_TO_TF["half_ref"] = float16_ref @@ -508,7 +508,6 @@ _STRING_TO_TF["float_ref"] = float32_ref _STRING_TO_TF["double"] = float64 _STRING_TO_TF["double_ref"] = float64_ref - # Numpy representation for quantized dtypes. # # These are magic strings that are used in the swig wrapper to identify @@ -551,58 +550,100 @@ _NP_TO_TF = frozenset([ (_np_bfloat16, bfloat16), ]) _TF_TO_NP = { - types_pb2.DT_HALF: np.float16, - types_pb2.DT_FLOAT: np.float32, - types_pb2.DT_DOUBLE: np.float64, - types_pb2.DT_INT32: np.int32, - types_pb2.DT_UINT8: np.uint8, - types_pb2.DT_UINT16: np.uint16, - types_pb2.DT_UINT32: np.uint32, - types_pb2.DT_UINT64: np.uint64, - types_pb2.DT_INT16: np.int16, - types_pb2.DT_INT8: np.int8, + types_pb2.DT_HALF: + np.float16, + types_pb2.DT_FLOAT: + np.float32, + types_pb2.DT_DOUBLE: + np.float64, + types_pb2.DT_INT32: + np.int32, + types_pb2.DT_UINT8: + np.uint8, + types_pb2.DT_UINT16: + np.uint16, + types_pb2.DT_UINT32: + np.uint32, + types_pb2.DT_UINT64: + np.uint64, + types_pb2.DT_INT16: + np.int16, + types_pb2.DT_INT8: + np.int8, # NOTE(touts): For strings we use np.object as it supports variable length # strings. 
- types_pb2.DT_STRING: np.object, - types_pb2.DT_COMPLEX64: np.complex64, - types_pb2.DT_COMPLEX128: np.complex128, - types_pb2.DT_INT64: np.int64, - types_pb2.DT_BOOL: np.bool, - types_pb2.DT_QINT8: _np_qint8, - types_pb2.DT_QUINT8: _np_quint8, - types_pb2.DT_QINT16: _np_qint16, - types_pb2.DT_QUINT16: _np_quint16, - types_pb2.DT_QINT32: _np_qint32, - types_pb2.DT_BFLOAT16: _np_bfloat16, + types_pb2.DT_STRING: + np.object, + types_pb2.DT_COMPLEX64: + np.complex64, + types_pb2.DT_COMPLEX128: + np.complex128, + types_pb2.DT_INT64: + np.int64, + types_pb2.DT_BOOL: + np.bool, + types_pb2.DT_QINT8: + _np_qint8, + types_pb2.DT_QUINT8: + _np_quint8, + types_pb2.DT_QINT16: + _np_qint16, + types_pb2.DT_QUINT16: + _np_quint16, + types_pb2.DT_QINT32: + _np_qint32, + types_pb2.DT_BFLOAT16: + _np_bfloat16, # Ref types - types_pb2.DT_HALF_REF: np.float16, - types_pb2.DT_FLOAT_REF: np.float32, - types_pb2.DT_DOUBLE_REF: np.float64, - types_pb2.DT_INT32_REF: np.int32, - types_pb2.DT_UINT32_REF: np.uint32, - types_pb2.DT_UINT8_REF: np.uint8, - types_pb2.DT_UINT16_REF: np.uint16, - types_pb2.DT_INT16_REF: np.int16, - types_pb2.DT_INT8_REF: np.int8, - types_pb2.DT_STRING_REF: np.object, - types_pb2.DT_COMPLEX64_REF: np.complex64, - types_pb2.DT_COMPLEX128_REF: np.complex128, - types_pb2.DT_INT64_REF: np.int64, - types_pb2.DT_UINT64_REF: np.uint64, - types_pb2.DT_BOOL_REF: np.bool, - types_pb2.DT_QINT8_REF: _np_qint8, - types_pb2.DT_QUINT8_REF: _np_quint8, - types_pb2.DT_QINT16_REF: _np_qint16, - types_pb2.DT_QUINT16_REF: _np_quint16, - types_pb2.DT_QINT32_REF: _np_qint32, - types_pb2.DT_BFLOAT16_REF: _np_bfloat16, + types_pb2.DT_HALF_REF: + np.float16, + types_pb2.DT_FLOAT_REF: + np.float32, + types_pb2.DT_DOUBLE_REF: + np.float64, + types_pb2.DT_INT32_REF: + np.int32, + types_pb2.DT_UINT32_REF: + np.uint32, + types_pb2.DT_UINT8_REF: + np.uint8, + types_pb2.DT_UINT16_REF: + np.uint16, + types_pb2.DT_INT16_REF: + np.int16, + types_pb2.DT_INT8_REF: + np.int8, + types_pb2.DT_STRING_REF: + np.object, + types_pb2.DT_COMPLEX64_REF: + np.complex64, + types_pb2.DT_COMPLEX128_REF: + np.complex128, + types_pb2.DT_INT64_REF: + np.int64, + types_pb2.DT_UINT64_REF: + np.uint64, + types_pb2.DT_BOOL_REF: + np.bool, + types_pb2.DT_QINT8_REF: + _np_qint8, + types_pb2.DT_QUINT8_REF: + _np_quint8, + types_pb2.DT_QINT16_REF: + _np_qint16, + types_pb2.DT_QUINT16_REF: + _np_quint16, + types_pb2.DT_QINT32_REF: + _np_qint32, + types_pb2.DT_BFLOAT16_REF: + _np_bfloat16, } - -QUANTIZED_DTYPES = frozenset( - [qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref, - quint16_ref, qint32_ref]) +QUANTIZED_DTYPES = frozenset([ + qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref, + quint16_ref, qint32_ref +]) tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES") @@ -613,7 +654,8 @@ def as_dtype(type_value): Args: type_value: A value that can be converted to a `tf.DType` object. This may currently be a `tf.DType` object, a - [`DataType` enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto), + [`DataType` + enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto), a string type name, or a `numpy.dtype`. Returns: @@ -650,5 +692,4 @@ def as_dtype(type_value): except TypeError as e: raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e)) - raise TypeError( - "Cannot convert value %r to a TensorFlow DType." % type_value) + raise TypeError("Cannot convert value %r to a TensorFlow DType." 
% type_value) diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py index a4ca3f9a89b..b35cee01112 100644 --- a/tensorflow/python/framework/function_test.py +++ b/tensorflow/python/framework/function_test.py @@ -19,8 +19,8 @@ from __future__ import division from __future__ import print_function import re -import time import sys +import time import numpy as np @@ -86,6 +86,21 @@ class FunctionTest(test.TestCase): with session.Session() as sess: self.assertAllEqual([18.0], sess.run(call)) + def testIdentityImplicitDeref(self): + + @function.Defun(dtypes.float32, func_name="MyIdentity") + def MyIdentityFunc(a): + return a + + with ops.Graph().as_default(): + var = variables.Variable([18.0]) + call = MyIdentityFunc(var._ref()) # pylint: disable=protected-access + self.assertEqual("MyIdentity", call.op.name) + for cfg in _OptimizerOptions(): + with session.Session(config=cfg) as sess: + sess.run(var.initializer) + self.assertAllEqual([18.0], sess.run(call)) + def testIdentityOutputName(self): @function.Defun( @@ -771,7 +786,7 @@ class FunctionTest(test.TestCase): # We added more randomness to function names in C API. # TODO(iga): Remove this if statement when we switch to C API. if ops._USE_C_API: # pylint: disable=protected-access - if sys.byteorder == 'big': + if sys.byteorder == "big": self.assertEqual("Foo_kEdkAG8SJvg", Foo.instantiate([dtypes.float32] * 3).name) else: diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py index 00fff8d040d..c26644362c8 100644 --- a/tensorflow/python/framework/importer.py +++ b/tensorflow/python/framework/importer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """A utility function for importing TensorFlow graphs.""" from __future__ import absolute_import from __future__ import division @@ -43,8 +42,8 @@ from tensorflow.python.util.tf_export import tf_export # the logic here. def _GetNodeAttr(node_def, attr_name): if attr_name not in node_def.attr: - raise ValueError('Expected one attr with name %r in %s.' - % (attr_name, str(node_def))) + raise ValueError('Expected one attr with name %r in %s.' 
% (attr_name, + str(node_def))) return node_def.attr[attr_name] @@ -170,9 +169,8 @@ def _ProcessInputMapParam(input_map): if input_map is None: input_map = {} else: - if not (isinstance(input_map, dict) - and all(isinstance(k, compat.bytes_or_text_types) - for k in input_map.keys())): + if not (isinstance(input_map, dict) and all( + isinstance(k, compat.bytes_or_text_types) for k in input_map.keys())): raise TypeError('input_map must be a dictionary mapping strings to ' 'Tensor objects.') return input_map @@ -180,9 +178,10 @@ def _ProcessInputMapParam(input_map): def _ProcessReturnElementsParam(return_elements): """Type-checks and possibly canonicalizes `return_elements`.""" - if return_elements is None: return None - if not all(isinstance(x, compat.bytes_or_text_types) - for x in return_elements): + if return_elements is None: + return None + if not all( + isinstance(x, compat.bytes_or_text_types) for x in return_elements): raise TypeError('return_elements must be a list of strings.') return tuple(compat.as_str(x) for x in return_elements) @@ -262,14 +261,14 @@ def _PopulateTFImportGraphDefOptions(options, prefix, input_map, if input_src.startswith('^'): src_name = compat.as_bytes(input_src[1:]) dst_op = input_dst._as_tf_output().oper # pylint: disable=protected-access - c_api.TF_ImportGraphDefOptionsRemapControlDependency(options, src_name, - dst_op) + c_api.TF_ImportGraphDefOptionsRemapControlDependency( + options, src_name, dst_op) else: src_name, src_idx = _ParseTensorName(input_src) src_name = compat.as_str(src_name) dst_output = input_dst._as_tf_output() # pylint: disable=protected-access - c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name, - src_idx, dst_output) + c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name, src_idx, + dst_output) for name in return_elements or []: if ':' in name: op_name, index = _ParseTensorName(name) @@ -315,8 +314,8 @@ def _ProcessNewOps(graph): coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name) # pylint: disable=protected-access except KeyError: raise ValueError('Specified colocation to an op that ' - 'does not exist during import: %s in %s' % ( - coloc_op_name, op.name)) + 'does not exist during import: %s in %s' % + (coloc_op_name, op.name)) if coloc_op.device: coloc_device = pydev.DeviceSpec.from_string(coloc_op.device) break @@ -373,10 +372,13 @@ def _GatherReturnElements(requested_return_elements, graph, results): @tf_export('import_graph_def') @deprecated_args(None, 'Please file an issue at ' 'https://github.com/tensorflow/tensorflow/issues if you depend' - ' on this feature.', - 'op_dict') -def import_graph_def(graph_def, input_map=None, return_elements=None, - name=None, op_dict=None, producer_op_list=None): + ' on this feature.', 'op_dict') +def import_graph_def(graph_def, + input_map=None, + return_elements=None, + name=None, + op_dict=None, + producer_op_list=None): """Imports the graph from `graph_def` into the current default `Graph`. 
This function provides a way to import a serialized TensorFlow @@ -480,11 +482,12 @@ def import_graph_def(graph_def, input_map=None, return_elements=None, c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper( results)) if missing_unused_input_keys: - missing_unused_input_keys = [compat.as_str(s) - for s in missing_unused_input_keys] + missing_unused_input_keys = [ + compat.as_str(s) for s in missing_unused_input_keys + ] raise ValueError( - 'Attempted to map inputs that were not found in graph_def: [%s]' - % ', '.join(missing_unused_input_keys)) + 'Attempted to map inputs that were not found in graph_def: [%s]' % + ', '.join(missing_unused_input_keys)) if return_elements is None: return None diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index b107670275c..ea589cc4d40 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -2103,6 +2103,10 @@ class Operation(object): logging.warning("Operation._control_inputs is private, use " "Operation.control_inputs instead. " "Operation._control_inputs will eventually be removed.") + # Copy value because it may be self._control_inputs_val (in particular if + # this is called from self._control_inputs += ...), and we don't want to + # clear value below. + value = copy.copy(value) self._remove_all_control_inputs() self._add_control_inputs(value) @@ -2756,15 +2760,12 @@ class Graph(object): self._handle_movers = {} # A map from tensor handle to its delete op. self._handle_deleters = {} - # Resource container. - if context.in_graph_mode(): - self._container_prefix = "" - else: - # In Eager mode, isolate resources (particularly ResourceVariables) in - # Graphs by default. This prevents unintended variable sharing. Graph mode - # gets this kind of isolation from Sessions. - self._container_prefix = "eager-execution-%d/" % (uid(),) - self._container = self._container_prefix + # Allow optimizers and other objects to pseudo-uniquely key graphs (this key + # will be shared when defining function graphs, for example, so optimizers + # being called inside function definitions behave as if they were seeing the + # actual outside graph). + self._graph_key = "grap-key-%d/" % (uid(),) + self._container = "" self._registered_ops = op_def_registry.get_registered_ops() # TODO(skyewm): fold as much of the above as possible into the C @@ -4225,7 +4226,7 @@ class Graph(object): """ original_container = self._container try: - self._container = self._container_prefix + container_name + self._container = container_name yield self._container finally: self._container = original_container @@ -5004,9 +5005,22 @@ def init_scope(): """ # pylint: enable=g-doc-return-or-yield,line-too-long + in_graph_mode = context.in_graph_mode() + # Retrieve the active name scope: entering an `init_scope` preserves + # the name scope of the current context. + if in_graph_mode: + default_graph = get_default_graph() + scope = default_graph.get_name_scope() + else: + scope = context.context().scope_name + if scope and scope[-1] != '/': + # Names that end with trailing slashes are treated by `name_scope` as + # absolute. 
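The new scope handling above is what the added `ops_test.py` cases below verify: entering `init_scope` lifts construction out of a function-building graph without discarding the active name scope. A condensed version of that check:

```python
from tensorflow.python.framework import ops

with ops.Graph().as_default():
    function_graph = ops.Graph()
    with function_graph.as_default():
        with ops.name_scope('inner'), ops.init_scope():
            # Previously the name scope was reset here; now it is preserved.
            assert ops.get_name_scope() == 'inner'
```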
+ scope = scope + '/' + outer_context = None - if context.in_graph_mode() and not _default_graph_stack.stack: - outer_context = get_default_graph().as_default + if in_graph_mode and not _default_graph_stack.stack: + outer_context = default_graph.as_default else: for stack_entry in reversed(context.context_stack.stack): if not stack_entry.is_building_function: @@ -5018,7 +5032,8 @@ def init_scope(): "eager context was previously active.") try: - with outer_context(), control_dependencies(None), tape.stop_recording(): + with outer_context(), name_scope(scope), control_dependencies( + None), tape.stop_recording(): yield finally: pass diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 78519f108ba..c5e177d5210 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -2072,10 +2072,34 @@ class InitScopeTest(test_util.TensorFlowTestCase): # pylint: disable=protected-access self.assertEqual(len(ops._default_graph_stack.stack), 0) with ops.init_scope(): - self.assertEqual(len(ops._default_graph_stack.stack), 1) + self.assertGreater(len(ops._default_graph_stack.stack), 0) self.assertEqual(len(ops._default_graph_stack.stack), 0) # pylint: enable=protected-access + def testPreservesNameScopeInGraphConstruction(self): + with ops.Graph().as_default(): + function_graph = ops.Graph() + with function_graph.as_default(): + with ops.name_scope("inner"), ops.init_scope(): + self.assertEqual(ops.get_name_scope(), "inner") + self.assertEqual(ops.get_name_scope(), "") + + def testPreservesNameScopeInEagerExecution(self): + with context.eager_mode(): + def foo(): + with ops.name_scope("inner"), ops.init_scope(): + if context.in_graph_mode(): + self.assertEqual(ops.get_name_scope(), "inner") + else: + # A trailing slash is always appended when eager execution is + # enabled. + self.assertEqual(context.context().scope_name, "inner/") + foo() + self.assertEqual(ops.get_name_scope(), "") + foo_compiled = eager_function.defun(foo) + foo_compiled() + self.assertEqual(ops.get_name_scope(), "") + @test_util.with_c_api class GraphTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 65810fa7094..85cba59be4d 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -476,9 +476,6 @@ GenPythonOp::GenPythonOp(const OpDef& op_def, const ApiDef& api_def, GenPythonOp::~GenPythonOp() {} string GenPythonOp::Code() { - if (api_def_.visibility() == ApiDef::SKIP) { - return ""; - } // This has all the input args followed by those attrs that don't have // defaults. std::vector params_no_default; @@ -805,11 +802,21 @@ from tensorflow.python.util.tf_export import tf_export auto out = cleaned_ops.mutable_op(); out->Reserve(ops.op_size()); for (const auto& op_def : ops.op()) { - bool is_hidden = false; - for (const string& hidden : hidden_ops) { - if (op_def.name() == hidden) { - is_hidden = true; - break; + const auto* api_def = api_defs.GetApiDef(op_def.name()); + + if (api_def->visibility() == ApiDef::SKIP) { + continue; + } + + // An op is hidden if either its ApiDef visibility is HIDDEN + // or it is in the hidden_ops list. 
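The same visibility rule is applied in both op generators (the eager one earlier in this patch and the graph one here). A Python paraphrase of the C++ decision, purely for illustration:

```python
def wrapper_disposition(op_name, api_def_visibility, hidden_ops):
    """Returns (generate, hidden) for an op.

    SKIP ops get no Python wrapper at all; HIDDEN ops, and ops listed in
    hidden_ops, get a leading-underscore wrapper.
    """
    if api_def_visibility == 'SKIP':
        return False, False
    hidden = api_def_visibility == 'HIDDEN' or op_name in hidden_ops
    return True, hidden
```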
+ bool is_hidden = api_def->visibility() == ApiDef::HIDDEN; + if (!is_hidden) { + for (const string& hidden : hidden_ops) { + if (op_def.name() == hidden) { + is_hidden = true; + break; + } } } @@ -826,7 +833,6 @@ from tensorflow.python.util.tf_export import tf_export continue; } - const auto* api_def = api_defs.GetApiDef(op_def.name()); strings::StrAppend(&result, GetPythonOp(op_def, *api_def, function_name)); if (!require_shapes) { diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index d2b8e803057..0e5f696111a 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Utilities to create TensorProtos.""" from __future__ import absolute_import from __future__ import division @@ -39,6 +38,7 @@ except ImportError: from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.util.tf_export import tf_export + # pylint: enable=g-import-not-at-top @@ -47,8 +47,8 @@ def ExtractBitsFromFloat16(x): def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values): - tensor_proto.half_val.extend([ - ExtractBitsFromFloat16(x) for x in proto_values]) + tensor_proto.half_val.extend( + [ExtractBitsFromFloat16(x) for x in proto_values]) def ExtractBitsFromBFloat16(x): @@ -57,31 +57,47 @@ def ExtractBitsFromBFloat16(x): def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values): - tensor_proto.half_val.extend([ - ExtractBitsFromBFloat16(x) for x in proto_values]) + tensor_proto.half_val.extend( + [ExtractBitsFromBFloat16(x) for x in proto_values]) if _FAST_TENSOR_UTIL_AVAILABLE: _NP_TO_APPEND_FN = { - dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto, + dtypes.bfloat16.as_numpy_dtype: + SlowAppendBFloat16ArrayToTensorProto, # TODO(sesse): We should have a # fast_tensor_util.AppendFloat16ArrayToTensorProto, # but it seems np.float16_t doesn't exist? 
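A numpy illustration (not the library code itself) of why half-precision values go through a bit-extraction step: `TensorProto.half_val` carries the raw uint16 bit pattern of each float16 value.

```python
import numpy as np

values = np.array([1.5, -2.0], dtype=np.float16)
bits = values.view(np.uint16)   # e.g. array([15872, 49152], dtype=uint16)
assert np.array_equal(bits.view(np.float16), values)
```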
- np.float16: SlowAppendFloat16ArrayToTensorProto, - np.float32: fast_tensor_util.AppendFloat32ArrayToTensorProto, - np.float64: fast_tensor_util.AppendFloat64ArrayToTensorProto, - np.int32: fast_tensor_util.AppendInt32ArrayToTensorProto, - np.int64: fast_tensor_util.AppendInt64ArrayToTensorProto, - np.uint8: fast_tensor_util.AppendUInt8ArrayToTensorProto, - np.uint16: fast_tensor_util.AppendUInt16ArrayToTensorProto, - np.uint32: fast_tensor_util.AppendUInt32ArrayToTensorProto, - np.uint64: fast_tensor_util.AppendUInt64ArrayToTensorProto, - np.int8: fast_tensor_util.AppendInt8ArrayToTensorProto, - np.int16: fast_tensor_util.AppendInt16ArrayToTensorProto, - np.complex64: fast_tensor_util.AppendComplex64ArrayToTensorProto, - np.complex128: fast_tensor_util.AppendComplex128ArrayToTensorProto, - np.object: fast_tensor_util.AppendObjectArrayToTensorProto, - np.bool: fast_tensor_util.AppendBoolArrayToTensorProto, + np.float16: + SlowAppendFloat16ArrayToTensorProto, + np.float32: + fast_tensor_util.AppendFloat32ArrayToTensorProto, + np.float64: + fast_tensor_util.AppendFloat64ArrayToTensorProto, + np.int32: + fast_tensor_util.AppendInt32ArrayToTensorProto, + np.int64: + fast_tensor_util.AppendInt64ArrayToTensorProto, + np.uint8: + fast_tensor_util.AppendUInt8ArrayToTensorProto, + np.uint16: + fast_tensor_util.AppendUInt16ArrayToTensorProto, + np.uint32: + fast_tensor_util.AppendUInt32ArrayToTensorProto, + np.uint64: + fast_tensor_util.AppendUInt64ArrayToTensorProto, + np.int8: + fast_tensor_util.AppendInt8ArrayToTensorProto, + np.int16: + fast_tensor_util.AppendInt16ArrayToTensorProto, + np.complex64: + fast_tensor_util.AppendComplex64ArrayToTensorProto, + np.complex128: + fast_tensor_util.AppendComplex128ArrayToTensorProto, + np.object: + fast_tensor_util.AppendObjectArrayToTensorProto, + np.bool: + fast_tensor_util.AppendBoolArrayToTensorProto, dtypes.qint8.as_numpy_dtype: fast_tensor_util.AppendInt8ArrayToTensorProto, dtypes.quint8.as_numpy_dtype: @@ -118,14 +134,12 @@ else: tensor_proto.uint64_val.extend([np.asscalar(x) for x in proto_values]) def SlowAppendComplex64ArrayToTensorProto(tensor_proto, proto_values): - tensor_proto.scomplex_val.extend([np.asscalar(v) - for x in proto_values - for v in [x.real, x.imag]]) + tensor_proto.scomplex_val.extend( + [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]]) def SlowAppendComplex128ArrayToTensorProto(tensor_proto, proto_values): - tensor_proto.dcomplex_val.extend([np.asscalar(v) - for x in proto_values - for v in [x.real, x.imag]]) + tensor_proto.dcomplex_val.extend( + [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]]) def SlowAppendObjectArrayToTensorProto(tensor_proto, proto_values): tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values]) @@ -252,15 +266,16 @@ def _FilterTuple(v): return None if isinstance(v, list): if not any(isinstance(x, (list, tuple)) for x in v): - return _FirstNotNone([None if isinstance(x, (list, tuple)) else x for x in v]) + return _FirstNotNone( + [None if isinstance(x, (list, tuple)) else x for x in v]) return _FirstNotNone([_FilterTuple(x) for x in v]) def _FilterInt(v): if isinstance(v, (list, tuple)): return _FirstNotNone([_FilterInt(x) for x in v]) - return None if isinstance(v, (compat.integral_types, - tensor_shape.Dimension)) else _NotNone(v) + return None if isinstance( + v, (compat.integral_types, tensor_shape.Dimension)) else _NotNone(v) def _FilterFloat(v): @@ -380,8 +395,11 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False): if 
dtype: dtype = dtypes.as_dtype(dtype) - is_quantized = (dtype in [dtypes.qint8, dtypes.quint8, dtypes.qint16, - dtypes.quint16, dtypes.qint32]) + is_quantized = ( + dtype in [ + dtypes.qint8, dtypes.quint8, dtypes.qint16, dtypes.quint16, + dtypes.qint32 + ]) # We first convert value to a numpy array or scalar. if isinstance(values, (np.ndarray, np.generic)): @@ -419,9 +437,9 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False): if (list(nparray.shape) != _GetDenseDimensions(values) and not is_quantized): raise ValueError("""Argument must be a dense tensor: %s""" - """ - got shape %s, but wanted %s.""" % ( - values, list(nparray.shape), - _GetDenseDimensions(values))) + """ - got shape %s, but wanted %s.""" % + (values, list(nparray.shape), + _GetDenseDimensions(values))) # python/numpy default float type is float64. We prefer float32 instead. if (nparray.dtype == np.float64) and dtype is None: @@ -446,8 +464,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False): if dtype is not None and (not hasattr(dtype, "base_dtype") or dtype.base_dtype != numpy_dtype.base_dtype): - raise TypeError("Incompatible types: %s vs. %s. Value is %s" - % (dtype, nparray.dtype, values)) + raise TypeError("Incompatible types: %s vs. %s. Value is %s" % + (dtype, nparray.dtype, values)) # If shape is not given, get the shape from the numpy array. if shape is None: @@ -510,8 +528,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False): append_fn = GetNumpyAppendFn(proto_values.dtype) if append_fn is None: - raise TypeError("Element type not supported in TensorProto: %s" % - numpy_dtype.name) + raise TypeError( + "Element type not supported in TensorProto: %s" % numpy_dtype.name) append_fn(tensor_proto, proto_values) return tensor_proto @@ -553,19 +571,23 @@ def MakeNdarray(tensor): return tmp.reshape(shape) elif tensor_dtype == dtypes.float32: if len(tensor.float_val) == 1: - return np.repeat(np.array(tensor.float_val[0], dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array(tensor.float_val[0], dtype=dtype), + num_elements).reshape(shape) else: return np.fromiter(tensor.float_val, dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.float64: if len(tensor.double_val) == 1: - return np.repeat(np.array(tensor.double_val[0], dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array(tensor.double_val[0], dtype=dtype), + num_elements).reshape(shape) else: return np.fromiter(tensor.double_val, dtype=dtype).reshape(shape) - elif tensor_dtype in [dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, - dtypes.int8, dtypes.qint32, dtypes.quint8, dtypes.qint8, - dtypes.qint16, dtypes.quint16, dtypes.bfloat16]: + elif tensor_dtype in [ + dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, dtypes.int8, + dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16, + dtypes.bfloat16 + ]: if len(tensor.int_val) == 1: return np.repeat(np.array(tensor.int_val[0], dtype=dtype), num_elements).reshape(shape) @@ -573,35 +595,41 @@ def MakeNdarray(tensor): return np.fromiter(tensor.int_val, dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.int64: if len(tensor.int64_val) == 1: - return np.repeat(np.array(tensor.int64_val[0], dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array(tensor.int64_val[0], dtype=dtype), + num_elements).reshape(shape) else: return np.fromiter(tensor.int64_val, dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.string: if len(tensor.string_val) 
== 1: - return np.repeat(np.array(tensor.string_val[0], dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array(tensor.string_val[0], dtype=dtype), + num_elements).reshape(shape) else: - return np.array([x for x in tensor.string_val], - dtype=dtype).reshape(shape) + return np.array( + [x for x in tensor.string_val], dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.complex64: it = iter(tensor.scomplex_val) if len(tensor.scomplex_val) == 2: - return np.repeat(np.array(complex(tensor.scomplex_val[0], - tensor.scomplex_val[1]), dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array( + complex(tensor.scomplex_val[0], tensor.scomplex_val[1]), + dtype=dtype), num_elements).reshape(shape) else: - return np.array([complex(x[0], x[1]) for x in zip(it, it)], - dtype=dtype).reshape(shape) + return np.array( + [complex(x[0], x[1]) for x in zip(it, it)], + dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.complex128: it = iter(tensor.dcomplex_val) if len(tensor.dcomplex_val) == 2: - return np.repeat(np.array(complex(tensor.dcomplex_val[0], - tensor.dcomplex_val[1]), dtype=dtype), - num_elements).reshape(shape) + return np.repeat( + np.array( + complex(tensor.dcomplex_val[0], tensor.dcomplex_val[1]), + dtype=dtype), num_elements).reshape(shape) else: - return np.array([complex(x[0], x[1]) for x in zip(it, it)], - dtype=dtype).reshape(shape) + return np.array( + [complex(x[0], x[1]) for x in zip(it, it)], + dtype=dtype).reshape(shape) elif tensor_dtype == dtypes.bool: if len(tensor.bool_val) == 1: return np.repeat(np.array(tensor.bool_val[0], dtype=dtype), @@ -645,8 +673,9 @@ def _ConstantValue(tensor, partial): elif tensor.op.type == "Shape": input_shape = tensor.op.inputs[0].get_shape() if input_shape.is_fully_defined(): - return np.array([dim.value for dim in input_shape.dims], - dtype=tensor.dtype.as_numpy_dtype) + return np.array( + [dim.value for dim in input_shape.dims], + dtype=tensor.dtype.as_numpy_dtype) else: return None elif tensor.op.type == "Size": @@ -658,8 +687,10 @@ def _ConstantValue(tensor, partial): elif tensor.op.type == "Rank": input_shape = tensor.op.inputs[0].get_shape() if input_shape.ndims is not None: - return np.ndarray(shape=(), buffer=np.array([input_shape.ndims], dtype=np.int32), - dtype=np.int32) + return np.ndarray( + shape=(), + buffer=np.array([input_shape.ndims], dtype=np.int32), + dtype=np.int32) else: return None elif tensor.op.type == "Range": @@ -861,8 +892,8 @@ def constant_value_as_shape(tensor): # pylint: disable=invalid-name new_axis_mask = tensor.op.get_attr("new_axis_mask") shrink_axis_mask = tensor.op.get_attr("shrink_axis_mask") valid_attributes = (not ellipsis_mask and not new_axis_mask and - not shrink_axis_mask and - (not begin_mask or (begin_mask == 1)) and + not shrink_axis_mask and (not begin_mask or + (begin_mask == 1)) and (not end_mask or (end_mask == 1))) if valid_attributes: # additional inputs not supported prev = constant_value_as_shape(tensor.op.inputs[0]) @@ -878,8 +909,8 @@ def constant_value_as_shape(tensor): # pylint: disable=invalid-name ret = tensor_shape.unknown_shape(shape[0].value) value = constant_value(tensor) if value is not None: - ret = ret.merge_with(tensor_shape.TensorShape( - [d if d >= 0 else None for d in value])) + ret = ret.merge_with( + tensor_shape.TensorShape([d if d >= 0 else None for d in value])) return ret diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 6a7e1d0c89c..15e8f5a38d6 100644 --- 
a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -123,11 +123,11 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False): TypeError: If either argument is not a `GraphDef`. """ if not isinstance(actual, graph_pb2.GraphDef): - raise TypeError("Expected tf.GraphDef for actual, got %s" % - type(actual).__name__) + raise TypeError( + "Expected tf.GraphDef for actual, got %s" % type(actual).__name__) if not isinstance(expected, graph_pb2.GraphDef): - raise TypeError("Expected tf.GraphDef for expected, got %s" % - type(expected).__name__) + raise TypeError( + "Expected tf.GraphDef for expected, got %s" % type(expected).__name__) if checkpoint_v2: _strip_checkpoint_v2_randomized(actual) @@ -152,11 +152,10 @@ def assert_meta_graph_protos_equal(tester, a, b): a_proto = proto_type() b_proto = proto_type() # Number of entries in the collections is the same - tester.assertEqual(len(a_value.bytes_list.value), - len(b_value.bytes_list.value)) - for (a_value_item, b_value_item) in zip( - a_value.bytes_list.value, - b_value.bytes_list.value): + tester.assertEqual( + len(a_value.bytes_list.value), len(b_value.bytes_list.value)) + for (a_value_item, b_value_item) in zip(a_value.bytes_list.value, + b_value.bytes_list.value): a_proto.ParseFromString(a_value_item) b_proto.ParseFromString(b_value_item) tester.assertProtoEquals(a_proto, b_proto) @@ -220,10 +219,7 @@ def NHWCToNCHW(input_tensor): converted tensor or shape array """ # tensor dim -> new axis order - new_axes = { - 4: [0, 3, 1, 2], - 5: [0, 4, 1, 2, 3] - } + new_axes = {4: [0, 3, 1, 2], 5: [0, 4, 1, 2, 3]} if isinstance(input_tensor, ops.Tensor): ndims = input_tensor.shape.ndims return array_ops.transpose(input_tensor, new_axes[ndims]) @@ -250,8 +246,9 @@ def NHWCToNCHW_VECT_C(input_shape_or_tensor): """ permutations = {5: [0, 3, 1, 2, 4], 6: [0, 4, 1, 2, 3, 5]} is_tensor = isinstance(input_shape_or_tensor, ops.Tensor) - temp_shape = (input_shape_or_tensor.shape.as_list() - if is_tensor else input_shape_or_tensor) + temp_shape = ( + input_shape_or_tensor.shape.as_list() + if is_tensor else input_shape_or_tensor) if temp_shape[-1] % 4 != 0: raise ValueError( "Last dimension of input must be evenly divisible by 4 to convert to " @@ -283,8 +280,9 @@ def NCHW_VECT_CToNHWC(input_shape_or_tensor): """ permutations = {5: [0, 2, 3, 1, 4], 6: [0, 2, 3, 4, 1, 5]} is_tensor = isinstance(input_shape_or_tensor, ops.Tensor) - input_shape = (input_shape_or_tensor.shape.as_list() - if is_tensor else input_shape_or_tensor) + input_shape = ( + input_shape_or_tensor.shape.as_list() + if is_tensor else input_shape_or_tensor) if input_shape[-1] != 4: raise ValueError("Last dimension of NCHW_VECT_C must be 4.") permutation = permutations[len(input_shape)] @@ -307,10 +305,7 @@ def NCHWToNHWC(input_tensor): converted tensor or shape array """ # tensor dim -> new axis order - new_axes = { - 4: [0, 2, 3, 1], - 5: [0, 2, 3, 4, 1] - } + new_axes = {4: [0, 2, 3, 1], 5: [0, 2, 3, 4, 1]} if isinstance(input_tensor, ops.Tensor): ndims = input_tensor.shape.ndims return array_ops.transpose(input_tensor, new_axes[ndims]) @@ -325,10 +320,17 @@ def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs): prev_value = ops._USE_C_API ops._USE_C_API = use_c_api try: - with ops.Graph().as_default(): - fn(*args, **kwargs) + # Reset the default graph so it has the C API enabled. 
We call + # reset_default_graph() instead of creating a new default Graph context to + # make this robust to tests that call reset_default_graph(), which requires + # that the current default graph isn't nested. + ops.reset_default_graph() + fn(*args, **kwargs) finally: ops._USE_C_API = prev_value + # Make sure default graph reflects prev_value in case next test doesn't call + # reset_default_graph(). + ops.reset_default_graph() # pylint: disable=protected-access @@ -345,7 +347,9 @@ def skip_if(condition): Returns: The wrapped function """ + def real_skip_if(fn): + def wrapper(*args, **kwargs): if callable(condition): skip = condition() @@ -353,7 +357,9 @@ def skip_if(condition): skip = condition if not skip: fn(*args, **kwargs) + return wrapper + return real_skip_if @@ -370,8 +376,10 @@ def disable_c_api(fn): Returns: The wrapped function """ + def wrapper(*args, **kwargs): _use_c_api_wrapper(fn, False, *args, **kwargs) + return wrapper @@ -388,8 +396,10 @@ def enable_c_api(fn): Returns: The wrapped function """ + def wrapper(*args, **kwargs): _use_c_api_wrapper(fn, True, *args, **kwargs) + return wrapper @@ -415,66 +425,6 @@ def with_c_api(cls): return cls -class IsolateTest(object): - """A context manager which isolates resources in its block. - - Provides an Eager-agnostic abstraction for preventing the sharing of - variables and other resources. - - In graph mode, resource handle ops are only executed in a particular Session, - isolating them from resources with the same name in other Graphs. In Eager, - separate Sessions do not exist, so resources (particularly ResourceVariables) - would be shared implicitly if a resource of the same name were created - anywhere in a Python process. Multiple handles to the same resource would - cause several issues, and so this type of sharing will raise an exception. - - Using resources with the same name in a single Python process may be useful - (especially for unit tests), so this context manager provides an abstraction - for isolating resources. Using a resource created in one Isolation environment - in another is an error. - - Example usage in Eager mode: - - ```python - import tensorflow as tf - # Import subject to change - from tensorflow.contrib.eager.python import tfe - - tfe.enable_eager_execution() - - for hyperparameter in [1, 2, 3]: - with tfe.IsolateTest(): - v = tfe.Variable(name="v", initial_value=hyperparameter) - # train model, test results ... - ``` - - IsolateTest is currently exposed through contrib.eager, but it creates a new - default Graph and provides equivalent safety in graph mode. - """ - - def __init__(self): - if context.in_eager_mode() and tape.could_possibly_record(): - raise ValueError("Cannot isolate Eager execution with an active tape.") - # In Eager, Graphs set a container which isolates resources, and maintain a - # VariableStore which caches ResourceVariable objects created through - # get_variable. So setting the default Graph has the side effect of - # isolating Eager resources. - with context.eager_mode(): - # Create the graph in Eager mode, as this provides stricter semantics - # (i.e. has a unique container prefix). This prevents implicit sharing - # when a Graph-mode graph is created and then Eager mode is enabled (an - # error through enable_eager_execution, but common with context managers - # in unit tests). 
- self._graph_as_default_context_manager = ops.Graph().as_default() - - def __enter__(self): - self._graph_as_default_context_manager.__enter__() - - def __exit__(self, type_arg, value_arg, traceback_arg): - return self._graph_as_default_context_manager.__exit__( - type_arg, value_arg, traceback_arg) - - def assert_no_new_tensors(f): """Decorator for asserting that no new Tensors persist after a test. @@ -505,12 +455,11 @@ def assert_no_new_tensors(f): return False tensors_before = set(id(obj) for obj in gc.get_objects() if _is_tensor(obj)) - outside_container_prefix = ops.get_default_graph()._container_prefix - with IsolateTest(): + outside_graph_key = ops.get_default_graph()._graph_key + with ops.Graph().as_default(): # Run the test in a new graph so that collections get cleared when it's - # done, but inherit the container prefix so that we can print the values - # of variables which get leaked when executing eagerly. - ops.get_default_graph()._container_prefix = outside_container_prefix + # done, but inherit the graph key so optimizers behave. + ops.get_default_graph()._graph_key = outside_graph_key f(self, **kwargs) # Make an effort to clear caches, which would otherwise look like leaked # Tensors. @@ -561,13 +510,17 @@ def assert_no_garbage_created(f): # not hold on to every object in other tests. gc.set_debug(previous_debug_flags) gc.enable() + return decorator -def run_in_graph_and_eager_modes( - __unused__=None, graph=None, config=None, - use_gpu=False, force_gpu=False, - reset_test=True, assert_no_eager_garbage=False): +def run_in_graph_and_eager_modes(__unused__=None, + graph=None, + config=None, + use_gpu=False, + force_gpu=False, + reset_test=True, + assert_no_eager_garbage=False): """Runs the test in both graph and eager modes. Args: @@ -596,6 +549,7 @@ def run_in_graph_and_eager_modes( def decorator(f): """Test method decorator.""" + def decorated(self, **kwargs): """Decorated the test method.""" with context.graph_mode(): @@ -627,10 +581,11 @@ def run_in_graph_and_eager_modes( assert_no_garbage_created(run_eager_mode)) with context.eager_mode(): - with IsolateTest(): + with ops.Graph().as_default(): run_eager_mode(self, **kwargs) return decorated + return decorator @@ -767,8 +722,10 @@ class TensorFlowTestCase(googletest.TestCase): self._AssertProtoEquals(expected_message, message) elif isinstance(expected_message_maybe_ascii, str): expected_message = type(message)() - text_format.Merge(expected_message_maybe_ascii, expected_message, - descriptor_pool=descriptor_pool.Default()) + text_format.Merge( + expected_message_maybe_ascii, + expected_message, + descriptor_pool=descriptor_pool.Default()) self._AssertProtoEquals(expected_message, message) else: assert False, ("Can't compare protos of type %s and %s" % @@ -852,7 +809,8 @@ class TensorFlowTestCase(googletest.TestCase): trigger the creation of a new session. Use the `use_gpu` and `force_gpu` options to control where ops are run. If - `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if `use_gpu` + `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if + `use_gpu` is True, TensorFlow tries to run as many ops on the GPU as possible. If both `force_gpu and `use_gpu` are False, all ops are pinned to the CPU. 
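# The hunks above rework `run_in_graph_and_eager_modes` and remove the
# `IsolateTest` context manager in favor of a plain `ops.Graph().as_default()`
# for the eager pass. A minimal usage sketch of the decorator follows; it is
# an editorial illustration and not part of this patch. The decorator,
# `TensorFlowTestCase`, `self.evaluate`, and `constant_op` all appear in this
# diff; the test class name and tensor values below are made up.
#
#   from tensorflow.python.framework import constant_op
#   from tensorflow.python.framework import test_util
#
#   class AddTest(test_util.TensorFlowTestCase):
#
#     @test_util.run_in_graph_and_eager_modes()
#     def test_add(self):
#       # The decorated body runs once under graph mode and once under eager
#       # mode; self.evaluate() returns numpy values in both cases.
#       x = constant_op.constant([1., 2.])
#       y = constant_op.constant([3., 4.])
#       self.assertAllClose([4., 6.], self.evaluate(x + y))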
@@ -1051,6 +1009,7 @@ class TensorFlowTestCase(googletest.TestCase): self._threads.append(ret) return ret + # pylint: enable=invalid-name def assertNear(self, f1, f2, err, msg=None): @@ -1118,7 +1077,8 @@ class TensorFlowTestCase(googletest.TestCase): # the absolute difference between a and b. Here, we want to # print out which elements violate such conditions. cond = np.logical_or( - np.abs(a - b) > atol + rtol * np.abs(b), np.isnan(a) != np.isnan(b)) + np.abs(a - b) > atol + rtol * np.abs(b), + np.isnan(a) != np.isnan(b)) if a.ndim: x = a[np.where(cond)] y = b[np.where(cond)] @@ -1380,8 +1340,11 @@ class TensorFlowTestCase(googletest.TestCase): @tf_export("test.create_local_cluster") -def create_local_cluster(num_workers, num_ps, protocol="grpc", - worker_config=None, ps_config=None): +def create_local_cluster(num_workers, + num_ps, + protocol="grpc", + worker_config=None, + ps_config=None): """Create and start local servers and return the associated `Server` objects. Example: @@ -1431,15 +1394,21 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc", workers = [ server_lib.Server( - cs, job_name="worker", protocol=protocol, task_index=ix, - config=worker_config, start=True) - for ix in range(num_workers) + cs, + job_name="worker", + protocol=protocol, + task_index=ix, + config=worker_config, + start=True) for ix in range(num_workers) ] ps_servers = [ server_lib.Server( - cs, job_name="ps", protocol=protocol, task_index=ix, - config=ps_config, start=True) - for ix in range(num_ps) + cs, + job_name="ps", + protocol=protocol, + task_index=ix, + config=ps_config, + start=True) for ix in range(num_ps) ] return workers, ps_servers diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py index 3594d125bf6..a717eb39513 100644 --- a/tensorflow/python/framework/test_util_test.py +++ b/tensorflow/python/framework/test_util_test.py @@ -29,7 +29,6 @@ from google.protobuf import text_format from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import meta_graph_pb2 -from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors @@ -39,7 +38,6 @@ from tensorflow.python.framework import test_util from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops -from tensorflow.python.ops import variables from tensorflow.python.platform import googletest @@ -443,71 +441,5 @@ class GarbageCollectionTest(test_util.TensorFlowTestCase): LeakedTensorTest().test_has_no_leak() -@test_util.with_c_api -class IsolationTest(test_util.TensorFlowTestCase): - - @test_util.run_in_graph_and_eager_modes() - def test_variable_reuse_exception(self): - with test_util.IsolateTest(), session.Session(): - first_container_variable = resource_variable_ops.ResourceVariable( - name="first_container_variable", - initial_value=1) - if context.in_graph_mode(): - self.evaluate([variables.global_variables_initializer()]) - with test_util.IsolateTest(): - if context.in_graph_mode(): - with self.assertRaises(RuntimeError): - self.evaluate(first_container_variable.read_value()) - else: - with self.assertRaises(ValueError): - first_container_variable.read_value() - - @test_util.run_in_graph_and_eager_modes() - def test_variable_reuse_exception_nested(self): - with test_util.IsolateTest(), session.Session(): - first_container_variable = 
resource_variable_ops.ResourceVariable( - name="first_container_variable", - initial_value=1) - if context.in_graph_mode(): - self.evaluate([variables.global_variables_initializer()]) - with test_util.IsolateTest(), session.Session(): - if context.in_graph_mode(): - with self.assertRaises(RuntimeError): - self.evaluate(first_container_variable.read_value()) - else: - with self.assertRaises(ValueError): - first_container_variable.read_value() - - @test_util.run_in_graph_and_eager_modes() - def test_no_sharing(self): - with test_util.IsolateTest(), session.Session(): - first_container_variable = resource_variable_ops.ResourceVariable( - name="same_name", - initial_value=1) - if context.in_graph_mode(): - self.evaluate([variables.global_variables_initializer()]) - with test_util.IsolateTest(), session.Session(): - second_container_variable = resource_variable_ops.ResourceVariable( - name="same_name", - initial_value=2) - if context.in_graph_mode(): - self.evaluate([variables.global_variables_initializer()]) - self.assertEqual( - 2, self.evaluate(second_container_variable.read_value())) - self.assertEqual(1, self.evaluate(first_container_variable.read_value())) - - def test_graph_mode_isolation(self): - with context.graph_mode(): - # Even if we've (accidentally) called IsolateTest in Graph mode, it should - # provide Eager isolation. - with test_util.IsolateTest(): - with context.eager_mode(): - first_container_variable = resource_variable_ops.ResourceVariable( - name="first_container_variable", - initial_value=1) - with context.eager_mode(): - with self.assertRaises(ValueError): - first_container_variable.read_value() - if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py index 578f86ca5a0..5bc9e4b8030 100644 --- a/tensorflow/python/grappler/layout_optimizer_test.py +++ b/tensorflow/python/grappler/layout_optimizer_test.py @@ -157,6 +157,7 @@ def _get_config(layout_optimizer=True): graph_options = config_pb2.GraphOptions( rewrite_options=rewrite_options, build_cost_model=1) config = config_pb2.ConfigProto(graph_options=graph_options) + config.graph_options.optimizer_options.opt_level = -1 return config @@ -179,6 +180,8 @@ def _get_cluster(): named_device = device_properties_pb2.NamedDevice() named_device.name = '/GPU:0' named_device.properties.type = 'GPU' + named_device.properties.num_cores = 24 + named_device.properties.frequency = 1000 named_device.properties.environment['architecture'] = '4' cluster = gcluster.Cluster(devices=[named_device]) return cluster @@ -1169,7 +1172,7 @@ class LayoutOptimizerTest(test.TestCase): num_transposes += 1 nodes.append(node.name) - expected_num_transposes = 2 + expected_num_transposes = 3 self.assertEqual(expected_num_transposes, num_transposes) self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes) self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index 61257557751..fdac22bb53c 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -39,6 +39,7 @@ py_library( "_impl/keras/engine/__init__.py", "_impl/keras/engine/topology.py", "_impl/keras/engine/training.py", + "_impl/keras/engine/training_eager.py", "_impl/keras/estimator.py", "_impl/keras/initializers.py", "_impl/keras/layers/__init__.py", @@ -481,6 +482,7 @@ py_test( size = "small", srcs = ["_impl/keras/layers/normalization_test.py"], srcs_version = "PY2AND3", + tags = 
["notsan"], deps = [ ":keras", "//tensorflow/python:client_testlib", @@ -719,6 +721,19 @@ py_test( ], ) +py_test( + name = "training_eager_test", + size = "medium", + srcs = ["_impl/keras/engine/training_eager_test.py"], + srcs_version = "PY2AND3", + tags = ["notsan"], + deps = [ + ":keras", + "//tensorflow/python:client_testlib", + "//third_party/py/numpy", + ], +) + py_test( name = "topology_test", size = "small", diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py index 460c0dc5f39..098ea063f95 100644 --- a/tensorflow/python/keras/_impl/keras/backend.py +++ b/tensorflow/python/keras/_impl/keras/backend.py @@ -29,6 +29,7 @@ import numpy as np from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session as session_module +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes as dtypes_module from tensorflow.python.framework import ops @@ -326,7 +327,15 @@ def learning_phase(): Returns: Learning phase (scalar integer tensor or Python integer). + + Raises: + ValueError: If called when Eager execution is enabled. """ + if context.in_eager_mode(): + if 'eager' not in _GRAPH_LEARNING_PHASES: + raise ValueError('No learning phase set in Eager mode.') + return _GRAPH_LEARNING_PHASES['eager'] + graph = ops.get_default_graph() if graph not in _GRAPH_LEARNING_PHASES: phase = array_ops.placeholder_with_default( @@ -347,7 +356,10 @@ def set_learning_phase(value): global _GRAPH_LEARNING_PHASES # pylint: disable=global-variable-not-assigned if value not in {0, 1}: raise ValueError('Expected learning phase to be ' '0 or 1.') - _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value + if context.in_eager_mode(): + _GRAPH_LEARNING_PHASES['eager'] = value + else: + _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value def get_session(): diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py index 64aa868f382..8354a2b8fd7 100644 --- a/tensorflow/python/keras/_impl/keras/engine/topology.py +++ b/tensorflow/python/keras/_impl/keras/engine/topology.py @@ -708,8 +708,10 @@ class Network(tf_network.GraphNetwork, Layer): self.input_names.append(layer.name) if layer.is_placeholder: self._feed_input_names.append(layer.name) - self._feed_inputs.append(layer.input) self._feed_input_shapes.append(K.int_shape(self.inputs[i])) + # layer.input gives an error in eager mode + if context.in_graph_mode(): + self._feed_inputs.append(layer.input) for layer in self._output_layers: self.output_names.append(layer.name) diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py index 699ae2edf0d..43d95b1f194 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training.py +++ b/tensorflow/python/keras/_impl/keras/engine/training.py @@ -22,17 +22,21 @@ import copy import numpy as np +from tensorflow.python.eager import context +from tensorflow.python.framework import ops from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import callbacks as cbks from tensorflow.python.keras._impl.keras import losses from tensorflow.python.keras._impl.keras import metrics as metrics_module from tensorflow.python.keras._impl.keras import optimizers +from tensorflow.python.keras._impl.keras.engine import training_eager from tensorflow.python.keras._impl.keras.engine.topology import 
Network from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import optimizer as tf_optimizer_module try: from scipy.sparse import issparse # pylint: disable=g-import-not-at-top @@ -82,21 +86,24 @@ def _standardize_input_data(data, if data[x].__class__.__name__ == 'DataFrame' else data[x] for x in names ] - data = [np.expand_dims(x, 1) if x.ndim == 1 else x for x in data] except KeyError as e: raise ValueError('No data provided for "' + e.args[0] + '". Need data ' 'for each key in: ' + str(names)) elif isinstance(data, list): - data = [ - x.values if x.__class__.__name__ == 'DataFrame' else x for x in data - ] - data = [ - np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x - for x in data - ] + if isinstance(data[0], list): + data = [np.asarray(d) for d in data] + elif len(names) == 1 and isinstance(data[0], (float, int)): + data = [np.asarray(data)] + else: + data = [ + x.values if x.__class__.__name__ == 'DataFrame' else x for x in data + ] else: data = data.values if data.__class__.__name__ == 'DataFrame' else data - data = [np.expand_dims(data, 1)] if data.ndim == 1 else [data] + data = [data] + data = [ + np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data + ] if len(data) != len(names): if data and hasattr(data[0], 'shape'): @@ -618,9 +625,15 @@ class Model(Network): `optimizer`, `loss`, `metrics` or `sample_weight_mode`. """ loss = loss or {} + if context.in_eager_mode() and not isinstance( + optimizer, tf_optimizer_module.Optimizer): + raise ValueError('Only TF native optimizers are supported in Eager mode.') + self.optimizer = optimizers.get(optimizer) self.loss = loss self.loss_weights = loss_weights + if context.in_eager_mode() and sample_weight_mode is not None: + raise ValueError('sample_weight_mode is not supported in Eager mode.') self.sample_weight_mode = sample_weight_mode # Prepare loss functions. @@ -651,6 +664,7 @@ class Model(Network): loss_function = losses.get(loss) loss_functions = [loss_function for _ in range(len(self.outputs))] self.loss_functions = loss_functions + weighted_losses = [_weighted_masked_objective(fn) for fn in loss_functions] skip_target_indices = [] skip_target_weighing_indices = [] @@ -664,11 +678,12 @@ class Model(Network): skip_target_weighing_indices.append(i) # Prepare output masks. - masks = self.compute_mask(self.inputs, mask=None) - if masks is None: - masks = [None for _ in self.outputs] - if not isinstance(masks, list): - masks = [masks] + if context.in_graph_mode(): + masks = self.compute_mask(self.inputs, mask=None) + if masks is None: + masks = [None for _ in self.outputs] + if not isinstance(masks, list): + masks = [masks] # Prepare loss weights. 
if loss_weights is None: @@ -694,6 +709,32 @@ class Model(Network): else: raise TypeError('Could not interpret loss_weights argument: ' + str(loss_weights) + ' - expected a list of dicts.') + self.loss_weights_list = loss_weights_list + + # initialization for Eager mode execution + if context.in_eager_mode(): + if target_tensors is not None: + raise ValueError('target_tensors are not currently supported in Eager' + 'mode.') + self.total_loss = None + self.metrics = metrics + self.weighted_metrics = weighted_metrics + self.metrics_tensors = [] + self.metrics_names = ['loss'] + for i in range(len(self.outputs)): + if len(self.outputs) > 1: + self.metrics_names.append(self.output_names[i] + '_loss') + self.nested_metrics = _collect_metrics(metrics, self.output_names) + self._feed_sample_weight_modes = [] + for i in range(len(self.outputs)): + self._feed_sample_weight_modes.append(None) + self.sample_weights = [] + self.targets = [] + self._collected_trainable_weights = self.trainable_weights + for i in range(len(self.outputs)): + self._feed_output_names.append(self.output_names[i]) + + return # Prepare targets of model. self.targets = [] @@ -720,6 +761,7 @@ class Model(Network): else: raise TypeError('Expected `target_tensors` to be ' 'a list or dict, but got:', target_tensors) + for i in range(len(self.outputs)): if i in skip_target_indices: self.targets.append(None) @@ -769,7 +811,7 @@ class Model(Network): weight = K.placeholder(ndim=2, name=name + '_sample_weights') sample_weight_modes.append('temporal') else: - weight = K.placeholder(ndim=1, name=name + '_sample_weights') + weight = K.placeholder(ndim=1, name=name + 'sample_weights') sample_weight_modes.append(None) sample_weights.append(weight) elif isinstance(sample_weight_mode, list): @@ -929,7 +971,7 @@ class Model(Network): self._feed_sample_weights = [] for i in range(len(self.sample_weights)): if i not in skip_target_weighing_indices: - self._feed_sample_weights.append(sample_weights[i]) + self._feed_sample_weights.append(self.sample_weights[i]) # Functions for train, test and predict will # be compiled lazily when required. @@ -978,6 +1020,7 @@ class Model(Network): with K.name_scope(self.optimizer.__class__.__name__): training_updates = self.optimizer.get_updates( params=self._collected_trainable_weights, loss=self.total_loss) + updates = self.updates + training_updates # Gets loss and metrics. Updates weights at each call. self.train_function = K.function( @@ -1156,6 +1199,7 @@ class Model(Network): callback_model = self callbacks.set_model(callback_model) + callbacks.set_params({ 'batch_size': batch_size, 'epochs': epochs, @@ -1216,6 +1260,7 @@ class Model(Network): np.random.shuffle(index_array) batches = _make_batches(num_train_samples, batch_size) + for batch_index, (batch_start, batch_end) in enumerate(batches): batch_ids = index_array[batch_start:batch_end] try: @@ -1410,6 +1455,7 @@ class Model(Network): ins_batch[i] = ins_batch[i].toarray() batch_outs = f(ins_batch) + if isinstance(batch_outs, list): if batch_index == 0: for batch_out in enumerate(batch_outs): @@ -1420,7 +1466,6 @@ class Model(Network): if batch_index == 0: outs.append(0.) outs[0] += batch_outs * len(batch_ids) - if verbose == 1: progbar.update(batch_end) for i in range(len(outs)): @@ -1636,6 +1681,7 @@ class Model(Network): batch_size=batch_size) # Prepare validation data. 
do_validation = False + val_ins = [] if validation_data: do_validation = True if len(validation_data) == 2: @@ -1686,39 +1732,65 @@ class Model(Network): ins = x + y + sample_weights + [1.] else: ins = x + y + sample_weights - self._make_train_function() - f = self.train_function # Prepare display labels. out_labels = self._get_deduped_metrics_names() - if do_validation: - self._make_test_function() - val_f = self.test_function - callback_metrics = copy.copy(out_labels) + [ - 'val_' + n for n in out_labels - ] - else: - callback_metrics = copy.copy(out_labels) - val_f = None - val_ins = [] + if context.in_eager_mode(): + if do_validation: + callback_metrics = copy.copy(out_labels) + [ + 'val_' + n for n in out_labels + ] + else: + callback_metrics = copy.copy(out_labels) - # Delegate logic to `_fit_loop`. - return self._fit_loop( - f, - ins, - out_labels=out_labels, - batch_size=batch_size, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - val_f=val_f, - val_ins=val_ins, - shuffle=shuffle, - callback_metrics=callback_metrics, - initial_epoch=initial_epoch, - steps_per_epoch=steps_per_epoch, - validation_steps=validation_steps) + return training_eager.fit_loop( + self, + ins, + out_labels=out_labels, + batch_size=batch_size, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + val_ins=val_ins, + shuffle=shuffle, + callback_metrics=callback_metrics, + initial_epoch=initial_epoch, + steps_per_epoch=steps_per_epoch, + validation_steps=validation_steps) + else: + self._make_train_function() + f = self.train_function + + if do_validation: + if context.in_graph_mode(): + self._make_test_function() + val_f = self.test_function + else: + val_f = None + callback_metrics = copy.copy(out_labels) + [ + 'val_' + n for n in out_labels + ] + else: + val_f = None + callback_metrics = copy.copy(out_labels) + + # Delegate logic to `_fit_loop`. + return self._fit_loop( + f, + ins, + out_labels=out_labels, + batch_size=batch_size, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + val_f=val_f, + val_ins=val_ins, + shuffle=shuffle, + callback_metrics=callback_metrics, + initial_epoch=initial_epoch, + steps_per_epoch=steps_per_epoch, + validation_steps=validation_steps) def evaluate(self, x=None, @@ -1794,10 +1866,15 @@ class Model(Network): ins = x + y + sample_weights + [0.] else: ins = x + y + sample_weights - self._make_test_function() - f = self.test_function - return self._test_loop( - f, ins, batch_size=batch_size, verbose=verbose, steps=steps) + + if context.in_eager_mode(): + return training_eager.test_loop( + self, ins, batch_size=batch_size, verbose=verbose, steps=steps) + else: + self._make_test_function() + f = self.test_function + return self._test_loop( + f, ins, batch_size=batch_size, verbose=verbose, steps=steps) def predict(self, x, batch_size=None, verbose=0, steps=None): """Generates output predictions for the input samples. @@ -1849,10 +1926,16 @@ class Model(Network): ins = x + [0.] 
else: ins = x - self._make_predict_function() - f = self.predict_function - return self._predict_loop( - f, ins, batch_size=batch_size, verbose=verbose, steps=steps) + + if context.in_eager_mode(): + return training_eager.predict_loop( + self, ins, batch_size=batch_size, verbose=verbose, steps=steps) + else: + self._make_predict_function() + f = self.predict_function + + return self._predict_loop( + f, ins, batch_size=batch_size, verbose=verbose, steps=steps) def train_on_batch(self, x, y, sample_weight=None, class_weight=None): """Runs a single gradient update on a single batch of data. @@ -1888,6 +1971,7 @@ class Model(Network): or list of scalars (if the model has multiple outputs and/or metrics). The attribute `model.metrics_names` will give you the display labels for the scalar outputs. + """ x, y, sample_weights = self._standardize_user_data( x, @@ -1899,11 +1983,16 @@ class Model(Network): ins = x + y + sample_weights + [1.] else: ins = x + y + sample_weights - self._make_train_function() - outputs = self.train_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + + if context.in_eager_mode(): + return training_eager.train_on_batch(self, ins) + + if context.in_graph_mode(): + self._make_train_function() + outputs = self.train_function(ins) + if len(outputs) == 1: + return outputs[0] + return outputs def test_on_batch(self, x, y, sample_weight=None): """Test the model on a single batch of samples. @@ -1942,11 +2031,16 @@ class Model(Network): ins = x + y + sample_weights + [0.] else: ins = x + y + sample_weights - self._make_test_function() - outputs = self.test_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + + if context.in_eager_mode(): + return training_eager.test_on_batch(self, ins) + + if context.in_graph_mode(): + self._make_test_function() + outputs = self.test_function(ins) + if len(outputs) == 1: + return outputs[0] + return outputs def predict_on_batch(self, x): """Returns predictions for a single batch of samples. @@ -1956,6 +2050,7 @@ class Model(Network): Returns: Numpy array(s) of predictions. + """ x = _standardize_input_data(x, self._feed_input_names, self._feed_input_shapes) @@ -1963,11 +2058,25 @@ class Model(Network): ins = x + [0.] else: ins = x - self._make_predict_function() - outputs = self.predict_function(ins) - if len(outputs) == 1: - return outputs[0] - return outputs + + if context.in_eager_mode(): + ins_batch_converted = [] + for ib in ins: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + + eager_model_inputs = [] + for i in range(len(self.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + + outs = self(eager_model_inputs) # pylint: disable=not-callable + return outs + + if context.in_graph_mode(): + self._make_predict_function() + outputs = self.predict_function(ins) + if len(outputs) == 1: + return outputs[0] + return outputs def fit_generator(self, generator, @@ -2072,7 +2181,6 @@ class Model(Network): model.fit_generator(generate_arrays_from_file('/my_file.txt'), steps_per_epoch=10000, epochs=10) ``` - Raises: ValueError: In case the generator yields data in an invalid format. diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py new file mode 100644 index 00000000000..0a115969ca6 --- /dev/null +++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py @@ -0,0 +1,666 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras training and evaluation routines. +""" +# pylint: disable=protected-access +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +from tensorflow.python.eager.backprop import GradientTape +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util +from tensorflow.python.keras._impl.keras import backend as K +from tensorflow.python.keras._impl.keras import callbacks as cbks +from tensorflow.python.keras._impl.keras import losses +from tensorflow.python.keras._impl.keras import metrics as metrics_module +from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar + + +def _make_batches(size, batch_size): + """Returns a list of batch indices (tuples of indices). + + Arguments: + size: Integer, total size of the data to slice into batches. + batch_size: Integer, batch size. + + Returns: + A list of tuples of array indices. + """ + num_batches = int(np.ceil(size / float(batch_size))) + return [(i * batch_size, min(size, (i + 1) * batch_size)) + for i in range(0, num_batches)] + + +def _slice_arrays(arrays, start=None, stop=None): + """Slice an array or list of arrays. + + This takes an array-like, or a list of + array-likes, and outputs: + - arrays[start:stop] if `arrays` is an array-like + - [x[start:stop] for x in arrays] if `arrays` is a list + + Can also work on list/array of indices: `_slice_arrays(x, indices)` + + Arguments: + arrays: Single array or list of arrays. + start: can be an integer index (start index) + or a list/array of indices + stop: integer (stop index); should be None if + `start` was a list. + + Returns: + A slice of the array(s). + + Raises: + ValueError: If the value of start is a list and stop is not None. 
+ """ + if arrays is None: + return [None] + if isinstance(start, list) and stop is not None: + raise ValueError('The stop argument has to be None if the value of start is' + 'a list.') + elif isinstance(arrays, list): + if hasattr(start, '__len__'): + # hdf5 datasets only support list objects as indices + if hasattr(start, 'shape'): + start = start.tolist() + return [None if x is None else x[start] for x in arrays] + else: + return [None if x is None else x[start:stop] for x in arrays] + else: + if hasattr(start, '__len__'): + if hasattr(start, 'shape'): + start = start.tolist() + return arrays[start] + elif hasattr(start, '__getitem__'): + return arrays[start:stop] + else: + return [None] + + +def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None): + if metric == 'accuracy' or metric == 'acc': + # custom handling of accuracy + # (because of class mode duality) + output_shape = internal_output_shapes + if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy: + # case: binary accuracy + acc_fn = metrics_module.binary_accuracy + elif loss_func == losses.sparse_categorical_crossentropy: + # case: categorical accuracy with sparse targets + acc_fn = metrics_module.sparse_categorical_accuracy + else: + acc_fn = metrics_module.categorical_accuracy + + metric_name = 'acc' + return metric_name, acc_fn + else: + metric_fn = metrics_module.get(metric) + metric_name = metric_fn.__name__ + return metric_name, metric_fn + + +def _eager_loss_fn(outputs, targets, loss_fn, output_name): + with K.name_scope(output_name + '_loss'): + loss = loss_fn(targets, outputs) + return loss + + +def _eager_metrics_fn(model, outputs, targets): + """Calculates the metrics for each output of the given model. + + Arguments: + model: The model on which metrics are being calculated. + outputs: The outputs of the given model. + targets: The predictions or targets of the given model. + + Returns: + Returns the metric names and metric results for each output of the model. + """ + metric_names = [] + metric_results = [] + if not isinstance(outputs, list): + outputs = [outputs] + + if not isinstance(targets, list): + targets = [targets] + + for i in range(len(model.outputs)): + output_metrics = model.nested_metrics[i] + for nested_output_metric in output_metrics: + metric_name, metric_fn = _get_metrics_info( + nested_output_metric, model._internal_output_shapes[i], + model.loss_functions[i]) + + if len(model.output_names) > 1: + metric_name = model.output_names[i] + '_' + metric_name + if metric_name not in model.metrics_names: + model.metrics_names.append(metric_name) + + with K.name_scope(metric_name): + metric_result = metric_fn(outputs[i], targets[i]) + metric_names.append(metric_name) + metric_results.append(K.mean(metric_result)) + + return metric_names, metric_results + + +def _model_loss(model, inputs, targets): + """Calculates the loss for a given model. + + Arguments: + model: The model on which metrics are being calculated. + inputs: The inputs of the given model. This is typically the mini batch of + data that is fed to the model. + targets: The predictions or targets of the given model. + + Returns: + Returns the model output, total loss and loss value calculated using the + specified loss function. The total loss includes regularization losses and + applies masking and sample weighting to the loss value. 
+ """ + total_loss = 0 + outs = model(inputs) + if not isinstance(outs, list): + outs = [outs] + + if not isinstance(targets, list): + targets = [targets] + + loss_metrics = [] + with K.name_scope('loss'): + for i, loss_fn in enumerate(model.loss_functions): + # compute the loss + output_loss = _eager_loss_fn(outs[i], targets[i], loss_fn, + model.output_names[i]) + loss_metrics.append(K.mean(output_loss)) + + mask = outs[i]._keras_mask + # adapted from weighted_loss_fn + if mask is not None: + # mask should have the same shape as output_loss + output_loss *= mask + # the loss per batch should be proportional + # to the number of unmasked samples. + output_loss /= K.mean(mask) + + # adapted from weighted_loss_fn + # apply sample weighting + if model.sample_weights: + # reduce score_array to same ndim as weight array + ndim = K.ndim(output_loss) + weight_ndim = K.ndim(model.sample_weights) + output_loss = K.mean(output_loss, axis=list(range(weight_ndim, ndim))) + output_loss *= model.sample_weights + output_loss /= K.mean(K.cast(K.not_equal(model.sample_weights, 0), + K.floatx())) + output_loss = K.mean(output_loss) + + loss_weight = model.loss_weights_list[i] + if total_loss is None: + total_loss = loss_weight * output_loss + else: + total_loss += loss_weight * output_loss + + total_loss = K.mean(total_loss) + # Add regularization losses + custom_losses = [] + for layer in model.layers: + if layer.losses: + custom_losses += layer.losses + + if custom_losses: + total_loss += sum(custom_losses) + + return outs, total_loss, loss_metrics + + +def _process_single_batch(eager_model_inputs, eager_model_outputs, model, + training=True): + """Calculate the loss and gradient for one input batch. + + The model weights are updated if training is set to True. + + Arguments: + eager_model_inputs: Input batch data. + eager_model_outputs: Output batch data. + model: Model whose loss has to be calculated. + training: The boolean represents if the weights of the model are updated. + 'fit' methods will set this to True while 'evaluate' methods will + set this to False. + + Returns: + output of the model, total loss and the loss associated with each output. + + Raises: + ValueError: If the model loss is 0 or if the trainable weights list is + empty when the trainable parameter is set to True. + """ + K.set_learning_phase(training) + with GradientTape() as tape: + outs, loss, loss_metrics = _model_loss(model, eager_model_inputs, + eager_model_outputs) + if loss is None: + raise ValueError('The model cannot be run ' + 'because it has no loss to optimize.') + if training: + if not model._collected_trainable_weights: + raise ValueError('The list of trainable weights is empty. Make sure that ' + 'you are not setting model.trainable to False before ' + 'compiling the model.') + grads = tape.gradient(loss, model._collected_trainable_weights) + model.optimizer.apply_gradients(zip(grads, + model._collected_trainable_weights)) + return outs, loss, loss_metrics + + +def train_on_batch(model, ins): + """Calculates the loss and gradient updates for one input batch. + + Arguments: + model: Given model on which loss and gradients are calculated. + ins: Input and output batch numpy arrays. + + Returns: + total loss and the loss associated with each output. 
+ """ + ins_batch_converted = [] + for ib in ins: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + eager_model_inputs = [] + eager_model_outputs = [] + for i in range(len(model.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + for i in range(len(model.inputs), len(ins_batch_converted)): + eager_model_outputs.append(ins_batch_converted[i]) + outs, loss, _ = _process_single_batch( + eager_model_inputs, eager_model_outputs, model) + if not isinstance(outs, list): + outs = [outs] + _, metrics_results = _eager_metrics_fn( + model, outs, eager_model_outputs) + if not isinstance(loss, list): + loss = [loss] + return loss + metrics_results + + +def test_on_batch(model, ins): + """Calculates the loss for one input batch. + + Arguments: + model: Given model on which loss is calculated. + ins: Input and output batch numpy arrays. + + Returns: + total loss, loss and metrics associated with each output. + """ + ins_batch_converted = [] + for ib in ins: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + eager_model_inputs = [] + eager_model_outputs = [] + for i in range(len(model.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + for i in range(len(model.inputs), len(ins_batch_converted)): + eager_model_outputs.append(ins_batch_converted[i]) + outs, loss, loss_metrics = _process_single_batch( + eager_model_inputs, eager_model_outputs, model, training=False) + if not isinstance(outs, list): + outs = [outs] + metric_names, metrics_results = _eager_metrics_fn( + model, outs, eager_model_outputs) + model.metrics_names.append(metric_names) + if not isinstance(loss, list): + loss = [loss] + return loss + loss_metrics + metrics_results + + +def fit_loop( + model, + ins, + out_labels=None, + batch_size=None, + epochs=100, + verbose=1, + callbacks=None, + val_ins=None, + shuffle=True, + callback_metrics=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None): + """Abstract fit function for `f(ins)`. + + Assume that f returns a list, labeled by out_labels. + + Arguments: + model: Instance of the model that is being executed in Eager mode. + ins: List of tensors to be fed to `f` + out_labels: List of strings, display names of + the outputs of `f` + batch_size: Integer batch size or None if unknown. + epochs: Number of times to iterate over the data + verbose: Verbosity mode, 0, 1 or 2 + callbacks: List of callbacks to be called during training + val_ins: List of tensors to be fed to `val_f` + shuffle: Whether to shuffle the data at the beginning of each epoch + callback_metrics: List of strings, the display names of the metrics + passed to the callbacks. They should be the + concatenation of list the display names of the outputs of + `f` and the list of display names of the outputs of `f_val`. + initial_epoch: Epoch at which to start training + (useful for resuming a previous training run) + steps_per_epoch: Total number of steps (batches of samples) + before declaring one epoch finished and starting the + next epoch. Ignored with the default value of `None`. + validation_steps: Number of steps to run validation for (only if doing + validation from data tensors). Ignored with default value of `None`. + + Returns: + `History` object. + + Raises: + ValueError: In case of invalid argument values. 
+ """ + # Required for Eager mode + K.set_learning_phase(True) + + do_validation = False + if val_ins: + do_validation = True + if (verbose and ins and hasattr(ins[0], 'shape') and + hasattr(val_ins[0], 'shape')): + print('Train on %d samples, validate on %d samples' % + (ins[0].shape[0], val_ins[0].shape[0])) + if validation_steps: + if steps_per_epoch is None: + raise ValueError('Can only use `validation_steps` when doing step-wise ' + 'training, i.e. `steps_per_epoch` must be set.') + do_validation = True + + num_train_samples = model._check_num_samples( + ins, batch_size, steps_per_epoch, 'steps_per_epoch') + + if num_train_samples is not None: + index_array = np.arange(num_train_samples) + + model.history = cbks.History() + callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history] + if verbose: + if steps_per_epoch is not None: + count_mode = 'steps' + else: + count_mode = 'samples' + callbacks += [cbks.ProgbarLogger(count_mode)] + callbacks = cbks.CallbackList(callbacks) + out_labels = out_labels or [] + + # it's possible to callback a different model than self + # (used by Sequential models) + if hasattr(model, 'callback_model') and model.callback_model: + callback_model = model.callback_model + else: + callback_model = model + + callbacks.set_model(callback_model) + + callbacks.set_params({ + 'batch_size': batch_size, + 'epochs': epochs, + 'steps': steps_per_epoch, + 'samples': num_train_samples, + 'verbose': verbose, + 'do_validation': do_validation, + 'metrics': callback_metrics or [], + }) + callbacks.on_train_begin() + callback_model.stop_training = False + for cbk in callbacks: + cbk.validation_data = val_ins + + for epoch in range(initial_epoch, epochs): + callbacks.on_epoch_begin(epoch) + epoch_logs = {} + if shuffle == 'batch': + index_array = model._batch_shuffle(index_array, batch_size) + elif shuffle: + np.random.shuffle(index_array) + + batches = _make_batches(num_train_samples, batch_size) + + for batch_index, (batch_start, batch_end) in enumerate(batches): + batch_ids = index_array[batch_start:batch_end] + try: + if isinstance(ins[-1], float): + # Do not slice the training phase flag. + ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]] + else: + ins_batch = _slice_arrays(ins, batch_ids) + except TypeError: + raise TypeError('TypeError while preparing batch. ' + 'If using HDF5 input data, ' + 'pass shuffle="batch".') + batch_logs = {} + batch_logs['batch'] = batch_index + batch_logs['size'] = len(batch_ids) + + callbacks.on_batch_begin(batch_index, batch_logs) + + ins_batch_converted = [] + for ib in ins_batch: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + eager_model_inputs = [] + eager_model_outputs = [] + for i in range(len(model.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + + for i in range(len(model.inputs), len(ins_batch_converted)): + eager_model_outputs.append(ins_batch_converted[i]) + + outs, loss, loss_metrics = _process_single_batch(eager_model_inputs, + eager_model_outputs, + model) + + if not isinstance(outs, list): + outs = [outs] + + for l, o in zip(out_labels, outs): + batch_logs[l] = o + # Required for Eager mode + metrics_names, metrics_results = _eager_metrics_fn(model, outs, + eager_model_outputs) + batch_logs['loss'] = tensor_util.constant_value(K.mean(loss)) + + # TODO(anjalisridhar): Move this to compile to avoid duplicate code. + # In graph mode we set the metric names in compile. However in + # Eager mode we calculate the metrics for each batch in fit_loop. 
+ # We could calculate the metric names and functions in compile. + # This would avoid setting the callback parameters separately. + # We need to do this for the first iteration alone + for m in metrics_names: + if m not in callback_metrics: + callback_metrics.append(m) + + callbacks.set_params({ + 'batch_size': batch_size, + 'epochs': epochs, + 'steps': steps_per_epoch, + 'samples': num_train_samples, + 'verbose': verbose, + 'do_validation': do_validation, + 'metrics': callback_metrics or [], + }) + + for k, v in zip(model.metrics_names, + [K.mean(loss)] + loss_metrics + metrics_results): + batch_logs[k] = tensor_util.constant_value(v) + + callbacks.on_batch_end(batch_index, batch_logs) + if callback_model.stop_training: + break + + if batch_index == len(batches) - 1: # Last batch. + if do_validation: + val_outs = test_loop( + model, val_ins, batch_size=batch_size, verbose=0) + if not isinstance(val_outs, list): + val_outs = [val_outs] + # Same labels assumed. + for l, o in zip(out_labels, val_outs): + epoch_logs['val_' + l] = o + callbacks.on_epoch_end(epoch, epoch_logs) + if callback_model.stop_training: + break + callbacks.on_train_end() + return model.history + + +def test_loop(model, ins, batch_size=None, verbose=0, steps=None): + """Abstract method to loop over some data in batches. + + Arguments: + model: Model instance that is being evaluated in Eager mode. + ins: list of tensors to be fed to `f`. + batch_size: integer batch size or `None`. + verbose: verbosity mode. + steps: Total number of steps (batches of samples) + before declaring predictions finished. + Ignored with the default value of `None`. + + Returns: + Scalar loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + """ + K.set_learning_phase(False) + num_samples = model._check_num_samples(ins, batch_size, steps, 'steps') + outs = [] + if verbose == 1: + progbar = Progbar(target=num_samples) + batches = _make_batches(num_samples, batch_size) + index_array = np.arange(num_samples) + for batch_index, (batch_start, batch_end) in enumerate(batches): + batch_ids = index_array[batch_start:batch_end] + if isinstance(ins[-1], float): + # Do not slice the training phase flag. + ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]] + else: + ins_batch = _slice_arrays(ins, batch_ids) + + ins_batch_converted = [] + for ib in ins_batch: + ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx())) + + eager_model_inputs = [] + eager_model_outputs = [] + for i in range(len(model.inputs)): + eager_model_inputs.append(ins_batch_converted[i]) + + for i in range(len(model.inputs), len(ins_batch_converted)): + eager_model_outputs.append(ins_batch_converted[i]) + + loss_outs, loss, loss_metrics = _model_loss(model, eager_model_inputs, + eager_model_outputs) + _, metrics_results = _eager_metrics_fn(model, loss_outs, + eager_model_outputs) + batch_outs = [] + for _, v in zip(model.metrics_names, + [K.mean(loss)] + loss_metrics + metrics_results): + batch_outs.append(tensor_util.constant_value(v)) + + if isinstance(batch_outs, list): + if batch_index == 0: + for batch_out in enumerate(batch_outs): + outs.append(0.) + for i, batch_out in enumerate(batch_outs): + outs[i] += batch_out * len(batch_ids) + else: + if batch_index == 0: + outs.append(0.) 
+      outs[0] += batch_outs * len(batch_ids)
+
+    if verbose == 1:
+      progbar.update(batch_end)
+  for i in range(len(outs)):
+    outs[i] /= num_samples
+  if len(outs) == 1:
+    return outs[0]
+  return outs
+
+
+def predict_loop(model, ins, batch_size=32, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+    model: Keras model instance.
+    ins: list of tensors to be fed to the model.
+    batch_size: integer batch size.
+    verbose: verbosity mode.
+    steps: Total number of steps (batches of samples)
+        before declaring `predict_loop` finished.
+        Ignored with the default value of `None`.
+
+  Returns:
+    Array of predictions (if the model has a single output)
+    or list of arrays of predictions
+    (if the model has multiple outputs).
+  """
+  K.set_learning_phase(False)
+  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
+  if verbose == 1:
+    if steps is not None:
+      progbar = Progbar(target=steps)
+    else:
+      progbar = Progbar(target=num_samples)
+
+  outs = []
+  batches = _make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    if ins and isinstance(ins[-1], float):
+      # Do not slice the training phase flag.
+      ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+    else:
+      ins_batch = _slice_arrays(ins, batch_ids)
+
+    ins_batch_converted = []
+    for ib in ins_batch:
+      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+    eager_model_inputs = []
+    for i in range(len(model.inputs)):
+      eager_model_inputs.append(ins_batch_converted[i])
+
+    batch_outs = model(eager_model_inputs)
+
+    if not isinstance(batch_outs, list):
+      batch_outs = [batch_outs]
+    if batch_index == 0:
+      # Pre-allocate the results arrays.
+      for batch_out in batch_outs:
+        dims = batch_out.shape[1:].dims
+        dims_list = [d.value for d in dims]
+        shape = (num_samples,) + tuple(dims_list)
+        outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
+    for i, batch_out in enumerate(batch_outs):
+      outs[i][batch_start:batch_end] = batch_out
+    if verbose == 1:
+      progbar.update(batch_end)
+  if len(outs) == 1:
+    return outs[0]
+  return outs
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
new file mode 100644
index 00000000000..81e2f7a5145
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -0,0 +1,755 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests for training routines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import numpy as np + +from tensorflow.python.framework import ops +from tensorflow.python.keras._impl import keras +from tensorflow.python.keras._impl.keras import testing_utils +from tensorflow.python.platform import test +from tensorflow.python.training.rmsprop import RMSPropOptimizer + + +class TrainingTest(test.TestCase): + + def test_fit_on_arrays(self): + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') + + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) + + model = keras.models.Model([a, b], [d, e]) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + loss_weights = [1., 0.5] + metrics = ['mae'] + model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + # Test fit at different verbosity + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=0) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=1) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=2, + batch_size=5, + verbose=2) + + # Test with validation data + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + validation_data=([input_a_np, input_b_np], [output_d_np, + output_e_np]), + epochs=1, + batch_size=5, + verbose=0) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + validation_data=([input_a_np, input_b_np], [output_d_np, + output_e_np]), + epochs=2, + batch_size=5, + verbose=1) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + validation_data=([input_a_np, input_b_np], [output_d_np, + output_e_np]), + epochs=2, + batch_size=5, + verbose=2) + model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) + + # Test with validation split + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=2, + batch_size=5, + verbose=0, + validation_split=0.2) + + # Test with dictionary inputs + model.fit( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + epochs=1, + batch_size=5, + verbose=0) + model.fit( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + epochs=1, + batch_size=5, + verbose=1) + model.fit( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + validation_data=({'input_a': input_a_np, + 'input_b': input_b_np + }, + { + 'dense': output_d_np, + 'dropout': output_e_np + }), + epochs=1, + batch_size=5, + verbose=0) + model.train_on_batch({ + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}) + # Test with lists for loss, metrics + loss = ['mae', 'mse'] + metrics = ['acc', 'mae'] + model.compile(optimizer, loss, metrics=metrics) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=0) + + # Test with dictionaries for loss, metrics, loss weights + loss = 
{'dense': 'mse', 'dropout': 'mae'} + loss_weights = {'dense': 1., 'dropout': 0.5} + metrics = {'dense': 'mse', 'dropout': 'mae'} + model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights) + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + batch_size=5, + verbose=0) + + # Invalid use cases + with self.assertRaises(AttributeError): + model.fit( + [input_a_np, input_b_np], [output_d_np, output_e_np], + epochs=1, + validation_data=([input_a_np, input_b_np], 0, 0), + verbose=0) + with self.assertRaises(ValueError): + model.train_on_batch({'input_a': input_a_np}, + [output_d_np, output_e_np]) + with self.assertRaises(ValueError): + model.train_on_batch([input_a_np], [output_d_np, output_e_np]) + with self.assertRaises(AttributeError): + model.train_on_batch(1, [output_d_np, output_e_np]) + with self.assertRaises(ValueError): + model.train_on_batch(input_a_np, [output_d_np, output_e_np]) + with self.assertRaises(ValueError): + bad_input = np.random.random((11, 3)) + model.train_on_batch([bad_input, input_b_np], + [output_d_np, output_e_np]) + with self.assertRaises(ValueError): + bad_target = np.random.random((11, 4)) + model.train_on_batch([input_a_np, input_b_np], + [bad_target, output_e_np]) + + # Build single-input model + x = keras.layers.Input(shape=(3,), name='input_a') + y = keras.layers.Dense(4)(x) + model = keras.models.Model(x, y) + model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse') + # This will work + model.fit([input_a_np], output_d_np, epochs=1) + with self.assertRaises(ValueError): + model.fit([input_a_np, input_a_np], output_d_np, epochs=1) + + def test_evaluate_predict_on_arrays(self): + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') + + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) + + model = keras.models.Model([a, b], [d, e]) + + optimizer = RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + loss_weights = [1., 0.5] + metrics = ['mae'] + model.compile( + optimizer, + loss, + metrics=metrics, + loss_weights=loss_weights, + sample_weight_mode=None) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + # Test evaluate at different verbosity + out = model.evaluate( + [input_a_np, input_b_np], [output_d_np, output_e_np], + batch_size=5, + verbose=0) + self.assertEqual(len(out), 5) + out = model.evaluate( + [input_a_np, input_b_np], [output_d_np, output_e_np], + batch_size=5, + verbose=1) + self.assertEqual(len(out), 5) + out = model.evaluate( + [input_a_np, input_b_np], [output_d_np, output_e_np], + batch_size=5, + verbose=2) + self.assertEqual(len(out), 5) + out = model.test_on_batch([input_a_np, input_b_np], + [output_d_np, output_e_np]) + self.assertEqual(len(out), 5) + + # Test evaluate with dictionary inputs + model.evaluate( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + batch_size=5, + verbose=0) + model.evaluate( + { + 'input_a': input_a_np, + 'input_b': input_b_np + }, {'dense': output_d_np, + 'dropout': output_e_np}, + batch_size=5, + verbose=1) + + # Test predict + out = model.predict([input_a_np, input_b_np], batch_size=5) + self.assertEqual(len(out), 2) + out = model.predict({'input_a': input_a_np, 'input_b': input_b_np}) + self.assertEqual(len(out), 2) + out = 
model.predict_on_batch({ + 'input_a': input_a_np, + 'input_b': input_b_np + }) + self.assertEqual(len(out), 2) + + def test_invalid_loss_or_metrics(self): + num_classes = 5 + train_samples = 1000 + test_samples = 1000 + input_dim = 5 + + model = keras.models.Sequential() + model.add(keras.layers.Dense(10, input_shape=(input_dim,))) + model.add(keras.layers.Activation('relu')) + model.add(keras.layers.Dense(num_classes)) + model.add(keras.layers.Activation('softmax')) + model.compile(loss='categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + np.random.seed(1337) + + (x_train, y_train), (_, _) = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + + with self.assertRaises(ValueError): + model.fit(x_train, np.concatenate([y_train, y_train], axis=-1)) + + with self.assertRaises(TypeError): + model.compile(loss='categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001), + metrics=set(0)) + + with self.assertRaises(ValueError): + model.compile(loss=None, + optimizer='rms') + + +class LossWeightingTest(test.TestCase): + + def test_class_weights(self): + num_classes = 5 + batch_size = 5 + epochs = 5 + weighted_class = 3 + train_samples = 3000 + test_samples = 3000 + input_dim = 5 + + model = keras.models.Sequential() + model.add(keras.layers.Dense(10, input_shape=(input_dim,))) + model.add(keras.layers.Activation('relu')) + model.add(keras.layers.Dense(num_classes)) + model.add(keras.layers.Activation('softmax')) + model.compile(loss='categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + + np.random.seed(1337) + (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + int_y_test = y_test.copy() + int_y_train = y_train.copy() + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + y_test = keras.utils.to_categorical(y_test, num_classes) + test_ids = np.where(int_y_test == np.array(weighted_class))[0] + + class_weight = dict([(i, 1.) for i in range(num_classes)]) + class_weight[weighted_class] = 2. + + sample_weight = np.ones((y_train.shape[0])) + sample_weight[int_y_train == weighted_class] = 2. 
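+    # class_weight above doubles the loss contribution of `weighted_class`;
+    # the sample_weight array encodes the equivalent per-example weighting
+    # (1. everywhere, 2. on examples of `weighted_class`). The fits below
+    # train with class_weight, and the final evaluate() calls check that the
+    # loss on the up-weighted class ends up below the overall test loss.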
+ + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + class_weight=class_weight, + validation_data=(x_train, y_train, sample_weight)) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 2, + verbose=0, + class_weight=class_weight) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 2, + verbose=0, + class_weight=class_weight, + validation_split=0.1) + + model.train_on_batch( + x_train[:batch_size], y_train[:batch_size], class_weight=class_weight) + ref_score = model.evaluate(x_test, y_test, verbose=0) + score = model.evaluate( + x_test[test_ids, :], y_test[test_ids, :], verbose=0) + self.assertLess(score, ref_score) + + def test_sample_weights(self): + num_classes = 5 + batch_size = 5 + epochs = 5 + weighted_class = 3 + train_samples = 3000 + test_samples = 3000 + input_dim = 5 + + model = keras.models.Sequential() + model.add(keras.layers.Dense(10, input_shape=(input_dim,))) + model.add(keras.layers.Activation('relu')) + model.add(keras.layers.Dense(num_classes)) + model.add(keras.layers.Activation('softmax')) + model.compile(loss='categorical_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + + np.random.seed(43) + (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + int_y_test = y_test.copy() + int_y_train = y_train.copy() + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + y_test = keras.utils.to_categorical(y_test, num_classes) + test_ids = np.where(int_y_test == np.array(weighted_class))[0] + + class_weight = dict([(i, 1.) for i in range(num_classes)]) + class_weight[weighted_class] = 2. + + sample_weight = np.ones((y_train.shape[0])) + sample_weight[int_y_train == weighted_class] = 2. + + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + sample_weight=sample_weight) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs // 3, + verbose=0, + sample_weight=sample_weight, + validation_split=0.1) + model.train_on_batch( + x_train[:batch_size], + y_train[:batch_size], + sample_weight=sample_weight[:batch_size]) + model.test_on_batch( + x_train[:batch_size], + y_train[:batch_size], + sample_weight=sample_weight[:batch_size]) + + def test_temporal_sample_weights(self): + num_classes = 5 + weighted_class = 3 + train_samples = 1000 + test_samples = 1000 + input_dim = 5 + timesteps = 3 + + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(num_classes), + input_shape=(timesteps, input_dim))) + model.add(keras.layers.Activation('softmax')) + + np.random.seed(1337) + (_, y_train), _ = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + int_y_train = y_train.copy() + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + + class_weight = dict([(i, 1.) for i in range(num_classes)]) + class_weight[weighted_class] = 2. + + sample_weight = np.ones((y_train.shape[0])) + sample_weight[int_y_train == weighted_class] = 2. 
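+    # The class/sample weights built above are not used below: this test only
+    # verifies that compiling with sample_weight_mode='temporal' raises a
+    # ValueError when running under eager execution.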
+ with self.assertRaises(ValueError): + model.compile( + loss='binary_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001), + sample_weight_mode='temporal') + + def test_class_weight_invalid_use_case(self): + num_classes = 5 + train_samples = 1000 + test_samples = 1000 + input_dim = 5 + timesteps = 3 + + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(num_classes), + input_shape=(timesteps, input_dim))) + model.add(keras.layers.Activation('softmax')) + model.compile( + loss='binary_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001)) + + (x_train, y_train), _ = testing_utils.get_test_data( + train_samples=train_samples, + test_samples=test_samples, + input_shape=(input_dim,), + num_classes=num_classes) + # convert class vectors to binary class matrices + y_train = keras.utils.to_categorical(y_train, num_classes) + class_weight = dict([(i, 1.) for i in range(num_classes)]) + + del class_weight[1] + with self.assertRaises(ValueError): + model.fit(x_train, y_train, + epochs=0, verbose=0, class_weight=class_weight) + + with self.assertRaises(ValueError): + model.compile( + loss='binary_crossentropy', + optimizer=RMSPropOptimizer(learning_rate=0.001), + sample_weight_mode=[]) + + # Build multi-output model + x = keras.Input((3,)) + y1 = keras.layers.Dense(4, name='1')(x) + y2 = keras.layers.Dense(4, name='2')(x) + model = keras.models.Model(x, [y1, y2]) + model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse') + x_np = np.random.random((10, 3)) + y_np = np.random.random((10, 4)) + w_np = np.random.random((10,)) + # This will work + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': w_np}) + # These will not + with self.assertRaises(ValueError): + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=[w_np]) + with self.assertRaises(TypeError): + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=w_np) + with self.assertRaises(ValueError): + bad_w_np = np.random.random((11,)) + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) + with self.assertRaises(ValueError): + bad_w_np = np.random.random((10, 2)) + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) + with self.assertRaises(ValueError): + bad_w_np = np.random.random((10, 2, 2)) + model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np}) + + +class TestDynamicTrainability(test.TestCase): + + def test_trainable_warning(self): + x = np.random.random((5, 3)) + y = np.random.random((5, 2)) + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_dim=3)) + model.trainable = False + model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse') + model.trainable = True + with self.assertRaises(ValueError): + model.train_on_batch(x, y) + + def test_trainable_argument(self): + x = np.random.random((5, 3)) + y = np.random.random((5, 2)) + + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_dim=3, trainable=False)) + model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse') + out = model.predict(x) + with self.assertRaises(ValueError): + model.train_on_batch(x, y) + out_2 = model.predict(x) + self.assertAllClose(out, out_2) + + # test with nesting + inputs = keras.layers.Input(shape=(3,)) + output = model(inputs) + model = keras.models.Model(inputs, output) + model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse') + out = model.predict(x) + with self.assertRaises(ValueError): + model.train_on_batch(x, y) + out_2 = model.predict(x) + 
self.assertAllClose(out, out_2) + + def test_layer_trainability_switch(self): + # with constructor argument, in Sequential + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, trainable=False, input_dim=1)) + self.assertListEqual(model.trainable_weights, []) + + # by setting the `trainable` argument, in Sequential + model = keras.models.Sequential() + layer = keras.layers.Dense(2, input_dim=1) + model.add(layer) + self.assertListEqual(model.trainable_weights, layer.trainable_weights) + layer.trainable = False + self.assertListEqual(model.trainable_weights, []) + + # with constructor argument, in Model + x = keras.layers.Input(shape=(1,)) + y = keras.layers.Dense(2, trainable=False)(x) + model = keras.models.Model(x, y) + self.assertListEqual(model.trainable_weights, []) + + # by setting the `trainable` argument, in Model + x = keras.layers.Input(shape=(1,)) + layer = keras.layers.Dense(2) + y = layer(x) + model = keras.models.Model(x, y) + self.assertListEqual(model.trainable_weights, layer.trainable_weights) + layer.trainable = False + self.assertListEqual(model.trainable_weights, []) + + def test_model_trainability_switch(self): + # a non-trainable model has no trainable weights + x = keras.layers.Input(shape=(1,)) + y = keras.layers.Dense(2)(x) + model = keras.models.Model(x, y) + model.trainable = False + self.assertListEqual(model.trainable_weights, []) + + # same for Sequential + model = keras.models.Sequential() + model.add(keras.layers.Dense(2, input_dim=1)) + model.trainable = False + self.assertListEqual(model.trainable_weights, []) + + def test_nested_model_trainability(self): + + # a Sequential inside a Model + inner_model = keras.models.Sequential() + inner_model.add(keras.layers.Dense(2, input_dim=1)) + + x = keras.layers.Input(shape=(1,)) + y = inner_model(x) + outer_model = keras.models.Model(x, y) + self.assertListEqual(outer_model.trainable_weights, + inner_model.trainable_weights) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Sequential inside a Sequential + inner_model = keras.models.Sequential() + inner_model.add(keras.layers.Dense(2, input_dim=1)) + outer_model = keras.models.Sequential() + outer_model.add(inner_model) + self.assertListEqual(outer_model.trainable_weights, + inner_model.trainable_weights) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Model inside a Model + x = keras.layers.Input(shape=(1,)) + y = keras.layers.Dense(2)(x) + inner_model = keras.models.Model(x, y) + x = keras.layers.Input(shape=(1,)) + y = inner_model(x) + outer_model = keras.models.Model(x, y) + self.assertListEqual(outer_model.trainable_weights, + inner_model.trainable_weights) + inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + # a Model inside a Sequential + x = keras.layers.Input(shape=(1,)) + y = keras.layers.Dense(2)(x) + inner_model = keras.models.Model(x, y) + outer_model = keras.models.Sequential() + outer_model.add(inner_model) + self.assertListEqual(outer_model.trainable_weights, + inner_model.trainable_weights) + 
inner_model.trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + inner_model.trainable = True + inner_model.layers[-1].trainable = False + self.assertListEqual(outer_model.trainable_weights, []) + + +class TestTrainingUtils(test.TestCase): + + def test_check_array_lengths(self): + keras.engine.training._check_array_lengths(None, None, None) + a_np = np.random.random((4, 3, 3)) + keras.engine.training._check_array_lengths(a_np, a_np, a_np) + keras.engine.training._check_array_lengths( + [a_np, a_np], [a_np, a_np], [a_np, a_np]) + keras.engine.training._check_array_lengths([None], [None], [None]) + + b_np = np.random.random((3, 4)) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths(a_np, None, None) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths(a_np, a_np, None) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths([a_np], [None], None) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths([a_np], [b_np], None) + with self.assertRaises(ValueError): + keras.engine.training._check_array_lengths([a_np], None, [b_np]) + + def test_slice_arrays(self): + input_a = np.random.random((10, 3)) + keras.engine.training._slice_arrays(None) + keras.engine.training._slice_arrays(input_a, 0) + keras.engine.training._slice_arrays(input_a, 0, 1) + keras.engine.training._slice_arrays(input_a, stop=2) + input_a = [None, [1, 1], None, [1, 1]] + keras.engine.training._slice_arrays(input_a, 0) + keras.engine.training._slice_arrays(input_a, 0, 1) + keras.engine.training._slice_arrays(input_a, stop=2) + input_a = [None] + keras.engine.training._slice_arrays(input_a, 0) + keras.engine.training._slice_arrays(input_a, 0, 1) + keras.engine.training._slice_arrays(input_a, stop=2) + input_a = None + keras.engine.training._slice_arrays(input_a, 0) + keras.engine.training._slice_arrays(input_a, 0, 1) + keras.engine.training._slice_arrays(input_a, stop=2) + + def test_fit_with_BatchNorm(self): + model = keras.models.Sequential() + model.add(keras.layers.Dense(10, input_dim=4)) + model.add(keras.layers.BatchNormalization()) + model.add(keras.layers.Activation('tanh')) + model.add(keras.layers.Dropout(0.2)) + + input_a_np = np.random.random((10, 4)) + output_b_np = np.random.random((10, 10)) + + model.compile(loss='binary_crossentropy', optimizer=RMSPropOptimizer(0.001)) + model.fit(input_a_np, output_b_np, epochs=1, batch_size=5, verbose=0) + + def test_fit_with_regularization(self): + model = keras.models.Sequential() + with self.assertRaises(ValueError): + model.add( + keras.layers.Dense(4, input_dim=3, + kernel_regularizer=keras.regularizers.l2(0.01), + activity_regularizer=keras.regularizers.l1(0.01))) + + +if __name__ == '__main__': + # Bazel sets these environment variables to very long paths. + # Tempfile uses them to create long paths, and in turn multiprocessing + # library tries to create sockets named after paths. Delete whatever bazel + # writes to these to avoid tests failing due to socket addresses being too + # long. 
+ for var in ('TMPDIR', 'TMP', 'TEMP'): + if var in os.environ: + del os.environ[var] + + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py index 5a033a04ade..b380238e4e2 100644 --- a/tensorflow/python/keras/_impl/keras/engine/training_test.py +++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py @@ -78,6 +78,14 @@ class TrainingTest(test.TestCase): verbose=2) model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np]) + # Test model with input data as a list of lists + model.fit( + [np.ndarray.tolist(input_a_np), np.ndarray.tolist(input_b_np)], + [output_d_np, output_e_np], + epochs=2, + batch_size=5, + verbose=2) + # Test with validation data model.fit( [input_a_np, input_b_np], [output_d_np, output_e_np], @@ -205,6 +213,16 @@ class TrainingTest(test.TestCase): with self.assertRaises(ValueError): model.fit([input_a_np, input_a_np], output_d_np, epochs=1) + # Test model on a list of floats + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 4)) + + model.fit([np.ndarray.tolist(input_a_np)], + [np.ndarray.tolist(input_b_np)], + epochs=2, + batch_size=5, + verbose=2) + def test_evaluate_predict_on_arrays(self): with self.test_session(): a = keras.layers.Input(shape=(3,), name='input_a') diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py index 6ee3fb48b2f..ea2d3f2f04a 100644 --- a/tensorflow/python/keras/_impl/keras/layers/core.py +++ b/tensorflow/python/keras/_impl/keras/layers/core.py @@ -23,6 +23,7 @@ import types as python_types import numpy as np +from tensorflow.python.eager import context from tensorflow.python.framework import tensor_shape from tensorflow.python.keras._impl.keras import activations from tensorflow.python.keras._impl.keras import backend as K @@ -119,7 +120,8 @@ class Dropout(tf_core_layers.Dropout, Layer): if training is None: training = K.learning_phase() output = super(Dropout, self).call(inputs, training=training) - if training is K.learning_phase(): + # EagerTensor object has no attribute _uses_learning_phase + if not context.in_eager_mode() and training is K.learning_phase(): output._uses_learning_phase = True # pylint: disable=protected-access return output diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py index 965ef70e6e6..eecb14ceaa3 100644 --- a/tensorflow/python/keras/_impl/keras/layers/normalization.py +++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.eager import context from tensorflow.python.keras._impl.keras import backend as K from tensorflow.python.keras._impl.keras import constraints from tensorflow.python.keras._impl.keras import initializers @@ -108,7 +109,7 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer): if training is None: training = K.learning_phase() output = super(BatchNormalization, self).call(inputs, training=training) - if training is K.learning_phase(): + if context.in_graph_mode() and training is K.learning_phase(): output._uses_learning_phase = True # pylint: disable=protected-access return output diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py 
index e47987aadc4..a55a5e39a69 100644 --- a/tensorflow/python/keras/_impl/keras/optimizers.py +++ b/tensorflow/python/keras/_impl/keras/optimizers.py @@ -24,6 +24,7 @@ import copy import six from six.moves import zip # pylint: disable=redefined-builtin +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes as dtypes_module from tensorflow.python.framework import ops from tensorflow.python.keras._impl.keras import backend as K @@ -680,7 +681,14 @@ class TFOptimizer(Optimizer): def __init__(self, optimizer): # pylint: disable=super-init-not-called self.optimizer = optimizer with K.name_scope(self.__class__.__name__): - self.iterations = K.variable(0, dtype='int64', name='iterations') + if context.in_graph_mode(): + self.iterations = K.variable(0, dtype='int64', name='iterations') + + def apply_gradients(self, grads): + self.optimizer.apply_gradients(grads) + + def get_grads(self, loss, params): + return self.optimizer.compute_gradients(loss, params) def get_updates(self, loss, params): grads = self.optimizer.compute_gradients(loss, params) diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 8c1d16c2a8f..d4ceb2e489c 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1043,6 +1043,7 @@ tf_py_test( "//tensorflow/python:training", "//tensorflow/python:util", "//tensorflow/python:variables", + "//tensorflow/python/eager:function", ], ) @@ -1293,7 +1294,7 @@ cuda_py_test( cuda_py_test( name = "control_flow_ops_py_test", - # TOOD(b/70473603): change this back to "small" once the C API is + # TODO(b/70473603): change this back to "small" once the C API is # permanently enabled size = "medium", srcs = ["control_flow_ops_py_test.py"], @@ -1600,6 +1601,19 @@ cuda_py_test( ], ) +cuda_py_test( + name = "manip_ops_test", + size = "small", + srcs = ["manip_ops_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:manip_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + ], + tags = ["no_windows_gpu"], +) + cuda_py_test( name = "matmul_op_test", size = "small", @@ -2821,7 +2835,7 @@ tf_py_test( "//tensorflow/python:random_ops", "//tensorflow/python:variables", ], - shard_count = 3, + shard_count = 10, tags = ["no_windows_gpu"], ) diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py index a96b88d96ff..7cff3e227c7 100644 --- a/tensorflow/python/kernel_tests/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops_test.py @@ -952,6 +952,32 @@ class SliceAssignTest(test_util.TensorFlowTestCase): v = variables.Variable([1, 2]) sess.run(v[:].assign([1, 2])) + def testTypeError(self): + init_val = constant_op.constant([1, 2], dtype=dtypes.int32) + too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8) + too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64) + v = variables.Variable(init_val) + with self.assertRaises(TypeError): + v[:].assign(too_small_val) + with self.assertRaises(TypeError): + v[:].assign(too_large_val) + + def testTypeErrorResource(self): + init_val = constant_op.constant([1, 2], dtype=dtypes.int32) + too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8) + too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64) + v = resource_variable_ops.ResourceVariable(init_val) + with self.test_session() as sess: + sess.run(v.initializer) + with self.assertRaisesRegexp( + 
errors.InvalidArgumentError, + "l-value dtype int32 does not match r-value dtype int64"): + sess.run(v[:].assign(too_large_val)) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + "l-value dtype int32 does not match r-value dtype int8"): + sess.run(v[:].assign(too_small_val)) + class ShapeSizeRankTest(test_util.TensorFlowTestCase): @@ -1114,6 +1140,26 @@ class InvertPermutationTest(test_util.TensorFlowTestCase): self.assertAllEqual(y.get_shape(), [5]) self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1]) +class UnravelIndexTest(test_util.TensorFlowTestCase): + + def testUnravelIndex(self): + with self.test_session(): + for dtype in [dtypes.int32, dtypes.int64]: + indices_1 = constant_op.constant(1621, dtype=dtype) + dims_1 = constant_op.constant([6, 7, 8, 9], dtype=dtype) + out_1 = array_ops.unravel_index(indices_1, dims_1) + self.assertAllEqual(out_1.eval(), [3, 1, 4, 1]) + + indices_2 = constant_op.constant([1621], dtype=dtype) + dims_2 = constant_op.constant([6, 7, 8, 9], dtype=dtype) + out_2 = array_ops.unravel_index(indices_2, dims_2) + self.assertAllEqual(out_2.eval(), [[3], [1], [4], [1]]) + + indices_3 = constant_op.constant([22, 41, 37], dtype=dtype) + dims_3 = constant_op.constant([7, 6], dtype=dtype) + out_3 = array_ops.unravel_index(indices_3, dims_3) + self.assertAllEqual(out_3.eval(), [[3, 6, 6], [4, 5, 1]]) + class GuaranteeConstOpTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py index 04248fb2bab..2d1b3d9b7e8 100644 --- a/tensorflow/python/kernel_tests/atrous_convolution_test.py +++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py @@ -81,6 +81,7 @@ class AtrousConvolutionTest(test.TestCase): otherwise, it's delayed after the context. 
""" checks = [] + def add_check(check, *args, **kwargs): if context.in_eager_mode(): args_val, kwargs_val = self.evaluate([args, kwargs]) @@ -96,12 +97,12 @@ class AtrousConvolutionTest(test.TestCase): def _test_atrous_convolution(self, add_check, input_shape, filter_shape, dilation_rate, **kwargs): - filters = np.arange(np.prod(filter_shape), - dtype=np.float32).reshape(filter_shape) + filters = np.arange( + np.prod(filter_shape), dtype=np.float32).reshape(filter_shape) filters_upsampled = upsample_filters(filters, dilation_rate) x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) - y1 = nn_ops.convolution(input=x, filter=filters, - dilation_rate=dilation_rate, **kwargs) + y1 = nn_ops.convolution( + input=x, filter=filters, dilation_rate=dilation_rate, **kwargs) y2 = nn_ops.convolution(input=x, filter=filters_upsampled, **kwargs) def check(y1_eval, y2_eval): @@ -112,13 +113,15 @@ class AtrousConvolutionTest(test.TestCase): def test_unknown_spatial_dims_for_channel_last_format(self): x = array_ops.placeholder(dtypes.float32, [1, None, None, 10]) w = array_ops.zeros([3, 3, 10, 20]) - y = nn_ops.convolution(x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC") + y = nn_ops.convolution( + x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC") self.assertEqual(y.shape.as_list(), [1, None, None, 20]) def test_unknown_spatial_dims_for_channel_first_format(self): x = array_ops.placeholder(dtypes.float32, [1, 10, None, None]) w = array_ops.zeros([3, 3, 10, 20]) - y = nn_ops.convolution(x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW") + y = nn_ops.convolution( + x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW") self.assertEqual(y.shape.as_list(), [1, 20, None, None]) @test_util.run_in_graph_and_eager_modes() @@ -215,28 +218,35 @@ class AtrousConvolutionTest(test.TestCase): def combined_op(converted_input, num_spatial_dims, padding_arg): # pylint: disable=unused-argument # pylint: disable=cell-var-from-loop - result = nn_ops.convolution(input=converted_input, filter=f1, - padding=padding) - result = nn_ops.convolution(input=result, filter=f2, - padding=padding) + result = nn_ops.convolution( + input=converted_input, filter=f1, padding=padding) + result = nn_ops.convolution( + input=result, filter=f2, padding=padding) # pylint: enable=cell-var-from-loop return result for rate_height in range(2, 4): for rate_width in range(2, 4): dilation_rate = [rate_height, rate_width] - y1 = nn_ops.convolution(input=x, filter=f1, padding=padding, - dilation_rate=dilation_rate) - y1 = nn_ops.convolution(input=y1, filter=f2, - padding=padding, - dilation_rate=dilation_rate) + y1 = nn_ops.convolution( + input=x, + filter=f1, + padding=padding, + dilation_rate=dilation_rate) + y1 = nn_ops.convolution( + input=y1, + filter=f2, + padding=padding, + dilation_rate=dilation_rate) y2 = nn_ops.with_space_to_batch( - input=x, dilation_rate=dilation_rate, op=combined_op, + input=x, + dilation_rate=dilation_rate, + op=combined_op, padding="VALID") def check(y1_eval, y2_eval): - self.assertAllClose(y1_eval, y2_eval, rtol=1e-2, - atol=1e-2) + self.assertAllClose(y1_eval, y2_eval, rtol=1e-2, atol=1e-2) + add_check(check, y1, y2) def _test_gradient(self, x_shape, f_shape, dilation_rate, padding): diff --git a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py index 88b3f20469a..28b3dc45e9c 100644 --- a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py +++ 
b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py @@ -80,7 +80,7 @@ class RangeSamplerOpsTest(test.TestCase): with self.test_session(): true_classes = constant_op.constant( [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64) - _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler( + _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler( # pylint: disable=line-too-long true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True) sampled_log_expected_count = math_ops.log(sampled_expected_count) result = sampled_log_expected_count.eval() diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 6e18ed132cd..4fafc36014e 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -44,6 +44,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import gen_control_flow_ops from tensorflow.python.ops import gen_data_flow_ops from tensorflow.python.ops import gen_logging_ops from tensorflow.python.ops import gen_state_ops @@ -143,7 +144,7 @@ class ControlFlowTest(test.TestCase): enter_v = control_flow_ops._Enter(v, "foo_1", is_constant=True) nine = constant_op.constant(9) - enter_nine = control_flow_ops.enter(nine, "foo_1") + enter_nine = gen_control_flow_ops._enter(nine, "foo_1") op = state_ops.assign(enter_v, enter_nine) v2 = control_flow_ops.with_dependencies([op], enter_v) v3 = control_flow_ops.exit(v2) @@ -163,9 +164,9 @@ class ControlFlowTest(test.TestCase): def testEnterMulExit(self): with self.test_session(): data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data") - enter_data = control_flow_ops.enter(data, "foo_1", False) + enter_data = gen_control_flow_ops._enter(data, "foo_1", False) five = constant_op.constant(5) - enter_five = control_flow_ops.enter(five, "foo_1", False) + enter_five = gen_control_flow_ops._enter(five, "foo_1", False) mul_op = math_ops.multiply(enter_data, enter_five) exit_op = control_flow_ops.exit(mul_op) @@ -177,12 +178,13 @@ class ControlFlowTest(test.TestCase): v = variables.Variable([0.0, 0.0], dtype=dtypes.float32) # If is_constant=True, the shape information should be propagated. - enter_v_constant = control_flow_ops.enter(v, "frame1", is_constant=True) + enter_v_constant = gen_control_flow_ops._enter( + v, "frame1", is_constant=True) self.assertEqual(enter_v_constant.shape, [2]) # Otherwise, the shape should be unknown. 
- enter_v_non_constant = control_flow_ops.enter(v, "frame2", - is_constant=False) + enter_v_non_constant = gen_control_flow_ops._enter( + v, "frame2", is_constant=False) self.assertEqual(enter_v_non_constant.shape, None) def testSwitchMergeIndexedSlices(self): @@ -255,8 +257,8 @@ class ControlFlowTest(test.TestCase): false = ops.convert_to_tensor(False) n = constant_op.constant(10) - enter_false = control_flow_ops.enter(false, "foo_1", False) - enter_n = control_flow_ops.enter(n, "foo_1", False) + enter_false = gen_control_flow_ops._enter(false, "foo_1", False) + enter_n = gen_control_flow_ops._enter(n, "foo_1", False) merge_n = control_flow_ops.merge([enter_n, enter_n], name="merge_n")[0] switch_n = control_flow_ops.switch(merge_n, enter_false) @@ -273,9 +275,9 @@ class ControlFlowTest(test.TestCase): one = constant_op.constant(1) n = constant_op.constant(10) - enter_i = control_flow_ops.enter(zero, "foo", False) - enter_one = control_flow_ops.enter(one, "foo", True) - enter_n = control_flow_ops.enter(n, "foo", True) + enter_i = gen_control_flow_ops._enter(zero, "foo", False) + enter_one = gen_control_flow_ops._enter(one, "foo", True) + enter_n = gen_control_flow_ops._enter(n, "foo", True) with ops.device(test.gpu_device_name()): merge_i = control_flow_ops.merge([enter_i, enter_i])[0] @@ -299,9 +301,9 @@ class ControlFlowTest(test.TestCase): one = constant_op.constant(1) n = constant_op.constant(10) - enter_i = control_flow_ops.enter(zero, "foo", False) - enter_one = control_flow_ops.enter(one, "foo", True) - enter_n = control_flow_ops.enter(n, "foo", True) + enter_i = gen_control_flow_ops._enter(zero, "foo", False) + enter_one = gen_control_flow_ops._enter(one, "foo", True) + enter_n = gen_control_flow_ops._enter(n, "foo", True) merge_i = control_flow_ops.merge([enter_i, enter_i])[0] @@ -322,8 +324,8 @@ class ControlFlowTest(test.TestCase): def testDifferentFrame(self): with self.test_session(): data = array_ops.placeholder(dtypes.float32, shape=[]) - enter_1 = control_flow_ops.enter(data, "foo_1", False) - enter_2 = control_flow_ops.enter(data, "foo_2", False) + enter_1 = gen_control_flow_ops._enter(data, "foo_1", False) + enter_2 = gen_control_flow_ops._enter(data, "foo_2", False) res = math_ops.add(enter_1, enter_2) with self.assertRaisesOpError("has inputs from different frames"): res.eval(feed_dict={data: 1.0}) @@ -736,24 +738,21 @@ class ControlFlowTest(test.TestCase): with self.test_session(): s = constant_op.constant([1, 2, 3, 4, 5]) r = isum(s, maximum_iterations=3) - self.assertAllEqual([1+3, 2+3, 3+3, 4+3, 5+3], r.eval()) + self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval()) def testWhileWithMaximumIterationsAndSingleArgument(self): with self.test_session(): r = control_flow_ops.while_loop( - lambda i: i < 3, - lambda i: i + 1, - [0], - maximum_iterations=1) + lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1) self.assertEqual(1, r.eval()) def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self): v = constant_op.constant(1.0) + def training_loop_with_gradient(i): out = control_flow_ops.while_loop( lambda i_, _: i_ < 3, - lambda i_, j: [i_ + 1, j * v], - [0, 1.0], + lambda i_, j: [i_ + 1, j * v], [0, 1.0], maximum_iterations=i) g = gradients_impl.gradients(out, v) with ops.control_dependencies(g): @@ -763,8 +762,8 @@ class ControlFlowTest(test.TestCase): xla_context.Enter() # Create training loop, ensure we can call gradient() of # while_loop inside the training loop. 
- loop = control_flow_ops.while_loop( - lambda i: i < 3, training_loop_with_gradient, [0]) + loop = control_flow_ops.while_loop(lambda i: i < 3, + training_loop_with_gradient, [0]) xla_context.Exit() loop_execute = array_ops.identity(loop) # Because loop is not fetchable. @@ -774,17 +773,18 @@ class ControlFlowTest(test.TestCase): def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self): v = constant_op.constant(1.0) + def inner_body(i, x): out = control_flow_ops.while_loop( lambda i, _: i < 3, - lambda i, j: [i + 1, j * v], - [0, x], + lambda i, j: [i + 1, j * v], [0, x], maximum_iterations=i) return out def create_while_loop(maximum_iterations=None): return control_flow_ops.while_loop( - lambda i, _: i < 3, inner_body, [0, 1.0], + lambda i, _: i < 3, + inner_body, [0, 1.0], maximum_iterations=maximum_iterations) loop_no_xla = create_while_loop(maximum_iterations=5) @@ -819,14 +819,17 @@ class ControlFlowTest(test.TestCase): def create_while_loop(): max_iter_holder = [] + def create_mi(): max_iter_holder.append(array_ops.placeholder(dtypes.int32, shape=())) return 1.0 - _ = control_flow_ops.cond(constant_op.constant(True), - create_mi, create_mi) + + _ = control_flow_ops.cond( + constant_op.constant(True), create_mi, create_mi) return control_flow_ops.while_loop( - lambda i, _: i < 3, lambda i, x: (i + 1, v * x), (0, 1.0), + lambda i, _: i < 3, + lambda i, x: (i + 1, v * x), (0, 1.0), maximum_iterations=max_iter_holder[0]) xla_context = control_flow_ops.XLAControlFlowContext() @@ -849,28 +852,32 @@ class ControlFlowTest(test.TestCase): p = array_ops.placeholder(dtype=dtypes.int32) def mid_body_builder(iterations): + def mid_body(i, x): r = control_flow_ops.while_loop( lambda *_: True, - lambda i, x: (i + 1, v * x), - (0, x), - maximum_iterations=iterations, name="inner") + lambda i, x: (i + 1, v * x), (0, x), + maximum_iterations=iterations, + name="inner") return (i + 1, gradients_impl.gradients(x + r[1], v)[0]) + return mid_body def outer_body(i, x): iterations = array_ops.size(p, name="iterations") - return ( - i + 1, - x + control_flow_ops.while_loop( - lambda *_: True, mid_body_builder(iterations), (0, x), - maximum_iterations=iterations, name="mid")[1]) + return (i + 1, x + control_flow_ops.while_loop( + lambda *_: True, + mid_body_builder(iterations), (0, x), + maximum_iterations=iterations, + name="mid")[1]) def create_while_loop(): with ops.device("/cpu:0"): r = control_flow_ops.while_loop( - lambda *_: True, outer_body, (0, 1.0), - maximum_iterations=5, name="outer") + lambda *_: True, + outer_body, (0, 1.0), + maximum_iterations=5, + name="outer") return array_ops.identity(r[1]) xla_context = control_flow_ops.XLAControlFlowContext() @@ -881,18 +888,19 @@ class ControlFlowTest(test.TestCase): final_without_xla_context = create_while_loop() with self.test_session(use_gpu=False) as sess: - opts = config_pb2.RunOptions( - trace_level=config_pb2.RunOptions.FULL_TRACE) + opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE) run_metadata = config_pb2.RunMetadata() final_value_without_xla_context = sess.run( - final_without_xla_context, - feed_dict={p: [0, 0, 0]}) + final_without_xla_context, feed_dict={ + p: [0, 0, 0] + }) final_value_with_xla_context = sess.run( final_with_xla_context, feed_dict={p: [0, 0, 0]}, - options=opts, run_metadata=run_metadata) + options=opts, + run_metadata=run_metadata) node_stats = run_metadata.step_stats.dev_stats[0].node_stats stack_push_count = len( @@ -901,8 +909,8 @@ class ControlFlowTest(test.TestCase): # the last two 
"3"s comes from size(p), when p == [0, 0, 0]. self.assertEqual(stack_push_count, 5 * 3 * 3) - self.assertAllClose( - final_value_with_xla_context, final_value_without_xla_context) + self.assertAllClose(final_value_with_xla_context, + final_value_without_xla_context) # Have more than 10 parallel iterations and hence exercise k-bound # most of the time. @@ -951,8 +959,7 @@ class ControlFlowTest(test.TestCase): with self.test_session(): def compute(i, c, o): - c = array_ops.strided_slice(x, - array_ops.expand_dims(i, 0), + c = array_ops.strided_slice(x, array_ops.expand_dims(i, 0), [1] + array_ops.expand_dims(i, 0)) o = array_ops.concat([o, c], 0) i = math_ops.add(i, 1) @@ -963,11 +970,12 @@ class ControlFlowTest(test.TestCase): o = ops.convert_to_tensor([0]) x = ops.convert_to_tensor([1, 2, 3, 4, 5, 6]) s = array_ops.size(x) - r = control_flow_ops.while_loop( - lambda i, c, o: math_ops.less(i, s), compute, [i, c, o], [ - i.get_shape(), tensor_shape.unknown_shape(), - tensor_shape.unknown_shape() - ]) + r = control_flow_ops.while_loop(lambda i, c, o: math_ops.less(i, s), + compute, [i, c, o], [ + i.get_shape(), + tensor_shape.unknown_shape(), + tensor_shape.unknown_shape() + ]) result = r[2].eval() self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result) @@ -1033,7 +1041,8 @@ class ControlFlowTest(test.TestCase): return [new_i, new_j] r = control_flow_ops.while_loop( - c, _b, [i, m], [i.get_shape(), tensor_shape.unknown_shape()]) + c, _b, [i, m], + [i.get_shape(), tensor_shape.unknown_shape()]) r = r[1] * array_ops.ones([8, 8]) self.assertAllEqual(np.ones((8, 8)), r.eval()) @@ -1065,7 +1074,8 @@ class ControlFlowTest(test.TestCase): return [new_i, new_j] r = control_flow_ops.while_loop( - c, b, [i, m], [i.get_shape(), tensor_shape.TensorShape([None, 2])]) + c, b, [i, m], + [i.get_shape(), tensor_shape.TensorShape([None, 2])]) self.assertTrue(r[1].get_shape()[0].value is None) self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2)) @@ -1092,20 +1102,22 @@ class ControlFlowTest(test.TestCase): def b(i, x): return [ - i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0, - x.dense_shape) + i + 1, + sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape) ] _, r = control_flow_ops.while_loop(c, b, [i, x]) self.assertEqual(r.dense_shape.get_shape()[0].value, 1) _, r = control_flow_ops.while_loop( - c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None])]) + c, b, [i, x], + [i.get_shape(), tensor_shape.TensorShape([None])]) self.assertTrue(r.dense_shape.get_shape()[0].value is None) with self.assertRaisesRegexp(ValueError, "is not compatible with"): _, r = control_flow_ops.while_loop( - c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([5])]) + c, b, [i, x], + [i.get_shape(), tensor_shape.TensorShape([5])]) def testWhileShapeInferenceIndexedSlices(self): with self.test_session(): @@ -1120,7 +1132,8 @@ class ControlFlowTest(test.TestCase): def b(i, x): return [ - i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape) + i + 1, + ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape) ] _, r = control_flow_ops.while_loop(c, b, [i, x]) @@ -1128,14 +1141,16 @@ class ControlFlowTest(test.TestCase): self.assertEqual(r.values.get_shape(), tensor_shape.TensorShape([2, 2])) _, r = control_flow_ops.while_loop( - c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 2])]) + c, b, [i, x], + [i.get_shape(), tensor_shape.TensorShape([None, 2])]) self.assertEqual(r.dense_shape.get_shape()[0].value, 2) self.assertTrue(r.values.get_shape()[0].value 
is None) self.assertEqual(r.values.get_shape()[1].value, 2) with self.assertRaisesRegexp(ValueError, "is not compatible with"): _, r = control_flow_ops.while_loop( - c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 5])]) + c, b, [i, x], + [i.get_shape(), tensor_shape.TensorShape([None, 5])]) def _testNestedWhile_1(self, use_gpu): with self.test_session(use_gpu=use_gpu): @@ -1276,16 +1291,17 @@ class ControlFlowTest(test.TestCase): "v", [], initializer=init_ops.constant_initializer(2)) i0 = constant_op.constant(0) with ops.control_dependencies([i0]): + def loop_condition(i): return i < 4 def loop_body(i): some_cond = control_flow_ops.cond( constant_op.constant(True), - lambda: state_ops.assign(v, math_ops.square(v)), - lambda: v) + lambda: state_ops.assign(v, math_ops.square(v)), lambda: v) with ops.control_dependencies([some_cond]): return i + 1 + r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,)) variables.global_variables_initializer().run() self.assertEqual(4, r.eval()) @@ -1600,7 +1616,8 @@ class ControlFlowTest(test.TestCase): _, rx = control_flow_ops.while_loop( c1, - b1, [r, x], [r.get_shape(), tensor_shape.unknown_shape()], + b1, [r, x], + [r.get_shape(), tensor_shape.unknown_shape()], parallel_iterations=1) self.assertEqual(45, rx.eval()) @@ -1663,7 +1680,8 @@ class ControlFlowTest(test.TestCase): b = lambda i, v: [i + 1, math_ops.multiply(x, v)] r = control_flow_ops.while_loop( c, - b, [n, v], [n.get_shape(), tensor_shape.unknown_shape()], + b, [n, v], + [n.get_shape(), tensor_shape.unknown_shape()], parallel_iterations=1) r = gradients_impl.gradients(r[1], x)[0] @@ -1797,8 +1815,8 @@ class ControlFlowTest(test.TestCase): named = collections.namedtuple("named", ("a", "b")) loop_vars = [ named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)), - (constant_op.constant(2.0), - constant_op.constant(3.0)), constant_op.constant(4.0) + (constant_op.constant(2.0), constant_op.constant(3.0)), + constant_op.constant(4.0) ] c = lambda lv0, _1, _2: lv0.a < 100.0 @@ -1824,8 +1842,8 @@ class ControlFlowTest(test.TestCase): named = collections.namedtuple("named", ("a", "b")) loop_vars = [ named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)), - (constant_op.constant(2.0), - constant_op.constant(3.0)), constant_op.constant(4.0) + (constant_op.constant(2.0), constant_op.constant(3.0)), + constant_op.constant(4.0) ] c = lambda lv0, _1, _2: lv0.a < 100.0 @@ -2176,7 +2194,8 @@ class ControlFlowTest(test.TestCase): def b(i, x): return [ - i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape) + i + 1, + ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape) ] _, r = control_flow_ops.while_loop(c, b, [i, x]) @@ -2197,8 +2216,8 @@ class ControlFlowTest(test.TestCase): def b(i, x): return [ - i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0, - x.dense_shape) + i + 1, + sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape) ] _, r = control_flow_ops.while_loop(c, b, [i, x]) @@ -2220,8 +2239,8 @@ class ControlFlowTest(test.TestCase): x1 = x + gradients_impl.gradients(data, params)[0] return i + 1, x1 - output_grad = control_flow_ops.while_loop(c, b, - [i0, constant_op.constant(0.0)]) + output_grad = control_flow_ops.while_loop( + c, b, [i0, constant_op.constant(0.0)]) self.assertAllClose(600.0, sess.run(output_grad)[1]) def testWhileAndTensorArray(self): @@ -2359,9 +2378,12 @@ class ControlFlowTest(test.TestCase): def testStopGradMultiFlows(self): with self.test_session(): + def body(i, y, r): x = 
variable_scope.get_variable( - "x", shape=(), dtype=dtypes.float32, + "x", + shape=(), + dtype=dtypes.float32, initializer=init_ops.ones_initializer()) y *= x return [i + 1, y, r + math_ops.reduce_sum(y)] @@ -2773,7 +2795,8 @@ class ControlFlowTest(test.TestCase): r = control_flow_ops.while_loop( lambda i, v: i < 2, lambda i, v: [i + 1, func(v)], [constant_op.constant(0), x], - [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()]) + [tensor_shape.unknown_shape(), + tensor_shape.unknown_shape()]) self.assertEqual(r[1].eval(), 65536.0) r = gradients_impl.gradients(r, x)[0] @@ -2800,12 +2823,14 @@ class ControlFlowContextCheckTest(test.TestCase): def _getCondTensor(self): cond_tensor = [] + def true_fn(): if not cond_tensor: cond_tensor.append(constant_op.constant(1)) return cond_tensor[0] - control_flow_ops.cond(math_ops.less(1, 2), true_fn, - lambda: constant_op.constant(0)) + + control_flow_ops.cond( + math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0)) return cond_tensor[0] def testInvalidContext(self): @@ -2821,14 +2846,13 @@ class ControlFlowContextCheckTest(test.TestCase): # Accessing a while loop tensor in cond is illegal. while_tensor = self._getWhileTensor() with self.assertRaisesRegexp( - ValueError, - "Cannot use 'while/Const_1' as input to 'cond/Add' because " + ValueError, "Cannot use 'while/Const_1' as input to 'cond/Add' because " "'while/Const_1' is in a while loop. See info log for more details."): # TODO(skyewm): this passes if we return while_tensor directly instead # of using it as input to another op. - control_flow_ops.cond(math_ops.less(1, 2), - lambda: math_ops.add(1, while_tensor), - lambda: constant_op.constant(0)) + control_flow_ops.cond( + math_ops.less(1, 2), lambda: math_ops.add(1, while_tensor), + lambda: constant_op.constant(0)) def testInvalidContextInWhile(self): # Accessing a while loop tensor in a different while loop is illegal. @@ -2856,6 +2880,7 @@ class ControlFlowContextCheckTest(test.TestCase): # Accessing a tensor from a cond context from the other branch's cond # context is OK (although dangerous). cond_tensor = [] + def branch_fn(): if not cond_tensor: cond_tensor.append(constant_op.constant(1)) @@ -2892,12 +2917,13 @@ class ControlFlowContextCheckTest(test.TestCase): while_tensor = self._getWhileTensor() return control_flow_ops.while_loop(lambda i: i < 3, lambda i: i + while_tensor, [0]) + with self.assertRaisesRegexp( ValueError, "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because" " they are in different while loops. See info log for more details."): - control_flow_ops.cond(math_ops.less(1, 2), true_fn, - lambda: constant_op.constant(0)) + control_flow_ops.cond( + math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0)) @test_util.with_c_api @@ -3005,11 +3031,13 @@ class AssertTest(test.TestCase): sess.run(unguarded_assert, options=opts, run_metadata=unguarded_metadata) guarded_nodestat_names = [ n.node_name - for d in guarded_metadata.step_stats.dev_stats for n in d.node_stats + for d in guarded_metadata.step_stats.dev_stats + for n in d.node_stats ] unguarded_nodestat_names = [ n.node_name - for d in unguarded_metadata.step_stats.dev_stats for n in d.node_stats + for d in unguarded_metadata.step_stats.dev_stats + for n in d.node_stats ] guarded_memcpy_nodestat_names = [ n for n in guarded_nodestat_names if "MEMCPYDtoH" in n @@ -3066,6 +3094,7 @@ class WhileOpBenchmark(test.Benchmark): Returns: The duration of the run in seconds. 
""" + def loop_body(i, x): with ops.device("/gpu:0"): # Always put loop body on GPU. @@ -3107,7 +3136,7 @@ class WhileOpBenchmark(test.Benchmark): start_time = time.time() for _ in xrange(num_iters): sess.run(r) - return (time.time() - start_time)/num_iters + return (time.time() - start_time) / num_iters def benchmarkWhileOpCrossDevicePlacement(self): iters = 10 @@ -3154,23 +3183,20 @@ class EagerTest(test.TestCase): def testWhileLoop(self): with context.eager_mode(): tensor = constant_op.constant([1, 2, 3, 4, 5]) - self.assertAllEqual(isum(tensor).numpy(), - [46, 47, 48, 49, 50]) + self.assertAllEqual(isum(tensor).numpy(), [46, 47, 48, 49, 50]) def testWhileLoopWithMaxIterations(self): with context.eager_mode(): tensor = constant_op.constant([1, 2, 3, 4, 5]) - self.assertAllEqual(isum(tensor, maximum_iterations=3).numpy(), - [1+3, 2+3, 3+3, 4+3, 5+3]) + self.assertAllEqual( + isum(tensor, maximum_iterations=3).numpy(), + [1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3]) def testWhileWithMaximumIterationsAndSingleArgument(self): with context.eager_mode(): tensor = constant_op.constant(0) r = control_flow_ops.while_loop( - lambda i: i < 3, - lambda i: i + 1, - [tensor], - maximum_iterations=1) + lambda i: i < 3, lambda i: i + 1, [tensor], maximum_iterations=1) self.assertEqual(1, r.numpy()) def testWithDependencies(self): @@ -3197,8 +3223,8 @@ class EagerTest(test.TestCase): f2 = lambda: constant_op.constant(23) f3 = lambda: constant_op.constant(-1) - r1 = control_flow_ops.case([(x < y, f1), (x > z, f2)], - default=f3, exclusive=True) + r1 = control_flow_ops.case( + [(x < y, f1), (x > z, f2)], default=f3, exclusive=True) self.assertAllEqual(r1.numpy(), 17) diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py index 39e96f74b04..23185eaeece 100644 --- a/tensorflow/python/kernel_tests/control_flow_util_test.py +++ b/tensorflow/python/kernel_tests/control_flow_util_test.py @@ -41,17 +41,17 @@ class ControlFlowUtilTest(test.TestCase): self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op)) def testIsLoopEnter(self): - enter = gen_control_flow_ops.enter(1, frame_name="name").op + enter = gen_control_flow_ops._enter(1, frame_name="name").op self.assertTrue(control_flow_util.IsLoopEnter(enter)) self.assertFalse(control_flow_util.IsLoopConstantEnter(enter)) - ref_enter = gen_control_flow_ops.ref_enter(test_ops.ref_output(), - frame_name="name").op + ref_enter = gen_control_flow_ops._ref_enter(test_ops.ref_output(), + frame_name="name").op self.assertTrue(control_flow_util.IsLoopEnter(ref_enter)) self.assertFalse(control_flow_util.IsLoopConstantEnter(ref_enter)) - const_enter = gen_control_flow_ops.enter(1, frame_name="name", - is_constant=True).op + const_enter = gen_control_flow_ops._enter(1, frame_name="name", + is_constant=True).op self.assertTrue(control_flow_util.IsLoopEnter(const_enter)) self.assertTrue(control_flow_util.IsLoopConstantEnter(const_enter)) diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 3e9bd3dade6..c5446326ba1 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -24,6 +24,7 @@ import time import numpy as np +from six.moves import xrange from tensorflow.contrib import layers from tensorflow.python.client import session as session_lib from tensorflow.python.framework import constant_op diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py 
b/tensorflow/python/kernel_tests/cwise_ops_test.py index a91917b27fa..0d9b46c30db 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_test.py @@ -71,6 +71,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64): return sparse_tensor.SparseTensor( indices=x_indices, values=x_values, dense_shape=x_shape), x_values + def _default_tolerance(dtype): """Returns a sensible default tolerance for comparing results of a given type""" @@ -81,7 +82,7 @@ def _default_tolerance(dtype): elif dtype in (np.float64, np.complex128): return 1e-5 else: - return None # Fail fast for unexpected types + return None # Fail fast for unexpected types class UnaryOpTest(test.TestCase): @@ -233,10 +234,10 @@ class UnaryOpTest(test.TestCase): self._compareBoth(k, np.arccos, math_ops.acos) self._compareBoth(x, np.arctan, math_ops.atan) self._compareBoth(x, np.tan, math_ops.tan) - self._compareBoth( - y, - np.vectorize(self._replace_domain_error_with_inf(math.lgamma)), - math_ops.lgamma) + self._compareBoth(y, + np.vectorize( + self._replace_domain_error_with_inf(math.lgamma)), + math_ops.lgamma) self._compareBoth(x, np.vectorize(math.erf), math_ops.erf) self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc) @@ -298,8 +299,8 @@ class UnaryOpTest(test.TestCase): w = x - x.min() + 1.02 # all greater than 1 y = (x + .5).astype(np.float64) # no zero z = (x + 15.5).astype(np.float64) # all positive - k = np.arange(-0.90, 0.90, 0.35).reshape(1, 3, 2).astype( - np.float64) # between -1 and 1 + k = np.arange(-0.90, 0.90, + 0.35).reshape(1, 3, 2).astype(np.float64) # between -1 and 1 self._compareBoth(x, np.abs, math_ops.abs) self._compareBoth(x, np.abs, _ABS) self._compareBoth(x, np.negative, math_ops.negative) @@ -322,10 +323,10 @@ class UnaryOpTest(test.TestCase): self._compareBoth(y, np.sign, math_ops.sign) self._compareBoth(x, np.sin, math_ops.sin) self._compareBoth(x, np.cos, math_ops.cos) - self._compareBoth( - y, - np.vectorize(self._replace_domain_error_with_inf(math.lgamma)), - math_ops.lgamma) + self._compareBoth(y, + np.vectorize( + self._replace_domain_error_with_inf(math.lgamma)), + math_ops.lgamma) self._compareBoth(x, np.vectorize(math.erf), math_ops.erf) self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc) self._compareBoth(x, np.arctan, math_ops.atan) @@ -362,10 +363,10 @@ class UnaryOpTest(test.TestCase): self._compareBoth(y, np.sign, math_ops.sign) self._compareBoth(x, np.sin, math_ops.sin) self._compareBoth(x, np.cos, math_ops.cos) - self._compareBoth( - y, - np.vectorize(self._replace_domain_error_with_inf(math.lgamma)), - math_ops.lgamma) + self._compareBoth(y, + np.vectorize( + self._replace_domain_error_with_inf(math.lgamma)), + math_ops.lgamma) self._compareBoth(x, np.vectorize(math.erf), math_ops.erf) self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc) @@ -406,8 +407,8 @@ class UnaryOpTest(test.TestCase): self._compareBothSparse(x, np.sign, math_ops.sign) def testComplex64Basic(self): - x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, - 2).astype(np.complex64) + x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype( + np.complex64) y = x + np.complex(0.5, 0.5) # no zeros self._compareBoth(x, np.abs, math_ops.abs) self._compareBoth(x, np.abs, _ABS) @@ -450,8 +451,8 @@ class UnaryOpTest(test.TestCase): self._compareBothSparse(y, complex_sign, math_ops.sign) def testComplex128Basic(self): - x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, - 2).astype(np.complex128) + x = np.complex(1, 1) * np.arange(-3, 
3).reshape(1, 3, 2).astype( + np.complex128) y = x + np.complex(0.5, 0.5) # no zeros self._compareBoth(x, np.abs, math_ops.abs) self._compareBoth(x, np.abs, _ABS) @@ -805,10 +806,10 @@ class BinaryOpTest(test.TestCase): self._compareBoth(x, y, np.mod, _MOD) def testComplex64Basic(self): - x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape( - 1, 3, 2).astype(np.complex64) - y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape( - 1, 3, 2).astype(np.complex64) + x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype( + np.complex64) + y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype( + np.complex64) self._compareBoth(x, y, np.add, math_ops.add) self._compareBoth(x, y, np.subtract, math_ops.subtract) self._compareBoth(x, y, np.multiply, math_ops.multiply) @@ -819,10 +820,10 @@ class BinaryOpTest(test.TestCase): self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV) def testComplex128Basic(self): - x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape( - 1, 3, 2).astype(np.complex128) - y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape( - 1, 3, 2).astype(np.complex128) + x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype( + np.complex128) + y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype( + np.complex128) self._compareBoth(x, y, np.add, math_ops.add) self._compareBoth(x, y, np.subtract, math_ops.subtract) self._compareBoth(x, y, np.multiply, math_ops.multiply) @@ -1127,8 +1128,8 @@ class BinaryOpTest(test.TestCase): def testMismatchedDimensions(self): for func in [ - math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, - _ADD, _SUB, _MUL, _TRUEDIV, _FLOORDIV + math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, _ADD, + _SUB, _MUL, _TRUEDIV, _FLOORDIV ]: with self.assertRaisesWithPredicateMatch( ValueError, lambda e: "Dimensions must" in str(e)): @@ -1161,8 +1162,8 @@ class BinaryOpTest(test.TestCase): (1.2345, float("inf")), (1.2345, -float("inf")), (-4.321, float("inf")), (-4.125, -float("inf")), (float("inf"), float("inf")), (float("inf"), -float("inf")), - (-float("inf"), float("inf")), (-float("inf"), - -float("inf"))) + (-float("inf"), float("inf")), + (-float("inf"), -float("inf"))) for dtype in np.float32, np.float64: x1 = np.array(x1l).astype(dtype) x2 = np.array(x2l).astype(dtype) @@ -1213,22 +1214,22 @@ class ComparisonOpTest(test.TestCase): for x in data: for y in data: self.assertEqual(self._compareScalar(math_ops.less, x, y, t), x < y) - self.assertEqual(self._compareScalar(math_ops.less_equal, x, y, t), - x <= y) - self.assertEqual(self._compareScalar(math_ops.greater, x, y, t), - x > y) + self.assertEqual( + self._compareScalar(math_ops.less_equal, x, y, t), x <= y) + self.assertEqual( + self._compareScalar(math_ops.greater, x, y, t), x > y) self.assertEqual( self._compareScalar(math_ops.greater_equal, x, y, t), x >= y) self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y) - self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t), - x != y) + self.assertEqual( + self._compareScalar(math_ops.not_equal, x, y, t), x != y) data = [-1, 0, 1, -1j, 1j, 1 + 1j, 1 - 1j] for t in [np.complex64, np.complex128]: for x in data: for y in data: self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y) - self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t), - x != y) + self.assertEqual( + self._compareScalar(math_ops.not_equal, x, y, t), x != y) def _compare(self, x, y, np_func, tf_func): np_ans = np_func(x, y) @@ -1311,8 
+1312,8 @@ class ComparisonOpTest(test.TestCase): self._testBCastByFunc(np.equal, math_ops.equal, include_complex=True) def testBCastNotEqual(self): - self._testBCastByFunc(np.not_equal, math_ops.not_equal, - include_complex=True) + self._testBCastByFunc( + np.not_equal, math_ops.not_equal, include_complex=True) def testShapeMismatch(self): dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64] @@ -1771,9 +1772,8 @@ class MathOpsOverloadTest(test.TestCase): def _compareUnary(self, x, dtype, np_func, tf_func): np_ans = np_func(x).astype(dtype.as_numpy_dtype) with self.test_session(use_gpu=False): - self.assertAllClose( - np_ans, tf_func(ops.convert_to_tensor( - x, dtype=dtype)).eval()) + self.assertAllClose(np_ans, + tf_func(ops.convert_to_tensor(x, dtype=dtype)).eval()) def testOverload(self): dtypes = [ @@ -1795,8 +1795,8 @@ class MathOpsOverloadTest(test.TestCase): ] for dtype in dtypes: for np_func, tf_func in funcs: - if dtype in (dtypes_lib.complex64, dtypes_lib.complex128 - ) and tf_func == _FLOORDIV: + if dtype in (dtypes_lib.complex64, + dtypes_lib.complex128) and tf_func == _FLOORDIV: continue # floordiv makes no sense for complex self._compareBinary(10, 5, dtype, np_func, tf_func) # Mod only works for int32 and int64. @@ -2008,7 +2008,8 @@ class ComplexMakeRealImagTest(test.TestCase): # self._compareAngle(cplx, use_gpu=True) def testRealReal(self): - for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32, dtypes_lib.float64: + for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32, + dtypes_lib.float64): x = array_ops.placeholder(dtype) y = math_ops.real(x) self.assertEqual(x, y) @@ -2037,15 +2038,16 @@ class ComplexMakeRealImagTest(test.TestCase): self._compareConj(cplx, use_gpu=True) def testConjReal(self): - for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64: + for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16, + dtypes_lib.float32, dtypes_lib.float64): x = array_ops.placeholder(dtype) y = math_ops.conj(x) self.assertEqual(x, y) def testConjString(self): x = array_ops.placeholder(dtypes_lib.string) - with self.assertRaisesRegexp( - TypeError, r"Expected numeric or variant tensor"): + with self.assertRaisesRegexp(TypeError, + r"Expected numeric or variant tensor"): math_ops.conj(x) def _compareGradient(self, x): @@ -2060,8 +2062,9 @@ class ComplexMakeRealImagTest(test.TestCase): real, imag = array_ops.reshape(real, [-1]), array_ops.reshape(imag, [-1]) cplx = math_ops.complex(real, imag) cplx = math_ops.conj(cplx) - loss = math_ops.reduce_sum(math_ops.square(math_ops.real( - cplx))) + math_ops.reduce_sum(math_ops.square(math_ops.imag(cplx))) + loss = math_ops.reduce_sum(math_ops.square( + math_ops.real(cplx))) + math_ops.reduce_sum( + math_ops.square(math_ops.imag(cplx))) epsilon = 1e-3 jacob_t, jacob_n = gradient_checker.compute_gradient( inx, list(x.shape), loss, [1], x_init_value=x, delta=epsilon) @@ -2125,8 +2128,8 @@ class AccumulateTest(test.TestCase): np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20) ] random_tensors = [ - ops.convert_to_tensor( - x, dtype=dtypes_lib.float32) for x in random_arrays + ops.convert_to_tensor(x, dtype=dtypes_lib.float32) + for x in random_arrays ] tf_val = math_ops.accumulate_n(random_tensors) np_val = random_arrays[0] diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py index ead55cd03b6..89fd26c544b 100644 --- 
a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py +++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import os import time +from six.moves import xrange from tensorflow.python.client import session from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py index cf723f5eec3..a4b30e43195 100644 --- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py +++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py @@ -48,8 +48,10 @@ class DynamicStitchTestBase(object): def testShapeInferenceForScalarWithNonConstantIndices(self): with self.test_session(use_gpu=True): - indices = [array_ops.placeholder(dtype=dtypes.int32), - constant_op.constant(1)] + indices = [ + array_ops.placeholder(dtype=dtypes.int32), + constant_op.constant(1) + ] data = [constant_op.constant(40), constant_op.constant(60)] for step in -1, 1: stitched_t = self.stitch_op(indices[::step], data) @@ -61,7 +63,8 @@ class DynamicStitchTestBase(object): def testSimpleOneDimensional(self): with self.test_session(use_gpu=True): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5]) + constant_op.constant([0, 4, 7]), + constant_op.constant([1, 6, 2, 3, 5]) ] data = [ constant_op.constant([0, 40, 70]), @@ -86,7 +89,8 @@ class DynamicStitchTestBase(object): def testSimpleTwoDimensional(self): with self.test_session(use_gpu=True): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([1, 6]), + constant_op.constant([0, 4, 7]), + constant_op.constant([1, 6]), constant_op.constant([2, 3, 5]) ] data = [ @@ -104,7 +108,8 @@ class DynamicStitchTestBase(object): def testHigherRank(self): with self.test_session(use_gpu=True) as sess: indices = [ - constant_op.constant(6), constant_op.constant([4, 1]), + constant_op.constant(6), + constant_op.constant([4, 1]), constant_op.constant([[5, 2], [0, 3]]) ] data = [ @@ -127,7 +132,8 @@ class DynamicStitchTestBase(object): def testErrorIndicesMultiDimensional(self): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([[1, 6, 2, 3, 5]]) + constant_op.constant([0, 4, 7]), + constant_op.constant([[1, 6, 2, 3, 5]]) ] data = [ constant_op.constant([[0, 40, 70]]), @@ -138,7 +144,8 @@ class DynamicStitchTestBase(object): def testErrorDataNumDimsMismatch(self): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5]) + constant_op.constant([0, 4, 7]), + constant_op.constant([1, 6, 2, 3, 5]) ] data = [ constant_op.constant([0, 40, 70]), @@ -149,7 +156,8 @@ class DynamicStitchTestBase(object): def testErrorDataDimSizeMismatch(self): indices = [ - constant_op.constant([0, 4, 5]), constant_op.constant([1, 6, 2, 3]) + constant_op.constant([0, 4, 5]), + constant_op.constant([1, 6, 2, 3]) ] data = [ constant_op.constant([[0], [40], [70]]), @@ -160,7 +168,8 @@ class DynamicStitchTestBase(object): def testErrorDataAndIndicesSizeMismatch(self): indices = [ - constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5]) + constant_op.constant([0, 4, 7]), + constant_op.constant([1, 6, 2, 3, 5]) ] data = [ constant_op.constant([0, 40, 70]), @@ -235,13 +244,15 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase): def testHigherRankGPU(self): with self.test_session() as sess: indices = [ - constant_op.constant(6), constant_op.constant([4, 1]), + constant_op.constant(6), + 
constant_op.constant([4, 1]), constant_op.constant([[5, 2], [0, 3]]) ] data = [ constant_op.constant([61, 62], dtype=dtypes.float32), constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32), - constant_op.constant([[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32) + constant_op.constant( + [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32) ] stitched_t = data_flow_ops.dynamic_stitch(indices, data) stitched_val = stitched_t.eval() diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py index 5c7624f1f6b..6ea9f1badc3 100644 --- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py +++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py @@ -84,7 +84,7 @@ class ExtractImagePatches(test.TestCase): patches=patches) def testKsize2x2Stride1x1Rate1x1Valid(self): - """Test for 1x1 kernel .""" + """Test for 2x2 kernel with VALID padding.""" # [1, 2, 2, 1] image = [[[[1], [2]], [[3], [4]]]] # [1, 1, 1, 4] @@ -98,7 +98,7 @@ class ExtractImagePatches(test.TestCase): patches=patches) def testKsize2x2Stride1x1Rate1x1Same(self): - """Test for 1x1 kernel .""" + """Test for 2x2 kernel with SAME padding.""" # [1, 2, 2, 1] image = [[[[1], [2]], [[3], [4]]]] # [1, 2, 2, 4] @@ -111,6 +111,20 @@ class ExtractImagePatches(test.TestCase): padding="SAME", patches=patches) + def testKsize2x2Stride1x1Rate2x2Valid(self): + """Test for 2x2 kernel with 2x2 dilation.""" + # [1, 2, 2, 1] + image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32) + # [1, 2, 2, 4] + patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]], + [[4, 6, 12, 14], [5, 7, 13, 15]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[2, 2], + padding="VALID", + patches=patches) if __name__ == "__main__": test.main() diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py index f91875c6f0c..61944f7e319 100644 --- a/tensorflow/python/kernel_tests/io_ops_test.py +++ b/tensorflow/python/kernel_tests/io_ops_test.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py index 81af3a0887d..f1fbe1a745b 100644 --- a/tensorflow/python/kernel_tests/losses_test.py +++ b/tensorflow/python/kernel_tests/losses_test.py @@ -953,14 +953,14 @@ class MeanPairwiseSquaredErrorTest(test.TestCase): # Compute the expected loss 'manually'. 
total = np.zeros((batch_size,)) for b in range(batch_size): - for i in range(dims): - for j in range(dims): + for i in range(dims-1): + for j in range(i+1, dims): x = self._predictions[b, i].item() - self._predictions[b, j].item() y = self._labels[b, i].item() - self._labels[b, j].item() diff = (x - y) total[b] += (diff * diff) - self._expected_losses = np.divide(total, 9.0) + self._expected_losses = np.divide(total, 3.0) def testValueErrorThrownWhenWeightIsNone(self): with self.test_session(): @@ -1060,7 +1060,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase): [[8, 1, 3], [7, 8, 9], [10, 11, 12]], ]) self._test_valid_weights( - labels, predictions, expected_loss=122.22222) + labels, predictions, expected_loss=137.5) def test3dWeightedScalar(self): labels = np.array([ @@ -1073,7 +1073,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase): ]) weight = 3.0 self._test_valid_weights( - labels, predictions, expected_loss=weight * 122.22222, + labels, predictions, expected_loss=weight * 137.5, weights=weight) def _test_invalid_weights( @@ -1124,7 +1124,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase): ]) self._test_valid_weights( # TODO(ptucker): This doesn't look right. - labels, predictions, expected_loss=9 * 122.22222, + labels, predictions, expected_loss=9 * 137.5, weights=np.ones((2, 3, 3))) def testLossWithAllZeroBatchSpecificWeights(self): @@ -1345,6 +1345,34 @@ class ComputeWeightedLossTest(test.TestCase): self.assertAllClose( np.mean(self._raw_losses), unweighted_loss.eval()) + def testUnweightedFromPlaceholder(self): + for reduction in losses.Reduction.all(): + with ops.Graph().as_default() as g: + self.assertEqual(0, len(util.get_losses())) + raw_losses = array_ops.placeholder(dtype=dtypes.float32) + feed_dict = {raw_losses: self._raw_losses} + unweighted_losses = ( + losses.compute_weighted_loss(raw_losses, reduction=reduction), + losses.compute_weighted_loss( + raw_losses, weights=np.ones((1, 1, 1)), reduction=reduction), + losses.compute_weighted_loss( + raw_losses, weights=np.ones((1, 1, 4)), reduction=reduction), + ) + self.assertEqual(3, len(util.get_losses())) + with self.test_session(g): + for unweighted_loss in unweighted_losses: + if reduction == losses.Reduction.NONE: + self.assertAllClose( + self._raw_losses, unweighted_loss.eval(feed_dict)) + elif reduction == losses.Reduction.SUM: + self.assertAllClose( + np.sum(self._raw_losses), unweighted_loss.eval(feed_dict)) + else: + # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS, + # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE. + self.assertAllClose( + np.mean(self._raw_losses), unweighted_loss.eval(feed_dict)) + def testScalarWeight(self): with ops.Graph().as_default(): self.assertEqual(0, len(util.get_losses())) diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py new file mode 100644 index 00000000000..3044b21aa42 --- /dev/null +++ b/tensorflow/python/kernel_tests/manip_ops_test.py @@ -0,0 +1,137 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for manip_ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import test_util +from tensorflow.python.ops import manip_ops +from tensorflow.python.ops import gradient_checker +from tensorflow.python.platform import test as test_lib + +import numpy as np + +# pylint: disable=g-import-not-at-top +try: + from distutils.version import StrictVersion as Version + # numpy.roll for multiple shifts was introduced in numpy version 1.12.0 + NP_ROLL_CAN_MULTISHIFT = Version(np.version.version) >= Version('1.12.0') +except ImportError: + NP_ROLL_CAN_MULTISHIFT = False +# pylint: enable=g-import-not-at-top + +class RollTest(test_util.TensorFlowTestCase): + def _testRoll(self, np_input, shift, axis): + expected_roll = np.roll(np_input, shift, axis) + with self.test_session(): + roll = manip_ops.roll(np_input, shift, axis) + self.assertAllEqual(roll.eval(), expected_roll) + + def _testGradient(self, np_input, shift, axis): + with self.test_session(): + inx = constant_op.constant(np_input.tolist()) + xs = list(np_input.shape) + y = manip_ops.roll(inx, shift, axis) + # Expected y's shape to be the same + ys = xs + jacob_t, jacob_n = gradient_checker.compute_gradient( + inx, xs, y, ys, x_init_value=np_input) + self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5) + + def _testAll(self, np_input, shift, axis): + self._testRoll(np_input, shift, axis) + if np_input.dtype == np.float32: + self._testGradient(np_input, shift, axis) + + def testIntTypes(self): + for t in [np.int32, np.int64]: + self._testAll(np.random.randint(-100, 100, (5)).astype(t), 3, 0) + if NP_ROLL_CAN_MULTISHIFT: + self._testAll(np.random.randint(-100, 100, (4, 4, 3)).astype(t), + [1, -2, 3], [0, 1, 2]) + self._testAll(np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t), + [0, 1, -2], [1, 2, 3]) + + def testFloatTypes(self): + for t in [np.float32, np.float64]: + self._testAll(np.random.rand(5).astype(t), 2, 0) + if NP_ROLL_CAN_MULTISHIFT: + self._testAll(np.random.rand(3, 4).astype(t), [1, 2], [1, 0]) + self._testAll(np.random.rand(1, 3, 4).astype(t), [1, 0, -3], [0, 1, 2]) + + def testComplexTypes(self): + for t in [np.complex64, np.complex128]: + x = np.random.rand(4, 4).astype(t) + self._testAll(x + 1j * x, 2, 0) + if NP_ROLL_CAN_MULTISHIFT: + x = np.random.rand(2, 5).astype(t) + self._testAll(x + 1j * x, [1, 2], [1, 0]) + x = np.random.rand(3, 2, 1, 1).astype(t) + self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2]) + + + def testRollInputMustVectorHigherRaises(self): + tensor = 7 + shift = 1 + axis = 0 + with self.test_session(): + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "input must be 1-D or higher"): + manip_ops.roll(tensor, shift, axis).eval() + + def testRollAxisMustBeScalarOrVectorRaises(self): + tensor = [[1, 2], + [3, 4]] + shift = 1 + axis = [[0, 1]] + with self.test_session(): + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "axis must be a scalar or a 1-D vector"): + manip_ops.roll(tensor, shift, axis).eval() + + def testRollShiftMustBeScalarOrVectorRaises(self): + tensor = [[1, 2], + [3, 4]] + shift = [[0, 1]] + axis = 1 + with self.test_session(): + with 
self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "shift must be a scalar or a 1-D vector"): + manip_ops.roll(tensor, shift, axis).eval() + + def testRollShiftAndAxisMustBeSameSizeRaises(self): + tensor = [[1, 2], + [3, 4]] + shift = [1] + axis = [0, 1] + with self.test_session(): + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "shift and axis must have the same size"): + manip_ops.roll(tensor, shift, axis).eval() + + def testRollAxisOutOfRangeRaises(self): + tensor = [1, 2] + shift = 1 + axis = 1 + with self.test_session(): + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "is out of range"): + manip_ops.roll(tensor, shift, axis).eval() + +if __name__ == "__main__": + test_lib.main() diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py index 317b8dc05be..68d626de2c5 100644 --- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py @@ -21,6 +21,7 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes as dtypes_lib from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -54,9 +55,13 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_): band_np = np.tril(band_np, upper) if batch_shape_ is not (): band_np = np.tile(band_np, batch_shape_ + (1, 1)) - with self.test_session(use_gpu=False): - band = array_ops.matrix_band_part(batch_mat, lower, upper) - self.assertAllEqual(band_np, band.eval()) + for index_dtype in [dtypes_lib.int32, dtypes_lib.int64]: + with self.test_session(use_gpu=False): + band = array_ops.matrix_band_part( + batch_mat, + constant_op.constant(lower, index_dtype), + constant_op.constant(upper, index_dtype)) + self.assertAllEqual(band_np, band.eval()) return Test diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py index 56a07cb012f..f5c6255c346 100644 --- a/tensorflow/python/kernel_tests/partitioned_variables_test.py +++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py @@ -50,8 +50,7 @@ class PartitionerCreatorsTest(test.TestCase): with self.test_session(): partitioner = partitioned_variables.fixed_size_partitioner(4, axis=0) with variable_scope.variable_scope("root", partitioner=partitioner): - v0 = variable_scope.get_variable( - "v0", dtype=dtypes.int64, shape=[20]) + v0 = variable_scope.get_variable("v0", dtype=dtypes.int64, shape=[20]) v0_list = v0._get_variable_list() self.assertEqual(len(v0_list), 4) @@ -169,8 +168,10 @@ class PartitionerCreatorsTest(test.TestCase): max_shards=2) # Use the partitioner with strings - partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner( - axis=3, max_shard_bytes=32768, bytes_per_string_element=8) + partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner( # pylint: disable=line-too-long + axis=3, + max_shard_bytes=32768, + bytes_per_string_element=8) with variable_scope.variable_scope( "root", partitioner=partitioner_axis3_str): @@ -423,8 +424,7 @@ class PartitionedVariablesTestCase(test.TestCase): def testRandomInitUnevenPartitions(self): with self.test_session(): rnd = variables.Variable( - random_ops.random_uniform( - [20, 43], dtype=dtypes.float64)) + random_ops.random_uniform([20, 43], 
dtype=dtypes.float64)) var_lists = [ partitioned_variables.create_partitioned_variables( rnd.get_shape(), [1, i], rnd.initialized_value()) diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index 3263ed1a601..4466beeec96 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -1811,16 +1811,17 @@ class PoolingTest(test.TestCase): if test.is_gpu_available(): pool_funcs.append(nn_ops.max_pool_with_argmax) for pool_func in pool_funcs: - # Illegal strides. - with self.assertRaisesRegexp( - errors_impl.UnimplementedError, - "Pooling is not yet supported on the batch"): - sess.run( - pool_func( - array_ops.placeholder(dtypes.float32), - ksize=[1, 1, 1, 1], - strides=[2, 1, 1, 1], - padding="SAME")) + if pool_func != nn_ops.max_pool: + # Illegal strides. + with self.assertRaisesRegexp( + errors_impl.UnimplementedError, + "Pooling is not yet supported on the batch"): + sess.run( + pool_func( + array_ops.placeholder(dtypes.float32), + ksize=[1, 1, 1, 1], + strides=[2, 1, 1, 1], + padding="SAME")) # Filter too large. with self.assertRaisesRegexp(ValueError, "Negative dimension size"): diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py index 92fb68820e0..c7181497d89 100644 --- a/tensorflow/python/kernel_tests/py_func_test.py +++ b/tensorflow/python/kernel_tests/py_func_test.py @@ -396,66 +396,66 @@ class PyFuncTest(test.TestCase): @test_util.run_in_graph_and_eager_modes() def testEagerSingleOutputFloat32(self): - a = array_ops.ones((3, 3), dtype=dtypes.float32) - x = array_ops.ones((3, 1), dtype=dtypes.float32) - output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32) - with self.test_session(): + with test_util.device(use_gpu=True): + a = array_ops.ones((3, 3), dtype=dtypes.float32) + x = array_ops.ones((3, 1), dtype=dtypes.float32) + output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32) ret = self.evaluate(output) self.assertAllClose(ret, [[3.0], [3.0], [3.0]]) @test_util.run_in_graph_and_eager_modes() def testEagerArrayOutput(self): - a = array_ops.ones((3, 3), dtype=dtypes.int32) - x = array_ops.ones((3, 1), dtype=dtypes.int32) - output = script_ops.eager_py_func( - lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.int32]) - - with self.test_session(): + with test_util.device(use_gpu=True): + a = array_ops.ones((3, 3), dtype=dtypes.float32) + x = array_ops.ones((3, 1), dtype=dtypes.float32) + output = script_ops.eager_py_func( + lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.float32]) ret = self.evaluate(output) - self.assertAllEqual(ret, [[[3], [3], [3]]]) + self.assertAllEqual(ret, [[[3.0], [3.0], [3.0]]]) @test_util.run_in_graph_and_eager_modes() def testEagerReturnNone(self): + with test_util.device(use_gpu=True): + def no_return_value(): + return - def no_return_value(): - return - - output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[]) - ret = self.evaluate(output) - if context.in_eager_mode(): - self.assertEquals(len(ret), 0) - else: - self.assertIsNone(ret) + output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[]) + ret = self.evaluate(output) + if context.in_eager_mode(): + self.assertEquals(len(ret), 0) + else: + self.assertIsNone(ret) @test_util.run_in_graph_and_eager_modes() def testEagerPyFuncInDefun(self): + with test_util.device(use_gpu=True): + def wrapper(): + a = array_ops.ones((3, 3), dtype=dtypes.float32) + x = 
array_ops.ones((3, 1), dtype=dtypes.float32) + return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32) - def wrapper(): - a = array_ops.ones((3, 3), dtype=dtypes.int32) - x = array_ops.ones((3, 1), dtype=dtypes.int32) - return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32) - - wrapped = function.defun(wrapper) - ret = self.evaluate(wrapped()) - self.assertAllEqual(ret, [[3], [3], [3]]) + wrapped = function.defun(wrapper) + ret = self.evaluate(wrapped()) + self.assertAllEqual(ret, [[3.0], [3.0], [3.0]]) @test_util.run_in_graph_and_eager_modes() def testEagerExceptionHandling(self): - self._testExceptionHandling( - ValueError, errors.InvalidArgumentError, eager=True) - self._testExceptionHandling( - TypeError, errors.InvalidArgumentError, eager=True) - self._testExceptionHandling( - StopIteration, errors.OutOfRangeError, eager=True) - self._testExceptionHandling( - MemoryError, errors.ResourceExhaustedError, eager=True) - self._testExceptionHandling( - NotImplementedError, errors.UnimplementedError, eager=True) + with test_util.device(use_gpu=True): + self._testExceptionHandling( + ValueError, errors.InvalidArgumentError, eager=True) + self._testExceptionHandling( + TypeError, errors.InvalidArgumentError, eager=True) + self._testExceptionHandling( + StopIteration, errors.OutOfRangeError, eager=True) + self._testExceptionHandling( + MemoryError, errors.ResourceExhaustedError, eager=True) + self._testExceptionHandling( + NotImplementedError, errors.UnimplementedError, eager=True) - class WeirdError(Exception): - pass + class WeirdError(Exception): + pass - self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True) + self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True) if __name__ == "__main__": diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py index 5a2903a4234..df37dd98ece 100644 --- a/tensorflow/python/kernel_tests/random/random_ops_test.py +++ b/tensorflow/python/kernel_tests/random/random_ops_test.py @@ -203,7 +203,8 @@ class RandomUniformTest(test.TestCase): return func def testRange(self): - for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64: + for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, + dtypes.int64): sampler = self._Sampler(1000, minv=-2, maxv=8, dtype=dt, use_gpu=True) x = sampler() self.assertTrue(-2 <= np.min(x)) @@ -213,7 +214,8 @@ class RandomUniformTest(test.TestCase): # to see the same sequence of values. Will catch buggy # implementations which uses the same random number seed. 
def testDistinct(self): - for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64: + for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, + dtypes.int64): maxv = 1.0 if dt.is_floating else 1 << 30 sampler = self._Sampler(1000, minv=0, maxv=maxv, dtype=dt, use_gpu=True) x = sampler() @@ -251,7 +253,8 @@ class RandomUniformTest(test.TestCase): # Checks that the CPU and GPU implementation returns the same results, # given the same random seed def testCPUGPUMatch(self): - for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64: + for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, + dtypes.int64): maxv = 1.0 if dt.is_floating else 17 results = {} for use_gpu in False, True: @@ -261,7 +264,8 @@ class RandomUniformTest(test.TestCase): self.assertAllEqual(results[False], results[True]) def testSeed(self): - for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64: + for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, + dtypes.int64): for seed in [345, 2**100, -2**100]: sx = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed) sy = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed) @@ -285,8 +289,7 @@ class RandomShapeTest(test.TestCase): self.assertEqual([1, 2, 3], rnd1.get_shape()) # Partially known shape. rnd2 = random_ops.truncated_normal( - array_ops.placeholder( - dtypes.int32, shape=(3,))) + array_ops.placeholder(dtypes.int32, shape=(3,))) self.assertEqual([None, None, None], rnd2.get_shape().as_list()) # Unknown shape. rnd3 = random_ops.truncated_normal(array_ops.placeholder(dtypes.int32)) @@ -298,8 +301,7 @@ class RandomShapeTest(test.TestCase): self.assertEqual([1, 2, 3], rnd1.get_shape()) # Partially known shape. rnd2 = random_ops.random_normal( - array_ops.placeholder( - dtypes.int32, shape=(3,))) + array_ops.placeholder(dtypes.int32, shape=(3,))) self.assertEqual([None, None, None], rnd2.get_shape().as_list()) # Unknown shape. rnd3 = random_ops.random_normal(array_ops.placeholder(dtypes.int32)) @@ -311,8 +313,7 @@ class RandomShapeTest(test.TestCase): self.assertEqual([1, 2, 3], rnd1.get_shape()) # Partially known shape. rnd2 = random_ops.random_uniform( - array_ops.placeholder( - dtypes.int32, shape=(3,))) + array_ops.placeholder(dtypes.int32, shape=(3,))) self.assertEqual([None, None, None], rnd2.get_shape().as_list()) # Unknown shape. 
rnd3 = random_ops.random_uniform(array_ops.placeholder(dtypes.int32)) diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py index b4b555591d0..cd945796881 100644 --- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py +++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py @@ -36,6 +36,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import test +from tensorflow.python.util import compat @test_util.with_c_api @@ -170,6 +171,17 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase): read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32) self.assertEqual(self.evaluate(read), [[3]]) + def testScatterUpdateString(self): + handle = resource_variable_ops.var_handle_op( + dtype=dtypes.string, shape=[1, 1]) + self.evaluate(resource_variable_ops.assign_variable_op( + handle, constant_op.constant([["a"]], dtype=dtypes.string))) + self.evaluate(resource_variable_ops.resource_scatter_update( + handle, [0], constant_op.constant([["b"]], dtype=dtypes.string))) + read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.string) + self.assertEqual(compat.as_bytes(self.evaluate(read)[0][0]), + compat.as_bytes("b")) + # TODO(alive): get this to work in Eager mode. def testGPU(self): with self.test_session(use_gpu=True): diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index 0c77d1db921..a86b65affec 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -23,6 +23,7 @@ import timeit import numpy as np +from six.moves import xrange from tensorflow.contrib import rnn as contrib_rnn from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py index be72c194072..bb3f6970e4f 100644 --- a/tensorflow/python/kernel_tests/softmax_op_test.py +++ b/tensorflow/python/kernel_tests/softmax_op_test.py @@ -25,11 +25,13 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test +@test_util.with_c_api class SoftmaxTest(test.TestCase): def _npSoftmax(self, features, dim=-1, log=False): @@ -174,8 +176,11 @@ class SoftmaxTest(test.TestCase): def testDimTooLarge(self): with self.test_session(): + # Use placeholder to make sure we get runtime error instead of shape + # inference error. + dim = array_ops.placeholder_with_default(100, shape=[]) with self.assertRaises(errors_impl.InvalidArgumentError): - nn_ops.softmax([1., 2., 3., 4.], dim=100).eval() + nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval() def testLargeDims(self): # Make sure that we properly handle large inputs. 
See diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py index 38205518b52..8ad29afd0a0 100644 --- a/tensorflow/python/kernel_tests/tensordot_op_test.py +++ b/tensorflow/python/kernel_tests/tensordot_op_test.py @@ -56,9 +56,11 @@ class TensordotTest(test_lib.TestCase): axes_ph = array_ops.placeholder(dtypes.int32) output = math_ops.tensordot(a_ph, b_ph, axes_ph) _ = sess.run( - [output], feed_dict={a_ph: a, - b_ph: b, - axes_ph: (a_axes, b_axes)}) + [output], feed_dict={ + a_ph: a, + b_ph: b, + axes_ph: (a_axes, b_axes) + }) def test_invalid_axes(self): a = [[1, 2], [3, 4]] @@ -81,28 +83,29 @@ class TensordotTest(test_lib.TestCase): with self.test_session() as sess: with self.assertRaises(errors_impl.InvalidArgumentError): _ = sess.run( - [output], feed_dict={a_ph: a, - b_ph: b, - axes_ph: axes_value}) + [output], feed_dict={ + a_ph: a, + b_ph: b, + axes_ph: axes_value + }) # Test case for 11950 def test_valid_axis(self): for axes_value in [1, 2], [[1], [2]], [[], []], 0: with self.test_session() as sess: - np_a = np.ones((3,3)) + np_a = np.ones((3, 3)) np_b = np.array([2, 3, 1])[None, None] np_ans = np.tensordot(np_a, np_b, axes_value) - tf_a = array_ops.ones((3,3), dtype=dtypes.float32) + tf_a = array_ops.ones((3, 3), dtype=dtypes.float32) tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None] tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval() self.assertAllEqual(tf_ans.shape, np_ans.shape) self.assertAllEqual(tf_ans, np_ans) - def test_partial_shape_inference(self): - for axes in ([1],[0]), 1: + for axes in ([1], [0]), 1: a = array_ops.placeholder(dtypes.float32) b = array_ops.placeholder(dtypes.float32) output = math_ops.tensordot(a, b, axes) @@ -169,9 +172,11 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): axes = array_ops.placeholder(dtypes.int32) c = math_ops.tensordot(a, b, axes) tf_ans = sess.run( - c, feed_dict={a: a_np, - b: b_np, - axes: (a_dims_np, b_dims_np)}) + c, feed_dict={ + a: a_np, + b: b_np, + axes: (a_dims_np, b_dims_np) + }) else: tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np)).eval() self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol) diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py index efb5b9f3641..6ab931fdb97 100644 --- a/tensorflow/python/kernel_tests/topk_op_test.py +++ b/tensorflow/python/kernel_tests/topk_op_test.py @@ -58,7 +58,7 @@ class TopKTest(test.TestCase): # Do some special casing of equality of indices: if indices # are not the same, but values are floating type, ensure that # the values are within epsilon of each other. 
- if not np.issubdtype(np_expected_values.dtype, np.float): + if not np.issubdtype(np_expected_values.dtype, np.floating): # Values are not floating point type; check indices exactly self.assertAllEqual(np_expected_indices, indices) else: diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py index f60ebf58f6f..b16c8c002c9 100644 --- a/tensorflow/python/kernel_tests/variables_test.py +++ b/tensorflow/python/kernel_tests/variables_test.py @@ -22,6 +22,7 @@ import operator import numpy as np +from tensorflow.python.eager import function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl @@ -509,6 +510,15 @@ class VariablesTestCase(test.TestCase): "", repr(var)) + def testVariableNamesPreserveNameScopesWithDefun(self): + @function.defun + def create_variable(): + with ops.name_scope("foo"): + v = variables.Variable(0.0, name="bar") + self.assertEqual(v.name, "foo/bar:0") + with ops.get_default_graph().as_default(): + create_variable() + class IsInitializedTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py index c6c7c4e26cb..e152f02d8e9 100644 --- a/tensorflow/python/kernel_tests/xent_op_test.py +++ b/tensorflow/python/kernel_tests/xent_op_test.py @@ -38,9 +38,8 @@ class XentTest(test.TestCase): dim = len(features.shape) - 1 one_only_on_dim = list(features.shape) one_only_on_dim[dim] = 1 - e = np.exp(features - np.reshape( - np.amax( - features, axis=dim), one_only_on_dim)) + e = np.exp( + features - np.reshape(np.amax(features, axis=dim), one_only_on_dim)) probs = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim) bp = (probs - labels) l = -np.sum(labels * np.log(probs + 1.0e-20), axis=dim) @@ -85,10 +84,10 @@ class XentTest(test.TestCase): def testRankTooLarge(self): for dtype in np.float16, np.float32: - np_features = np.array( - [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(dtype) - np_labels = np.array( - [[[0., 0., 0., 1.]], [[0., .5, .5, 0.]]]).astype(dtype) + np_features = np.array([[[1., 1., 1., 1.]], [[1., 2., 3., + 4.]]]).astype(dtype) + np_labels = np.array([[[0., 0., 0., 1.]], [[0., .5, .5, + 0.]]]).astype(dtype) self.assertRaisesRegexp(ValueError, "must be rank 2", gen_nn_ops._softmax_cross_entropy_with_logits, np_features, np_labels) @@ -121,8 +120,8 @@ class XentTest(test.TestCase): # = [1.3862, 1.9401] np_loss, np_backprop = self._npXent(np.array(features), np.array(labels)) self.assertAllClose( - np.array([[0.25, 0.25, 0.25, -0.75], - [0.0321, -0.4129, -0.2632, 0.6439]]), + np.array([[0.25, 0.25, 0.25, -0.75], [0.0321, -0.4129, -0.2632, + 0.6439]]), np_backprop, rtol=1.e-3, atol=1.e-3) @@ -168,15 +167,17 @@ class XentTest(test.TestCase): shape=[3, 4], dtype=dtypes.float64, name="f") - x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f, - name="xent") + x = nn_ops.softmax_cross_entropy_with_logits( + labels=l, logits=f, name="xent") err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3]) # Check that no extra computation performed. When only first derivative is requested, # second derivative must not be computed. So when there is no second derivative, # there is no `BatchMatMul` op in the graph. 
- op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def] - self.assertNotIn('BatchMatMul', op_names) + op_names = [ + op.op_def.name for op in sess.graph.get_operations() if op.op_def + ] + self.assertNotIn("BatchMatMul", op_names) print("cross entropy gradient err = ", err) self.assertLess(err, 5e-8) @@ -193,24 +194,29 @@ class XentTest(test.TestCase): shape=[3, 4], dtype=dtypes.float64, name="f") - x = nn_ops.softmax_cross_entropy_with_logits_v2(labels=l, logits=f, - name="xent") + x = nn_ops.softmax_cross_entropy_with_logits_v2( + labels=l, logits=f, name="xent") err = gradient_checker.compute_gradient_error(l, [3, 4], x, [3]) self.assertLess(err, 5e-8) def testSecondGradient(self): with self.test_session() as sess: - l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0, - 1.0/3, 0.0, 0.0, 0.0, - 0.0, 0.5/3, 0.0, 0.5/3], shape=[12], - dtype=dtypes.float64, name="l") - f = constant_op.constant([0.1, 0.2, 0.3, 0.4, - 0.1, 0.4, 0.9, 1.6, - 0.1, 0.8, 2.7, 6.4], shape=[12], - dtype=dtypes.float64, name="f") - x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f, - name="xent") + l = constant_op.constant( + [ + 0.0, 0.0, 1.0 / 3, 0.0, 1.0 / 3, 0.0, 0.0, 0.0, 0.0, 0.5 / 3, 0.0, + 0.5 / 3 + ], + shape=[12], + dtype=dtypes.float64, + name="l") + f = constant_op.constant( + [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4], + shape=[12], + dtype=dtypes.float64, + name="f") + x = nn_ops.softmax_cross_entropy_with_logits( + labels=l, logits=f, name="xent") loss = math_ops.reduce_sum(x) gradients = gradients_impl.gradients(loss, [f])[0] @@ -219,20 +225,23 @@ class XentTest(test.TestCase): # Check that second derivative is calculated. # (it is equivalent to being `BatchMatMul` op in the graph because of implementation of xentropy grad) - op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def] - self.assertIn('BatchMatMul', op_names) + op_names = [ + op.op_def.name for op in sess.graph.get_operations() if op.op_def + ] + self.assertIn("BatchMatMul", op_names) print("cross entropy hessian err = ", err) self.assertLess(err, 5e-8) def testWrapper(self): - features = np.array( - [[[1., 1., 1., 1.], [1., 2., 3., 4.]], - [[2., 3., 4., 5.], [6., 7., 8., 9.]], - [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32) + features = np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]], + [[2., 3., 4., 5.], [6., 7., 8., 9.]], + [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype( + np.float32) labels = np.array([[[0., 0., 0., 1.], [0., 1., 0., 0.]], [[0., 0.5, 0.5, 0.], [0.5, 0.5, 0., 0.]], - [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype(np.float32) + [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype( + np.float32) self._testXentWrapper(features, labels, dim=0, use_gpu=False) self._testXentWrapper(features, labels, dim=0, use_gpu=True) self._testXentWrapper(features, labels, dim=1, use_gpu=False) diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py index 5d9feb07b44..5dea732cbaa 100644 --- a/tensorflow/python/layers/base.py +++ b/tensorflow/python/layers/base.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.layers import utils as layers_util +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as tf_variables @@ -139,9 +140,6 @@ class Layer(object): 
self._init_set_name(name) - # Holds functions for creating regularizer ops. - self._regularizer_factories = [] - # Determine variable scope. scope = kwargs.get('_scope') if scope: @@ -306,22 +304,6 @@ class Layer(object): inputs_hash = None return self._per_input_updates.get(inputs_hash, []) - def _get_regularizer_factories(self): - try: - # Some subclasses of Layer do not use its constructor. - return self._regularizer_factories - except AttributeError: - self._regularizer_factories = [] - return self._regularizer_factories - - def _maybe_create_variable_regularizers(self): - """Creates added but uninstantiated regularizers.""" - factories = self._get_regularizer_factories() - if factories: - for factory in factories: - factory() - factories[:] = [] - @property def losses(self): """Losses which are associated with this `Layer`. @@ -333,7 +315,6 @@ class Layer(object): Returns: A list of tensors. """ - self._maybe_create_variable_regularizers() if context.in_eager_mode(): # _losses may only contain variable regularization losses when executing # eagerly, and they have been saved as lambdas to be executed when @@ -417,7 +398,6 @@ class Layer(object): inputs_hash = layers_util.object_list_uid(inputs) else: inputs_hash = None - self._maybe_create_variable_regularizers() return self._per_input_losses.get(inputs_hash, []) def build(self, _): @@ -670,6 +650,7 @@ class Layer(object): else: scope_context_manager = vs.variable_scope( self._scope, reuse=self._reuse, auxiliary_name_scope=False) + input_shapes = None with scope_context_manager as scope: with ops.name_scope(self._name_scope_name(scope)): if not self.built: @@ -719,6 +700,9 @@ class Layer(object): else: # Deferred mode behavior: use `compute_output_shape` to # infer the number of outputs of the layer and their shapes. + if input_shapes is None: + input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs) + output_shapes = self.compute_output_shape(input_shapes) output_shapes = nest.flatten(output_shapes) outputs = [ @@ -1414,7 +1398,10 @@ class _DeferredTensor(object): def __init__(self, shape, dtype, name=None): self.shape = tensor_shape.TensorShape(shape) - self.dtype = dtypes.as_dtype(dtype) + if dtype is None: + self.dtype = dtypes.as_dtype(np.float32) + else: + self.dtype = dtypes.as_dtype(dtype) self.name = name def get_shape(self): diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py index e5b93a54f79..7bf62d45b8e 100644 --- a/tensorflow/python/layers/core.py +++ b/tensorflow/python/layers/core.py @@ -49,9 +49,6 @@ class Dense(base.Layer): and `bias` is a bias vector created by the layer (only if `use_bias` is `True`). - Note: if the input to the layer has a rank greater than 2, then it is - flattened prior to the initial matrix multiply by `kernel`. - Arguments: units: Integer or Long, dimensionality of the output space. activation: Activation function (callable). Set it to None to maintain a @@ -199,9 +196,6 @@ def dense( and `bias` is a bias vector created by the layer (only if `use_bias` is `True`). - Note: if the `inputs` tensor has a rank greater than 2, then it is - flattened prior to the initial matrix multiply by `kernel`. - Arguments: inputs: Tensor input. units: Integer or Long, dimensionality of the output space. @@ -230,7 +224,8 @@ def dense( by the same name. Returns: - Output tensor. + Output tensor the same shape as `inputs` except the last dimension is of + size `units`. Raises: ValueError: if eager execution is enabled. 
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py index 0a5dd57621b..745843975c4 100644 --- a/tensorflow/python/layers/network.py +++ b/tensorflow/python/layers/network.py @@ -621,6 +621,11 @@ class GraphNetwork(base.Layer): A list of loss tensors. """ losses = [] + if context.in_eager_mode(): + for layer in self.layers: + losses += layer.losses + return losses + # Retrieve losses for all internal layers. for layer in self.layers: if hasattr(layer, 'losses'): @@ -853,7 +858,6 @@ class GraphNetwork(base.Layer): for node in nodes: # This is always a single layer, never a list. layer = node.outbound_layer - reference_input_tensors = node.input_tensors reference_output_tensors = node.output_tensors @@ -901,12 +905,13 @@ class GraphNetwork(base.Layer): else: output_masks = [None for _ in range(len(output_tensors))] - # Apply activity regularizer if any: - if layer.activity_regularizer is not None: - regularization_losses = [ - layer.activity_regularizer(x) for x in computed_tensors - ] - layer.add_loss(regularization_losses, computed_tensors) + if context.in_graph_mode(): + if layer.activity_regularizer is not None: + regularization_losses = [ + layer.activity_regularizer(x) for x in computed_tensors + ] + # Apply activity regularizer if any: + layer.add_loss(regularization_losses, computed_tensors) if context.in_graph_mode(): # Update model updates and losses: diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc index d3bfa0ee337..e0422ef80ad 100644 --- a/tensorflow/python/lib/core/py_func.cc +++ b/tensorflow/python/lib/core/py_func.cc @@ -19,6 +19,7 @@ limitations under the License. #include "numpy/arrayobject.h" #include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/framework/allocation_description.pb.h" #include "tensorflow/core/framework/op_kernel.h" @@ -53,6 +54,12 @@ struct PyCall { // with this "token". string token; + // The device on which Tensors are stored; only used for EagerPyFunc. + Device* device; + + // True if and only if the op has been placed on a GPU. + bool gpu; + // True if the call is associated with an EagerPyFunc. bool eager; @@ -71,7 +78,12 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) { PyObject* arg = nullptr; const Tensor& t = call->ins[i]; if (call->eager) { - arg = EagerTensorFromHandle(TFE_NewTensorHandle(t)); + if (call->gpu) { + arg = EagerTensorFromHandle(new TFE_TensorHandle(t, call->device)); + } else { + // TFE_TensorHandle assumes that CPU is identified by `nullptr`. + arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr)); + } if (arg == nullptr) { return errors::Internal("Unable to procure EagerTensor from Tensor."); } @@ -84,7 +96,8 @@ Status MakeArgTuple(const PyCall* call, PyObject** tuple) { } PyList_SetItem(lst, i, arg); } - *tuple = Py_BuildValue("(sN)", call->token.c_str(), lst); + *tuple = Py_BuildValue("(sON)", call->token.c_str(), + call->gpu ? Py_True : Py_False, lst); CHECK(*tuple); return Status::OK(); } @@ -150,15 +163,9 @@ bool IsSingleNone(PyObject* obj) { } // Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`. 
-Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor, - Tensor* output_tensor, - TF_Status* tf_status) { - // TODO(akshayka): Lift the restriction requiring output tensors to - // lie in host memory; EagerPyFunc should be able to dispatch ops on GPU - // tensors, so we should eventually implement a GPU kernel for EagerPyFunc. - *output_tensor = *TFE_TensorHandleUnderlyingTensorInHostMemory( - EagerTensor_Handle(eager_tensor), tf_status); - return StatusFromTF_Status(tf_status); +void ExtractTensorFromEagerTensor(const PyObject* eager_tensor, + Tensor* output_tensor) { + *output_tensor = EagerTensor_Handle(eager_tensor)->t; } // Calls the registered py function through the trampoline. @@ -201,15 +208,23 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { } // Process the return values and convert them to TF Tensors. - Status s; + Status s = Status::OK(); if (PyList_Check(result)) { + // `result` is a Python list; if this operation is an `EagerPyFunc`, then + // every item in the list must be an `EagerTensor`; otherwise, every element + // must be a NumPy array. call->out.clear(); for (int i = 0; i < PyList_Size(result); ++i) { Tensor t; if (call->eager) { - auto tf_status = tensorflow::make_safe(TF_NewStatus()); - s = ExtractTensorFromEagerTensor(PyList_GetItem(result, i), &t, - tf_status.get()); + const PyObject* item = PyList_GetItem(result, i); + if (EagerTensor_CheckExact(item)) { + ExtractTensorFromEagerTensor(item, &t); + } else { + s = errors::FailedPrecondition( + "Expected EagerTensor, found PyObject of type: ", + Py_TYPE(item)->tp_name); + } } else { s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t); } @@ -220,16 +235,15 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { call->out.push_back(t); } } else if (EagerTensor_CheckExact(result) || result == Py_None) { + // result is an `EagerTensor` or `None`. DCHECK(call->eager); Tensor t; if (result != Py_None) { - auto tf_status = tensorflow::make_safe(TF_NewStatus()); - s = ExtractTensorFromEagerTensor(result, &t, tf_status.get()); - if (s.ok()) { - call->out.push_back(t); - } + ExtractTensorFromEagerTensor(result, &t); + call->out.push_back(t); } } else if (PyArray_Check(result)) { + // `result` is a NumPy array. DCHECK(!call->eager); if (!IsSingleNone(result)) { Tensor t; @@ -239,7 +253,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) { } } } else { - s = errors::Internal("Unexpected pyobject is returned: ", + s = errors::Internal("Unexpected PyObject was returned: ", Py_TYPE(result)->tp_name); } Py_DECREF(result); @@ -429,12 +443,24 @@ class PyFuncOp : public OpKernel { explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_)); eager_ = type_string() == "EagerPyFunc"; + gpu_ = ctx->device_type().type_string() == DEVICE_GPU; } void Compute(OpKernelContext* ctx) override { PyCall call; call.token = token_; + call.gpu = gpu_; call.eager = eager_; + if (call.eager) { + // Eager's C API uses `Device`, whereas `OpKernelContext` stores a + // `DeviceBase`; attempt to downcast. + call.device = dynamic_cast<Device*>(ctx->device()); + if (call.device == nullptr) { + ctx->CtxFailureWithWarning( + errors::Internal("Unrecognized device class")); + } + } + for (int i = 0; i < ctx->num_inputs(); ++i) { call.ins.push_back(ctx->input(i)); } @@ -476,6 +502,9 @@ class PyFuncOp : public OpKernel { private: string token_; + // True if and only if this op has been placed on a GPU.
+ bool gpu_; + // True if and only if this op should execute the python function eagerly, // i.e., if and only if the eager attribute is set. bool eager_; @@ -486,5 +515,6 @@ class PyFuncOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("PyFunc").Device(DEVICE_CPU), PyFuncOp); REGISTER_KERNEL_BUILDER(Name("PyFuncStateless").Device(DEVICE_CPU), PyFuncOp); REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_CPU), PyFuncOp); +REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_GPU), PyFuncOp); } // end namespace tensorflow diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py index 4e3071d8513..59f5075f177 100644 --- a/tensorflow/python/lib/io/file_io.py +++ b/tensorflow/python/lib/io/file_io.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import c_api_util from tensorflow.python.framework import errors from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util.tf_export import tf_export class FileIO(object): @@ -235,6 +236,7 @@ class FileIO(object): self._writable_file = None +@tf_export("gfile.Exists") def file_exists(filename): """Determines whether a path exists or not. @@ -256,6 +258,7 @@ def file_exists(filename): return True +@tf_export("gfile.Remove") def delete_file(filename): """Deletes the file located at 'filename'. @@ -306,6 +309,7 @@ def write_string_to_file(filename, file_content): f.write(file_content) +@tf_export("gfile.Glob") def get_matching_files(filename): """Returns a list of files that match the given pattern(s). @@ -336,6 +340,7 @@ def get_matching_files(filename): ] +@tf_export("gfile.MkDir") def create_dir(dirname): """Creates a directory with the name 'dirname'. @@ -353,6 +358,7 @@ def create_dir(dirname): pywrap_tensorflow.CreateDir(compat.as_bytes(dirname), status) +@tf_export("gfile.MakeDirs") def recursive_create_dir(dirname): """Creates a directory and all parent/intermediate directories. @@ -368,6 +374,7 @@ def recursive_create_dir(dirname): pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(dirname), status) +@tf_export("gfile.Copy") def copy(oldpath, newpath, overwrite=False): """Copies data from oldpath to newpath. @@ -385,6 +392,7 @@ def copy(oldpath, newpath, overwrite=False): compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status) +@tf_export("gfile.Rename") def rename(oldname, newname, overwrite=False): """Rename or move a file / directory. @@ -426,6 +434,7 @@ def atomic_write_string_to_file(filename, contents, overwrite=True): raise +@tf_export("gfile.DeleteRecursively") def delete_recursively(dirname): """Deletes everything under dirname recursively. @@ -439,6 +448,7 @@ def delete_recursively(dirname): pywrap_tensorflow.DeleteRecursively(compat.as_bytes(dirname), status) +@tf_export("gfile.IsDirectory") def is_directory(dirname): """Returns whether the path is a directory or not. @@ -452,6 +462,7 @@ def is_directory(dirname): return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status) +@tf_export("gfile.ListDirectory") def list_directory(dirname): """Returns a list of entries contained within a directory. @@ -479,6 +490,7 @@ def list_directory(dirname): ] +@tf_export("gfile.Walk") def walk(top, in_order=True): """Recursive directory tree generator for directories. @@ -522,6 +534,7 @@ def walk(top, in_order=True): yield here +@tf_export("gfile.Stat") def stat(filename): """Returns file statistics for a given path. 
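The `@tf_export("gfile.*")` decorators added to `file_io.py` above publish these helpers under the public `tf.gfile` namespace. A short usage sketch, assuming the exported names resolve to `tf.gfile.Exists`, `tf.gfile.MakeDirs`, and so on as the decorators indicate (the path is hypothetical, not taken from the patch):

```python
import tensorflow as tf

path = "/tmp/example_dir"  # hypothetical path
if not tf.gfile.Exists(path):        # file_exists
    tf.gfile.MakeDirs(path)          # recursive_create_dir
for name in tf.gfile.ListDirectory(path):   # list_directory
    entry = path + "/" + name
    print(entry, tf.gfile.IsDirectory(entry))  # is_directory
```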
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py index df190100689..48ea107a146 100644 --- a/tensorflow/python/lib/io/tf_record.py +++ b/tensorflow/python/lib/io/tf_record.py @@ -22,8 +22,10 @@ from __future__ import print_function from tensorflow.python import pywrap_tensorflow from tensorflow.python.framework import errors from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export +@tf_export("python_io.TFRecordCompressionType") class TFRecordCompressionType(object): """The type of compression for the record.""" NONE = 0 @@ -33,6 +35,7 @@ class TFRecordCompressionType(object): # NOTE(vrv): This will eventually be converted into a proto. to match # the interface used by the C++ RecordWriter. +@tf_export("python_io.TFRecordOptions") class TFRecordOptions(object): """Options used for manipulating TFRecord files.""" compression_type_map = { @@ -51,6 +54,7 @@ class TFRecordOptions(object): return cls.compression_type_map[options.compression_type] +@tf_export("python_io.tf_record_iterator") def tf_record_iterator(path, options=None): """An iterator that read the records from a TFRecords file. @@ -81,6 +85,7 @@ def tf_record_iterator(path, options=None): reader.Close() +@tf_export("python_io.TFRecordWriter") class TFRecordWriter(object): """A class to write records to a TFRecords file. diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 24a0c186198..9541b097a94 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -34,6 +34,7 @@ See the @{$python/array_ops} guide. @@reshape @@squeeze @@expand_dims +@@unravel_index @@meshgrid @@slice @@strided_slice diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 49191c647d5..33a92631d0b 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -261,10 +261,10 @@ def _Enter(data, data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True) if isinstance(data, ops.Tensor): if data.dtype._is_ref_dtype and use_ref: # pylint: disable=protected-access - result = ref_enter( + result = gen_control_flow_ops._ref_enter( data, frame_name, is_constant, parallel_iterations, name=name) else: - result = enter( + result = gen_control_flow_ops._enter( data, frame_name, is_constant, parallel_iterations, name=name) if use_input_shape: result.set_shape(data.get_shape()) @@ -279,7 +279,7 @@ def _Enter(data, parallel_iterations=parallel_iterations, use_input_shape=use_input_shape, name=name) - indices = enter( + indices = gen_control_flow_ops._enter( data.indices, frame_name, is_constant, @@ -290,7 +290,7 @@ def _Enter(data, if isinstance(data, ops.IndexedSlices): dense_shape = data.dense_shape if dense_shape is not None: - dense_shape = enter( + dense_shape = gen_control_flow_ops._enter( dense_shape, frame_name, is_constant, @@ -300,7 +300,7 @@ def _Enter(data, dense_shape.set_shape(data.dense_shape.get_shape()) return ops.IndexedSlices(values, indices, dense_shape) else: - dense_shape = enter( + dense_shape = gen_control_flow_ops._enter( data.dense_shape, frame_name, is_constant, diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py index 314726ede6c..230b6c5946a 100644 --- a/tensorflow/python/ops/gradients_impl.py +++ b/tensorflow/python/ops/gradients_impl.py @@ -44,6 +44,7 @@ from tensorflow.python.ops import image_grad # pylint: disable=unused-import from 
tensorflow.python.ops import linalg_grad # pylint: disable=unused-import from tensorflow.python.ops import linalg_ops # pylint: disable=unused-import from tensorflow.python.ops import logging_ops # pylint: disable=unused-import +from tensorflow.python.ops import manip_grad # pylint: disable=unused-import from tensorflow.python.ops import math_grad # pylint: disable=unused-import from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index cab1025df11..22636fdbb3f 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1691,7 +1691,8 @@ def rgb_to_yiq(images): images: tensor with the same shape as `images`. """ images = ops.convert_to_tensor(images, name='images') - kernel = ops.convert_to_tensor(_rgb_to_yiq_kernel, dtype=images.dtype, name='kernel') + kernel = ops.convert_to_tensor( + _rgb_to_yiq_kernel, dtype=images.dtype, name='kernel') ndims = images.get_shape().ndims return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]]) @@ -1717,7 +1718,8 @@ def yiq_to_rgb(images): images: tensor with the same shape as `images`. """ images = ops.convert_to_tensor(images, name='images') - kernel = ops.convert_to_tensor(_yiq_to_rgb_kernel, dtype=images.dtype, name='kernel') + kernel = ops.convert_to_tensor( + _yiq_to_rgb_kernel, dtype=images.dtype, name='kernel') ndims = images.get_shape().ndims return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]]) @@ -1742,7 +1744,8 @@ def rgb_to_yuv(images): images: tensor with the same shape as `images`. """ images = ops.convert_to_tensor(images, name='images') - kernel = ops.convert_to_tensor(_rgb_to_yuv_kernel, dtype=images.dtype, name='kernel') + kernel = ops.convert_to_tensor( + _rgb_to_yuv_kernel, dtype=images.dtype, name='kernel') ndims = images.get_shape().ndims return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]]) @@ -1768,7 +1771,8 @@ def yuv_to_rgb(images): images: tensor with the same shape as `images`. """ images = ops.convert_to_tensor(images, name='images') - kernel = ops.convert_to_tensor(_yuv_to_rgb_kernel, dtype=images.dtype, name='kernel') + kernel = ops.convert_to_tensor( + _yuv_to_rgb_kernel, dtype=images.dtype, name='kernel') ndims = images.get_shape().ndims return math_ops.tensordot(images, kernel, axes=[[ndims-1], [0]]) diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 98343846342..82b77ee8e37 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -252,11 +252,11 @@ class AdjustGamma(test_util.TensorFlowTestCase): with self.test_session(): x_data = np.random.uniform(0, 255, (8, 8)) x_np = np.array(x_data, dtype=np.float32) - + x = constant_op.constant(x_np, shape=x_np.shape) - err_msg = 'Gamma should be a non-negative real number.' - + err_msg = "Gamma should be a non-negative real number." + try: image_ops.adjust_gamma(x, gamma=-1) except Exception as e: @@ -270,13 +270,13 @@ class AdjustGamma(test_util.TensorFlowTestCase): with self.test_session(): x_data = np.random.uniform(0, 255, (8, 8)) x_np = np.array(x_data, dtype=np.float32) - + x = constant_op.constant(x_np, shape=x_np.shape) y = constant_op.constant(-1.0, dtype=dtypes.float32) - + image = image_ops.adjust_gamma(x, gamma=y) - - err_msg = 'Gamma should be a non-negative real number.' + + err_msg = "Gamma should be a non-negative real number." 
try: image.eval() except Exception as e: @@ -284,7 +284,7 @@ class AdjustGamma(test_util.TensorFlowTestCase): raise else: raise AssertionError("Exception not raised: %s" % err_msg) - + def test_adjust_gamma_zero(self): """White image should be returned for gamma equal to zero""" with self.test_session(): @@ -311,13 +311,13 @@ class AdjustGamma(test_util.TensorFlowTestCase): y_tf = np.trunc(y.eval()) y_np = np.array( - [[0, 31, 45, 55, 63, 71, 78, 84], - [90, 95, 100, 105, 110, 115, 119, 123], - [127, 131, 135, 139, 142, 146, 149, 153], - [156, 159, 162, 165, 168, 171, 174, 177], - [180, 183, 186, 188, 191, 194, 196, 199], - [201, 204, 206, 209, 211, 214, 216, 218], - [221, 223, 225, 228, 230, 232, 234, 236], + [[0, 31, 45, 55, 63, 71, 78, 84], [ + 90, 95, 100, 105, 110, 115, 119, 123 + ], [127, 131, 135, 139, 142, 146, 149, 153], [ + 156, 159, 162, 165, 168, 171, 174, 177 + ], [180, 183, 186, 188, 191, 194, 196, 199], [ + 201, 204, 206, 209, 211, 214, 216, 218 + ], [221, 223, 225, 228, 230, 232, 234, 236], [238, 241, 243, 245, 247, 249, 251, 253]], dtype=np.float32) @@ -332,14 +332,12 @@ class AdjustGamma(test_util.TensorFlowTestCase): y_tf = np.trunc(y.eval()) y_np = np.array( - [[0, 0, 0, 0, 1, 1, 2, 3], - [4, 5, 6, 7, 9, 10, 12, 14], - [16, 18, 20, 22, 25, 27, 30, 33], - [36, 39, 42, 45, 49, 52, 56, 60], - [64, 68, 72, 76, 81, 85, 90, 95], - [100, 105, 110, 116, 121, 127, 132, 138], - [144, 150, 156, 163, 169, 176, 182, 189], - [196, 203, 211, 218, 225, 233, 241, 249]], + [[0, 0, 0, 0, 1, 1, 2, 3], [4, 5, 6, 7, 9, 10, 12, 14], [ + 16, 18, 20, 22, 25, 27, 30, 33 + ], [36, 39, 42, 45, 49, 52, 56, 60], [64, 68, 72, 76, 81, 85, 90, 95], + [100, 105, 110, 116, 121, 127, 132, 138], [ + 144, 150, 156, 163, 169, 176, 182, 189 + ], [196, 203, 211, 218, 225, 233, 241, 249]], dtype=np.float32) self.assertAllClose(y_tf, y_np, 1e-6) @@ -483,8 +481,7 @@ class FlipImageBenchmark(test.Benchmark): with session.Session("", graph=ops.Graph(), config=config) as sess: with ops.device(device): inputs = variables.Variable( - random_ops.random_uniform( - image_shape, dtype=dtypes.float32) * 255, + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, trainable=False, dtype=dtypes.float32) run_op = image_ops.flip_left_right(inputs) @@ -514,8 +511,7 @@ class FlipImageBenchmark(test.Benchmark): with session.Session("", graph=ops.Graph(), config=config) as sess: with ops.device(device): inputs = variables.Variable( - random_ops.random_uniform( - image_shape, dtype=dtypes.float32) * 255, + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, trainable=False, dtype=dtypes.float32) run_op = image_ops.random_flip_left_right(inputs) @@ -566,8 +562,7 @@ class AdjustHueBenchmark(test.Benchmark): with session.Session("", graph=ops.Graph(), config=config) as sess: with ops.device(device): inputs = variables.Variable( - random_ops.random_uniform( - image_shape, dtype=dtypes.float32) * 255, + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, trainable=False, dtype=dtypes.float32) delta = constant_op.constant(0.1, dtype=dtypes.float32) @@ -611,8 +606,7 @@ class AdjustSaturationBenchmark(test.Benchmark): with session.Session("", graph=ops.Graph(), config=config) as sess: with ops.device(device): inputs = variables.Variable( - random_ops.random_uniform( - image_shape, dtype=dtypes.float32) * 255, + random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255, trainable=False, dtype=dtypes.float32) delta = constant_op.constant(0.1, dtype=dtypes.float32) @@ -667,10 +661,11 @@ 
class ResizeBilinearBenchmark(test.Benchmark): results = self.run_op_benchmark( sess, benchmark_op, - name=("resize_bilinear_%s_%s_%s" % - (image_size[0], image_size[1], num_channels))) - print("%s : %.2f ms/img" % (results["name"], 1000 * results["wall_time"] - / (batch_size * num_ops))) + name=("resize_bilinear_%s_%s_%s" % (image_size[0], image_size[1], + num_channels))) + print("%s : %.2f ms/img" % + (results["name"], + 1000 * results["wall_time"] / (batch_size * num_ops))) def benchmarkSimilar3Channel(self): self._benchmarkResize((183, 229), 3) @@ -717,8 +712,9 @@ class ResizeBicubicBenchmark(test.Benchmark): min_iters=20, name=("resize_bicubic_%s_%s_%s" % (image_size[0], image_size[1], num_channels))) - print("%s : %.2f ms/img" % (results["name"], 1000 * results["wall_time"] - / (batch_size * num_ops))) + print("%s : %.2f ms/img" % + (results["name"], + 1000 * results["wall_time"] / (batch_size * num_ops))) def benchmarkSimilar3Channel(self): self._benchmarkResize((183, 229), 3) @@ -754,8 +750,8 @@ class ResizeAreaBenchmark(test.Benchmark): batch_size = 1 num_ops = 1000 img = variables.Variable( - random_ops.random_normal([batch_size, image_size[0], - image_size[1], num_channels]), + random_ops.random_normal( + [batch_size, image_size[0], image_size[1], num_channels]), name="img") deps = [] @@ -768,12 +764,13 @@ class ResizeAreaBenchmark(test.Benchmark): with session.Session() as sess: sess.run(variables.global_variables_initializer()) results = self.run_op_benchmark( - sess, benchmark_op, - name=("resize_area_%s_%s_%s" % - (image_size[0], image_size[1], num_channels))) - print("%s : %.2f ms/img" % ( - results["name"], - 1000*results["wall_time"] / (batch_size * num_ops))) + sess, + benchmark_op, + name=("resize_area_%s_%s_%s" % (image_size[0], image_size[1], + num_channels))) + print("%s : %.2f ms/img" % + (results["name"], + 1000 * results["wall_time"] / (batch_size * num_ops))) def benchmarkSimilar3Channel(self): self._benchmarkResize((183, 229), 3) @@ -847,8 +844,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase): flt_image = image_ops.convert_image_dtype(image, dtypes.float32) saturation_adjusted_image = gen_image_ops.adjust_saturation( flt_image, saturation_factor) - return image_ops.convert_image_dtype(saturation_adjusted_image, - orig_dtype) + return image_ops.convert_image_dtype(saturation_adjusted_image, orig_dtype) def testHalfSaturationFused(self): x_shape = [2, 2, 3] @@ -953,7 +949,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.flip_left_right(x_tf) - self.assertTrue(y.op.name.startswith('flip_left_right')) + self.assertTrue(y.op.name.startswith("flip_left_right")) y_tf = y.eval() self.assertAllEqual(y_tf, y_np) @@ -964,7 +960,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.random_flip_left_right(x_tf) - self.assertTrue(y.op.name.startswith('random_flip_left_right')) + self.assertTrue(y.op.name.startswith("random_flip_left_right")) count_flipped = 0 count_unflipped = 0 @@ -995,7 +991,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.flip_up_down(x_tf) - self.assertTrue(y.op.name.startswith('flip_up_down')) + self.assertTrue(y.op.name.startswith("flip_up_down")) y_tf = y.eval() self.assertAllEqual(y_tf, 
y_np) @@ -1006,7 +1002,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.random_flip_up_down(x_tf) - self.assertTrue(y.op.name.startswith('random_flip_up_down')) + self.assertTrue(y.op.name.startswith("random_flip_up_down")) count_flipped = 0 count_unflipped = 0 for _ in range(50): @@ -1036,7 +1032,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x_tf = constant_op.constant(x_np, shape=x_np.shape) y = image_ops.transpose_image(x_tf) - self.assertTrue(y.op.name.startswith('transpose_image')) + self.assertTrue(y.op.name.startswith("transpose_image")) y_tf = y.eval() self.assertAllEqual(y_tf, y_np) @@ -1261,7 +1257,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x = constant_op.constant(x_np, shape=x_shape) y = image_ops.per_image_standardization(x) - self.assertTrue(y.op.name.startswith('per_image_standardization')) + self.assertTrue(y.op.name.startswith("per_image_standardization")) y_tf = y.eval() self.assertAllClose(y_tf, y_np, atol=1e-4) @@ -1433,9 +1429,10 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase): # Each line is a test configuration: # (offset_height, offset_width, target_height, target_width), err_msg - test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"), - ([0, -1, 3, 3], "offset_width must be >= 0"), - ([0, 0, 0, 3], "target_height must be > 0"), + test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"), ([ + 0, -1, 3, 3 + ], "offset_width must be >= 0"), ([0, 0, 0, 3], + "target_height must be > 0"), ([0, 0, 3, 0], "target_width must be > 0"), ([2, 0, 3, 3], "height must be >= target + offset"), ([0, 2, 3, 3], "width must be >= target + offset")) @@ -1446,7 +1443,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase): def testNameScope(self): image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3]) y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66) - self.assertTrue(y.name.startswith('crop_to_bounding_box')) + self.assertTrue(y.name.startswith("crop_to_bounding_box")) class CentralCropTest(test_util.TensorFlowTestCase): @@ -1471,9 +1468,10 @@ class CentralCropTest(test_util.TensorFlowTestCase): def testCropping(self): x_shape = [4, 8, 1] - x_np = np.array([[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8], - [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]], - dtype=np.int32).reshape(x_shape) + x_np = np.array( + [[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8], + [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]], + dtype=np.int32).reshape(x_shape) y_np = np.array([[3, 4, 5, 6], [3, 4, 5, 6]]).reshape([2, 4, 1]) with self.test_session(use_gpu=True): x = constant_op.constant(x_np, shape=x_shape) @@ -1490,7 +1488,7 @@ class CentralCropTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): x = array_ops.placeholder(shape=x_shape, dtype=dtypes.int32) y = image_ops.central_crop(x, 0.33) - y_tf = y.eval(feed_dict={x:x_np}) + y_tf = y.eval(feed_dict={x: x_np}) self.assertAllEqual(y_tf, y_np) self.assertAllEqual(y_tf.shape, y_np.shape) @@ -1529,7 +1527,7 @@ class CentralCropTest(test_util.TensorFlowTestCase): x_np = np.ones(x_shape, dtype=np.float32) with self.test_session(use_gpu=True): y = image_ops.central_crop(x_np, 1.0) - self.assertTrue(y.op.name.startswith('central_crop')) + self.assertTrue(y.op.name.startswith("central_crop")) class 
PadToBoundingBoxTest(test_util.TensorFlowTestCase): @@ -1602,15 +1600,10 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase): self.assertEqual(y.get_shape().as_list(), post_shape) def testInt64(self): - x = [1, 2, 3, - 4, 5, 6, - 7, 8, 9] + x = [1, 2, 3, 4, 5, 6, 7, 8, 9] x_shape = [3, 3, 1] - y = [0, 0, 0, - 1, 2, 3, - 4, 5, 6, - 7, 8, 9] + y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] y_shape = [4, 3, 1] x = np.array(x).reshape(x_shape) y = np.array(y).reshape(y_shape) @@ -1627,38 +1620,26 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase): self._assertReturns(x, x_shape, offset_height, offset_width, x, x_shape) def testPadding(self): - x = [1, 2, 3, - 4, 5, 6, - 7, 8, 9] + x = [1, 2, 3, 4, 5, 6, 7, 8, 9] x_shape = [3, 3, 1] offset_height, offset_width = [1, 0] - y = [0, 0, 0, - 1, 2, 3, - 4, 5, 6, - 7, 8, 9] + y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] y_shape = [4, 3, 1] self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape) offset_height, offset_width = [0, 1] - y = [0, 1, 2, 3, - 0, 4, 5, 6, - 0, 7, 8, 9] + y = [0, 1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9] y_shape = [3, 4, 1] self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape) offset_height, offset_width = [0, 0] - y = [1, 2, 3, - 4, 5, 6, - 7, 8, 9, - 0, 0, 0] + y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0] y_shape = [4, 3, 1] self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape) offset_height, offset_width = [0, 0] - y = [1, 2, 3, 0, - 4, 5, 6, 0, - 7, 8, 9, 0] + y = [1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9, 0] y_shape = [3, 4, 1] self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape) @@ -1690,9 +1671,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase): # Input image has 0-length dimension(s). # Each line is a test configuration: # x_shape, target_height, target_width - test_config = (([0, 2, 2], 2, 2), - ([2, 0, 2], 2, 2), - ([2, 2, 0], 2, 2)) + test_config = (([0, 2, 2], 2, 2), ([2, 0, 2], 2, 2), ([2, 2, 0], 2, 2)) offset_height, offset_width = [0, 0] x = [] @@ -1737,7 +1716,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase): def testNameScope(self): image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3]) y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66) - self.assertTrue(y.op.name.startswith('pad_to_bounding_box')) + self.assertTrue(y.op.name.startswith("pad_to_bounding_box")) class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): @@ -1750,8 +1729,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): (bounding_box[2] - bounding_box[0])) image_size_np = np.array(image.shape, dtype=np.int32) - bounding_box_np = (np.array( - bounding_box, dtype=np.float32).reshape([1, 1, 4])) + bounding_box_np = ( + np.array(bounding_box, dtype=np.float32).reshape([1, 1, 4])) aspect_ratios = [] area_ratios = [] @@ -1796,7 +1775,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): y = array_ops.strided_slice(image_tf, begin, begin + size) for _ in xrange(num_iter): - y_tf = y.eval(feed_dict={min_object_covered_placeholder: min_object_covered}) + y_tf = y.eval(feed_dict={ + min_object_covered_placeholder: min_object_covered + }) crop_height = y_tf.shape[0] crop_width = y_tf.shape[1] aspect_ratio = float(crop_width) / float(crop_height) @@ -1890,7 +1871,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): bounding_box = constant_op.constant( [0.0, 0.0, 1.0, 1.0], shape=[4], - dtype=dtypes.float32,) + dtype=dtypes.float32, + ) begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box( 
image_size=image_size, bounding_boxes=bounding_box, @@ -1937,13 +1919,15 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase): class ResizeImagesTest(test_util.TensorFlowTestCase): - OPTIONS = [image_ops.ResizeMethod.BILINEAR, - image_ops.ResizeMethod.NEAREST_NEIGHBOR, - image_ops.ResizeMethod.BICUBIC, - image_ops.ResizeMethod.AREA] + OPTIONS = [ + image_ops.ResizeMethod.BILINEAR, image_ops.ResizeMethod.NEAREST_NEIGHBOR, + image_ops.ResizeMethod.BICUBIC, image_ops.ResizeMethod.AREA + ] - TYPES = [np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, - np.float16, np.float32, np.float64] + TYPES = [ + np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.float16, + np.float32, np.float64 + ] def _assertShapeInference(self, pre_shape, size, post_shape): # Try single image resize @@ -1971,12 +1955,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): single_shape = [6, 4, 1] # This test is also conducted with int8, so 127 is the maximum # value that can be used. - data = [127, 127, 64, 64, - 127, 127, 64, 64, - 64, 64, 127, 127, - 64, 64, 127, 127, - 50, 50, 100, 100, - 50, 50, 100, 100] + data = [ + 127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127, + 50, 50, 100, 100, 50, 50, 100, 100 + ] target_height = 6 target_width = 4 @@ -2007,12 +1989,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): single_shape = [6, 4, 1] # This test is also conducted with int8, so 127 is the maximum # value that can be used. - data = [127, 127, 64, 64, - 127, 127, 64, 64, - 64, 64, 127, 127, - 64, 64, 127, 127, - 50, 50, 100, 100, - 50, 50, 100, 100] + data = [ + 127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127, + 50, 50, 100, 100, 50, 50, 100, 100 + ] new_size = array_ops.placeholder(dtypes.int32, shape=(2)) img_np = np.array(data, dtype=np.uint8).reshape(img_shape) @@ -2066,8 +2046,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): image_ops.ResizeMethod.BILINEAR) def testReturnDtype(self): - target_shapes = [[6, 4], [3, 2], [array_ops.placeholder(dtypes.int32), - array_ops.placeholder(dtypes.int32)]] + target_shapes = [[6, 4], [3, 2], [ + array_ops.placeholder(dtypes.int32), + array_ops.placeholder(dtypes.int32) + ]] for nptype in self.TYPES: image = array_ops.placeholder(nptype, shape=[1, 6, 4, 1]) for opt in self.OPTIONS: @@ -2084,12 +2066,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): img_shape = [1, 6, 4, 1] # This test is also conducted with int8, so 127 is the maximum # value that can be used. - data = [127, 127, 64, 64, - 127, 127, 64, 64, - 64, 64, 127, 127, - 64, 64, 127, 127, - 50, 50, 100, 100, - 50, 50, 100, 100] + data = [ + 127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127, + 50, 50, 100, 100, 50, 50, 100, 100 + ] # Test size where width is specified as a tensor which is a sum # of two tensors. width_1 = constant_op.constant(1) @@ -2111,15 +2091,11 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeDown(self): # This test is also conducted with int8, so 127 is the maximum # value that can be used. 
- data = [127, 127, 64, 64, - 127, 127, 64, 64, - 64, 64, 127, 127, - 64, 64, 127, 127, - 50, 50, 100, 100, - 50, 50, 100, 100] - expected_data = [127, 64, - 64, 127, - 50, 100] + data = [ + 127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127, + 50, 50, 100, 100, 50, 50, 100, 100 + ] + expected_data = [127, 64, 64, 127, 50, 100] target_height = 3 target_width = 2 @@ -2145,39 +2121,31 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeUpAlignCornersFalse(self): img_shape = [1, 3, 2, 1] - data = [64, 32, - 32, 64, - 50, 100] + data = [64, 32, 32, 64, 50, 100] target_height = 6 target_width = 4 expected_data = {} expected_data[image_ops.ResizeMethod.BILINEAR] = [ - 64.0, 48.0, 32.0, 32.0, - 48.0, 48.0, 48.0, 48.0, - 32.0, 48.0, 64.0, 64.0, - 41.0, 61.5, 82.0, 82.0, - 50.0, 75.0, 100.0, 100.0, - 50.0, 75.0, 100.0, 100.0] + 64.0, 48.0, 32.0, 32.0, 48.0, 48.0, 48.0, 48.0, 32.0, 48.0, 64.0, 64.0, + 41.0, 61.5, 82.0, 82.0, 50.0, 75.0, 100.0, 100.0, 50.0, 75.0, 100.0, + 100.0 + ] expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [ - 64.0, 64.0, 32.0, 32.0, - 64.0, 64.0, 32.0, 32.0, - 32.0, 32.0, 64.0, 64.0, - 32.0, 32.0, 64.0, 64.0, - 50.0, 50.0, 100.0, 100.0, - 50.0, 50.0, 100.0, 100.0] + 64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0, + 32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0, + 100.0 + ] expected_data[image_ops.ResizeMethod.AREA] = [ - 64.0, 64.0, 32.0, 32.0, - 64.0, 64.0, 32.0, 32.0, - 32.0, 32.0, 64.0, 64.0, - 32.0, 32.0, 64.0, 64.0, - 50.0, 50.0, 100.0, 100.0, - 50.0, 50.0, 100.0, 100.0] + 64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0, + 32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0, + 100.0 + ] for nptype in self.TYPES: for opt in [ image_ops.ResizeMethod.BILINEAR, - image_ops.ResizeMethod.NEAREST_NEIGHBOR, - image_ops.ResizeMethod.AREA]: + image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA + ]: with self.test_session(use_gpu=True): img_np = np.array(data, dtype=nptype).reshape(img_shape) image = constant_op.constant(img_np, shape=img_shape) @@ -2190,41 +2158,29 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeUpAlignCornersTrue(self): img_shape = [1, 3, 2, 1] - data = [6, 3, - 3, 6, - 6, 9] + data = [6, 3, 3, 6, 6, 9] target_height = 5 target_width = 4 expected_data = {} expected_data[image_ops.ResizeMethod.BILINEAR] = [ - 6.0, 5.0, 4.0, 3.0, - 4.5, 4.5, 4.5, 4.5, - 3.0, 4.0, 5.0, 6.0, - 4.5, 5.5, 6.5, 7.5, - 6.0, 7.0, 8.0, 9.0 + 6.0, 5.0, 4.0, 3.0, 4.5, 4.5, 4.5, 4.5, 3.0, 4.0, 5.0, 6.0, 4.5, 5.5, + 6.5, 7.5, 6.0, 7.0, 8.0, 9.0 ] expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [ - 6.0, 6.0, 3.0, 3.0, - 3.0, 3.0, 6.0, 6.0, - 3.0, 3.0, 6.0, 6.0, - 6.0, 6.0, 9.0, 9.0, - 6.0, 6.0, 9.0, 9.0 + 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0, + 9.0, 9.0, 6.0, 6.0, 9.0, 9.0 ] # TODO(b/37749740): Improve alignment of ResizeMethod.AREA when # align_corners=True. 
expected_data[image_ops.ResizeMethod.AREA] = [ - 6.0, 6.0, 6.0, 3.0, - 6.0, 6.0, 6.0, 3.0, - 3.0, 3.0, 3.0, 6.0, - 3.0, 3.0, 3.0, 6.0, - 6.0, 6.0, 6.0, 9.0 + 6.0, 6.0, 6.0, 3.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 3.0, 3.0, + 3.0, 6.0, 6.0, 6.0, 6.0, 9.0 ] for nptype in self.TYPES: for opt in [ image_ops.ResizeMethod.BILINEAR, - image_ops.ResizeMethod.NEAREST_NEIGHBOR, - image_ops.ResizeMethod.AREA + image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA ]: with self.test_session(use_gpu=True): img_np = np.array(data, dtype=nptype).reshape(img_shape) @@ -2238,23 +2194,21 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeUpBicubic(self): img_shape = [1, 6, 6, 1] - data = [128, 128, 64, 64, 128, 128, 64, 64, - 64, 64, 128, 128, 64, 64, 128, 128, - 50, 50, 100, 100, 50, 50, 100, 100, - 50, 50, 100, 100, 50, 50, 100, 100, - 50, 50, 100, 100] + data = [ + 128, 128, 64, 64, 128, 128, 64, 64, 64, 64, 128, 128, 64, 64, 128, 128, + 50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100, + 50, 50, 100, 100 + ] img_np = np.array(data, dtype=np.uint8).reshape(img_shape) target_height = 8 target_width = 8 - expected_data = [128, 135, 96, 55, 64, 114, 134, 128, - 78, 81, 68, 52, 57, 118, 144, 136, - 55, 49, 79, 109, 103, 89, 83, 84, - 74, 70, 95, 122, 115, 69, 49, 55, - 100, 105, 75, 43, 50, 89, 105, 100, - 57, 54, 74, 96, 91, 65, 55, 58, - 70, 69, 75, 81, 80, 72, 69, 70, - 105, 112, 75, 36, 45, 92, 111, 105] + expected_data = [ + 128, 135, 96, 55, 64, 114, 134, 128, 78, 81, 68, 52, 57, 118, 144, 136, + 55, 49, 79, 109, 103, 89, 83, 84, 74, 70, 95, 122, 115, 69, 49, 55, 100, + 105, 75, 43, 50, 89, 105, 100, 57, 54, 74, 96, 91, 65, 55, 58, 70, 69, + 75, 81, 80, 72, 69, 70, 105, 112, 75, 36, 45, 92, 111, 105 + ] with self.test_session(use_gpu=True): image = constant_op.constant(img_np, shape=img_shape) @@ -2267,20 +2221,17 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testResizeDownArea(self): img_shape = [1, 6, 6, 1] - data = [128, 64, 32, 16, 8, 4, - 4, 8, 16, 32, 64, 128, - 128, 64, 32, 16, 8, 4, - 5, 10, 15, 20, 25, 30, - 30, 25, 20, 15, 10, 5, - 5, 10, 15, 20, 25, 30] + data = [ + 128, 64, 32, 16, 8, 4, 4, 8, 16, 32, 64, 128, 128, 64, 32, 16, 8, 4, 5, + 10, 15, 20, 25, 30, 30, 25, 20, 15, 10, 5, 5, 10, 15, 20, 25, 30 + ] img_np = np.array(data, dtype=np.uint8).reshape(img_shape) target_height = 4 target_width = 4 - expected_data = [73, 33, 23, 39, - 73, 33, 23, 39, - 14, 16, 19, 21, - 14, 16, 19, 21] + expected_data = [ + 73, 33, 23, 39, 73, 33, 23, 39, 14, 16, 19, 21, 14, 16, 19, 21 + ] with self.test_session(use_gpu=True): image = constant_op.constant(img_np, shape=img_shape) @@ -2367,7 +2318,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3]) y = image_ops.resize_images(single_image, [55, 66]) - self.assertTrue(y.op.name.startswith('resize_images')) + self.assertTrue(y.op.name.startswith("resize_images")) class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): @@ -2440,133 +2391,93 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): def testPad(self): # Pad even along col. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 1, 2, 3, 4, 0, - 0, 5, 6, 7, 8, 0] + y = [0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 8, 0] y_shape = [2, 6, 1] self._assertReturns(x, x_shape, y, y_shape) # Pad odd along col. 
- x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 1, 2, 3, 4, 0, 0, - 0, 5, 6, 7, 8, 0, 0] + y = [0, 1, 2, 3, 4, 0, 0, 0, 5, 6, 7, 8, 0, 0] y_shape = [2, 7, 1] self._assertReturns(x, x_shape, y, y_shape) # Pad even along row. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 0, 0, 0, - 1, 2, 3, 4, - 5, 6, 7, 8, - 0, 0, 0, 0] + y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0] y_shape = [4, 4, 1] self._assertReturns(x, x_shape, y, y_shape) # Pad odd along row. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 0, 0, 0, - 1, 2, 3, 4, - 5, 6, 7, 8, - 0, 0, 0, 0, - 0, 0, 0, 0] + y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0] y_shape = [5, 4, 1] self._assertReturns(x, x_shape, y, y_shape) def testCrop(self): # Crop even along col. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [2, 3, - 6, 7] + y = [2, 3, 6, 7] y_shape = [2, 2, 1] self._assertReturns(x, x_shape, y, y_shape) # Crop odd along col. - x = [1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 11, 12] + x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] x_shape = [2, 6, 1] - y = [2, 3, 4, - 8, 9, 10] + y = [2, 3, 4, 8, 9, 10] y_shape = [2, 3, 1] self._assertReturns(x, x_shape, y, y_shape) # Crop even along row. - x = [1, 2, - 3, 4, - 5, 6, - 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [4, 2, 1] - y = [3, 4, - 5, 6] + y = [3, 4, 5, 6] y_shape = [2, 2, 1] self._assertReturns(x, x_shape, y, y_shape) # Crop odd along row. - x = [1, 2, - 3, 4, - 5, 6, - 7, 8, - 9, 10, - 11, 12, - 13, 14, - 15, 16] + x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] x_shape = [8, 2, 1] - y = [3, 4, - 5, 6, - 7, 8, - 9, 10, - 11, 12] + y = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12] y_shape = [5, 2, 1] self._assertReturns(x, x_shape, y, y_shape) def testCropAndPad(self): # Pad along row but crop along col. - x = [1, 2, 3, 4, - 5, 6, 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [2, 4, 1] - y = [0, 0, - 2, 3, - 6, 7, - 0, 0] + y = [0, 0, 2, 3, 6, 7, 0, 0] y_shape = [4, 2, 1] self._assertReturns(x, x_shape, y, y_shape) # Crop along row but pad along col. 
- x = [1, 2, - 3, 4, - 5, 6, - 7, 8] + x = [1, 2, 3, 4, 5, 6, 7, 8] x_shape = [4, 2, 1] - y = [0, 3, 4, 0, - 0, 5, 6, 0] + y = [0, 3, 4, 0, 0, 5, 6, 0] y_shape = [2, 4, 1] self._assertReturns(x, x_shape, y, y_shape) @@ -2647,7 +2558,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): def testNameScope(self): image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3]) y = image_ops.resize_image_with_crop_or_pad(image, 55, 66) - self.assertTrue(y.op.name.startswith('resize_image_with_crop_or_pad')) + self.assertTrue(y.op.name.startswith("resize_image_with_crop_or_pad")) def _SimpleColorRamp(): @@ -2916,8 +2827,8 @@ class GifTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True) as sess: gif = io_ops.read_file(filename) image = image_ops.decode_gif(gif) - with self.assertRaisesRegexp( - errors.InvalidArgumentError, "can't process optimized gif"): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "can't process optimized gif"): gif, image = sess.run([gif, image]) def testValid(self): @@ -2979,8 +2890,9 @@ class ConvertImageTest(test_util.TensorFlowTestCase): y = image_ops.convert_image_dtype(image, output_dtype) self.assertTrue(y.dtype == output_dtype) self.assertAllClose(y.eval(), y_np, atol=1e-5) - if output_dtype in [dtypes.float32, dtypes.float64, - dtypes.int32, dtypes.int64]: + if output_dtype in [ + dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64 + ]: y_saturate = image_ops.convert_image_dtype( image, output_dtype, saturate=True) self.assertTrue(y_saturate.dtype == output_dtype) @@ -3000,8 +2912,8 @@ class ConvertImageTest(test_util.TensorFlowTestCase): with self.test_session(use_gpu=True): self._convert([0, 255], dtypes.uint8, dtypes.int16, [0, 255 * 128]) self._convert([0, 32767], dtypes.int16, dtypes.uint8, [0, 255]) - self._convert([0, 2 ** 32], dtypes.int64, dtypes.int32, [0, 1]) - self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2 ** 32]) + self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1]) + self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2**32]) def testConvertBetweenFloat(self): # Make sure converting to between float types does nothing interesting @@ -3022,20 +2934,14 @@ class ConvertImageTest(test_util.TensorFlowTestCase): def testConvertBetweenInt16AndInt8(self): with self.test_session(use_gpu=True): # uint8, uint16 - self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, - [0, 255]) - self._convert([0, 255], dtypes.uint8, dtypes.uint16, - [0, 255 * 256]) + self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, [0, 255]) + self._convert([0, 255], dtypes.uint8, dtypes.uint16, [0, 255 * 256]) # int8, uint16 - self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8, - [0, 127]) - self._convert([0, 127], dtypes.int8, dtypes.uint16, - [0, 127 * 2 * 256]) + self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8, [0, 127]) + self._convert([0, 127], dtypes.int8, dtypes.uint16, [0, 127 * 2 * 256]) # int16, uint16 - self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16, - [0, 255 * 128]) - self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16, - [0, 255 * 256]) + self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16, [0, 255 * 128]) + self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16, [0, 255 * 256]) class TotalVariationTest(test_util.TensorFlowTestCase): @@ -3168,20 +3074,17 @@ class TotalVariationTest(test_util.TensorFlowTestCase): # The following are the sum of absolute differences between the pixels. 
# sum row dif = (4-1) + (7-2) = 3 + 5 = 8 # sum col dif = (2-1) + (7-4) = 1 + 3 = 4 - r = [[1, 2], - [4, 7]] + r = [[1, 2], [4, 7]] # Blue color channel. # sum row dif = 18 + 29 = 47 # sum col dif = 7 + 18 = 25 - g = [[11, 18], - [29, 47]] + g = [[11, 18], [29, 47]] # Green color channel. # sum row dif = 120 + 193 = 313 # sum col dif = 47 + 120 = 167 - b = [[73, 120], - [193, 313]] + b = [[73, 120], [193, 313]] # Combine the 3 color channels into a single 3-dim array. # The shape is (2, 2, 3) corresponding to (height, width and color). @@ -3210,9 +3113,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase): # Combine these 3 images into a single array of shape (3, 2, 2, 3) # where the first dimension is for the image-number. - multi = np.vstack((a[np.newaxis, :], - b[np.newaxis, :], - c[np.newaxis, :])) + multi = np.vstack((a[np.newaxis, :], b[np.newaxis, :], c[np.newaxis, :])) # Check that TensorFlow correctly calculates the total variation # for each image individually and returns the correct array. @@ -3268,6 +3169,46 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase): boxes, scores, max_output_size, iou_threshold).eval() self.assertAllClose(selected_indices, [3, 0, 5]) + def testInvalidShape(self): + # The boxes should be 2D of shape [num_boxes, 4]. + with self.assertRaisesRegexp( + ValueError, 'Shape must be rank 2 but is rank 1'): + boxes = constant_op.constant([0.0, 0.0, 1.0, 1.0]) + scores = constant_op.constant([0.9]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, 3, 0.5) + + with self.assertRaisesRegexp( + ValueError, 'Dimension must be 4 but is 3'): + boxes = constant_op.constant([[0.0, 0.0, 1.0]]) + scores = constant_op.constant([0.9]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, 3, 0.5) + + # The scores should be 1D of shape [num_boxes]. + with self.assertRaisesRegexp( + ValueError, 'Shape must be rank 1 but is rank 2'): + boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]]) + scores = constant_op.constant([[0.9]]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, 3, 0.5) + + # The max_output_size should be a scaler (0-D). + with self.assertRaisesRegexp( + ValueError, 'Shape must be rank 0 but is rank 1'): + boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]]) + scores = constant_op.constant([0.9]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, [3], 0.5) + + # The iou_threshold should be a scaler (0-D). + with self.assertRaisesRegexp( + ValueError, 'Shape must be rank 0 but is rank 2'): + boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]]) + scores = constant_op.constant([0.9]) + selected_indices = image_ops.non_max_suppression( + boxes, scores, 3, [[0.5]]) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py index db33a08137e..a5096ffdd9c 100644 --- a/tensorflow/python/ops/linalg/linalg_impl.py +++ b/tensorflow/python/ops/linalg/linalg_impl.py @@ -65,8 +65,8 @@ def logdet(matrix, name=None): ``` Args: - matrix: A `Tensor`. Must be `float32`, `float64`, `complex64`, or - `complex128` with shape `[..., M, M]`. + matrix: A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`, + or `complex128` with shape `[..., M, M]`. name: A name to give this `Op`. Defaults to `logdet`. Returns: @@ -99,8 +99,8 @@ def adjoint(matrix, name=None): # [3 - 3j, 6 - 6j]] Args: - matrix: A `Tensor`. Must be `float32`, `float64`, `complex64`, or - `complex128` with shape `[..., M, M]`. 
+ matrix: A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`, + or `complex128` with shape `[..., M, M]`. name: A name to give this `Op` (optional). Returns: diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py index 27e0f17020a..8339c940af8 100644 --- a/tensorflow/python/ops/linalg/linear_operator.py +++ b/tensorflow/python/ops/linalg/linear_operator.py @@ -478,7 +478,6 @@ class LinearOperator(object): cond, self._max_condition_number_to_be_non_singular(), message="Singular matrix up to precision epsilon.") - raise NotImplementedError("assert_non_singular is not implemented.") def _max_condition_number_to_be_non_singular(self): """Return the maximum condition number that we consider nonsingular.""" diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py index a4724d030f3..2217bfd5459 100644 --- a/tensorflow/python/ops/linalg/linear_operator_diag.py +++ b/tensorflow/python/ops/linalg/linear_operator_diag.py @@ -121,8 +121,8 @@ class LinearOperatorDiag(linear_operator.LinearOperator): Args: diag: Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`. - The diagonal of the operator. Allowed dtypes: `float32`, `float64`, - `complex64`, `complex128`. + The diagonal of the operator. Allowed dtypes: `float16`, `float32`, + `float64`, `complex64`, `complex128`. is_non_singular: Expect that this operator is non-singular. is_self_adjoint: Expect that this operator is equal to its hermitian transpose. If `diag.dtype` is real, this is auto-set to `True`. @@ -167,7 +167,12 @@ class LinearOperatorDiag(linear_operator.LinearOperator): def _check_diag(self, diag): """Static check of diag.""" allowed_dtypes = [ - dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128] + dtypes.float16, + dtypes.float32, + dtypes.float64, + dtypes.complex64, + dtypes.complex128, + ] dtype = diag.dtype if dtype not in allowed_dtypes: diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py index dd4c7cb0413..8fb59ca1a7e 100644 --- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py +++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py @@ -114,7 +114,8 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): Args: matrix: Shape `[B1,...,Bb, M, N]` with `b >= 0`, `M, N >= 0`. - Allowed dtypes: `float32`, `float64`, `complex64`, `complex128`. + Allowed dtypes: `float16`, `float32`, `float64`, `complex64`, + `complex128`. is_non_singular: Expect that this operator is non-singular. is_self_adjoint: Expect that this operator is equal to its hermitian transpose. 
@@ -147,7 +148,12 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): def _check_matrix(self, matrix): """Static check of the `matrix` argument.""" allowed_dtypes = [ - dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128] + dtypes.float16, + dtypes.float32, + dtypes.float64, + dtypes.complex64, + dtypes.complex128, + ] matrix = ops.convert_to_tensor(matrix, name="matrix") diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py index ad3bb2efa94..36eed89db60 100644 --- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py +++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py @@ -150,8 +150,8 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator): `is_X` matrix property hints, which will trigger the appropriate code path. Args: - base_operator: Shape `[B1,...,Bb, M, N]` real `float32` or `float64` - `LinearOperator`. This is `L` above. + base_operator: Shape `[B1,...,Bb, M, N]` real `float16`, `float32` or + `float64` `LinearOperator`. This is `L` above. u: Shape `[B1,...,Bb, M, K]` `Tensor` of same `dtype` as `base_operator`. This is `U` above. diag_update: Optional shape `[B1,...,Bb, K]` `Tensor` with same `dtype` @@ -188,7 +188,11 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator): # because if diag has non-zero imaginary part, it will not be # self-adjoint positive definite. dtype = base_operator.dtype - allowed_dtypes = [dtypes.float32, dtypes.float64] + allowed_dtypes = [ + dtypes.float16, + dtypes.float32, + dtypes.float64, + ] if dtype not in allowed_dtypes: raise TypeError( "Argument matrix must have dtype in %s. Found: %s" diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py index 6ea55f0367b..6419030755f 100644 --- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py +++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py @@ -118,7 +118,8 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator): Args: tril: Shape `[B1,...,Bb, N, N]` with `b >= 0`, `N >= 0`. The lower triangular part of `tril` defines this operator. The strictly - upper triangle is ignored. Allowed dtypes: `float32`, `float64`. + upper triangle is ignored. Allowed dtypes: `float16`, `float32`, + `float64`. is_non_singular: Expect that this operator is non-singular. This operator is non-singular if and only if its diagonal elements are all non-zero. @@ -164,7 +165,11 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator): """Static check of the `tril` argument.""" # TODO(langmore) Add complex types once matrix_triangular_solve works for # them. 
- allowed_dtypes = [dtypes.float32, dtypes.float64] + allowed_dtypes = [ + dtypes.float16, + dtypes.float32, + dtypes.float64, + ] dtype = tril.dtype if dtype not in allowed_dtypes: raise TypeError( diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py index 72508eb4350..8b3c61b9339 100644 --- a/tensorflow/python/ops/losses/losses_impl.py +++ b/tensorflow/python/ops/losses/losses_impl.py @@ -28,8 +28,10 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.ops.losses import util from tensorflow.python.util.deprecation import deprecated_args +from tensorflow.python.util.tf_export import tf_export +@tf_export("losses.Reduction") class Reduction(object): """Types of loss reduction. @@ -149,9 +151,10 @@ def _num_present(losses, weights, per_batch=False): def _num_elements(losses): """Computes the number of elements in `losses` tensor.""" with ops.name_scope(None, "num_elements", values=[losses]) as scope: - return array_ops.size(losses, name=scope, out_type=losses.dtype) + return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype) +@tf_export("losses.compute_weighted_loss") def compute_weighted_loss( losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -211,6 +214,7 @@ def compute_weighted_loss( return loss +@tf_export("losses.absolute_difference") def absolute_difference( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -258,6 +262,7 @@ def absolute_difference( losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.cosine_distance") @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def cosine_distance( labels, predictions, axis=None, weights=1.0, scope=None, @@ -311,6 +316,7 @@ def cosine_distance( losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.hinge_loss") def hinge_loss(labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -352,6 +358,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None, losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.huber_loss") def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -420,6 +427,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.log_loss") def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -471,6 +479,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, # TODO(b/37208492): Add reduction arg. 
+@tf_export("losses.mean_pairwise_squared_error") def mean_pairwise_squared_error( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES): @@ -538,12 +547,13 @@ def mean_pairwise_squared_error( num_present_per_batch = _num_present(diffs, weights, per_batch=True) term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, - num_present_per_batch) + num_present_per_batch-1) sum_diff = math_ops.reduce_sum( diffs, reduction_indices=reduction_indices, keep_dims=True) - term2 = 2.0 * _safe_div(math_ops.square(sum_diff), - math_ops.square(num_present_per_batch)) + term2 = 2.0 * _safe_div( + math_ops.square(sum_diff), + math_ops.multiply(num_present_per_batch, num_present_per_batch-1)) weighted_losses = math_ops.multiply(term1 - term2, weights) loss = math_ops.reduce_sum(weighted_losses) @@ -557,6 +567,7 @@ def mean_pairwise_squared_error( return mean_loss +@tf_export("losses.mean_squared_error") def mean_squared_error( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -604,6 +615,7 @@ def mean_squared_error( losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.sigmoid_cross_entropy") def sigmoid_cross_entropy( multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -662,6 +674,7 @@ def sigmoid_cross_entropy( losses, weights, scope, loss_collection, reduction=reduction) +@tf_export("losses.softmax_cross_entropy") def softmax_cross_entropy( onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -771,6 +784,7 @@ def _remove_squeezable_dimensions( return labels, predictions, weights +@tf_export("losses.sparse_softmax_cross_entropy") def sparse_softmax_cross_entropy( labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py index 3718c481c26..b835d963869 100644 --- a/tensorflow/python/ops/losses/util.py +++ b/tensorflow/python/ops/losses/util.py @@ -30,8 +30,10 @@ from __future__ import print_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("losses.add_loss") def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES): """Adds a externally defined loss to the collection of losses. @@ -43,6 +45,7 @@ def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES): ops.add_to_collection(loss_collection, loss) +@tf_export("losses.get_losses") def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES): """Gets the list of losses from the loss_collection. @@ -56,6 +59,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES): return ops.get_collection(loss_collection, scope) +@tf_export("losses.get_regularization_losses") def get_regularization_losses(scope=None): """Gets the list of regularization losses. @@ -68,6 +72,7 @@ def get_regularization_losses(scope=None): return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope) +@tf_export("losses.get_regularization_loss") def get_regularization_loss(scope=None, name="total_regularization_loss"): """Gets the total regularization loss. 
@@ -85,6 +90,7 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"): return constant_op.constant(0.0) +@tf_export("losses.get_total_loss") def get_total_loss(add_regularization_losses=True, name="total_loss"): """Returns a tensor whose value represents the total loss. diff --git a/tensorflow/python/ops/manip_grad.py b/tensorflow/python/ops/manip_grad.py new file mode 100644 index 00000000000..573e8c0a0d4 --- /dev/null +++ b/tensorflow/python/ops/manip_grad.py @@ -0,0 +1,32 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Gradients for operators defined in manip_ops.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import manip_ops + + +@ops.RegisterGradient("Roll") +def _RollGrad(op, grad): + # The gradient is just the roll reversed + shift = op.inputs[1] + axis = op.inputs[2] + roll_grad = manip_ops.roll(grad, -shift, axis) + return roll_grad, None, None diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py new file mode 100644 index 00000000000..c5f39784f43 --- /dev/null +++ b/tensorflow/python/ops/manip_ops.py @@ -0,0 +1,36 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operators for manipulating tensors. 
+ +@@roll +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops +from tensorflow.python.util.all_util import remove_undocumented + +# pylint: disable=protected-access +def roll(input, shift, axis): + return _gen_manip_ops.roll(input, shift, axis) + +roll.__doc__ = _gen_manip_ops.roll.__doc__ +# pylint: enable=protected-access + +_allowed_symbols = ['roll'] + +remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py index bca4c665d27..53308484c42 100644 --- a/tensorflow/python/ops/math_grad.py +++ b/tensorflow/python/ops/math_grad.py @@ -40,15 +40,16 @@ def _SumGrad(op, grad): """Gradient for Sum.""" # Fast path for when reducing to a scalar and ndims is known: adds only # Reshape and Tile ops (and possibly a Shape). - if op.inputs[0].get_shape().ndims is not None: + input_0_shape = op.inputs[0]._shape_tuple() # pylint: disable=protected-access + if input_0_shape is not None: axes = tensor_util.constant_value(op.inputs[1]) if axes is not None: - rank = op.inputs[0].get_shape().ndims + rank = len(input_0_shape) if np.array_equal(axes, np.arange(rank)): # Reduce all dims. grad = array_ops.reshape(grad, [1] * rank) # If shape is not fully defined (but rank is), we use Shape. - if op.inputs[0].get_shape().is_fully_defined(): - input_shape = op.inputs[0].get_shape().as_list() + if None not in input_0_shape: + input_shape = input_0_shape else: input_shape = array_ops.shape(op.inputs[0]) return [array_ops.tile(grad, input_shape), None] @@ -96,9 +97,12 @@ def _MinGrad(op, grad): def _MeanGrad(op, grad): """Gradient for Mean.""" sum_grad = _SumGrad(op, grad)[0] - input_size = op.inputs[0].get_shape().num_elements() - output_size = op.outputs[0].get_shape().num_elements() - if input_size is not None and output_size is not None: + input_shape = op.inputs[0]._shape_tuple() # pylint: disable=protected-access + output_shape = op.outputs[0]._shape_tuple() # pylint: disable=protected-access + if (input_shape is not None and output_shape is not None and + None not in input_shape and None not in output_shape): + input_size = np.prod(input_shape) + output_size = np.prod(output_shape) factor = input_size // max(output_size, 1) factor = constant_op.constant(factor, dtype=sum_grad.dtype) else: @@ -106,7 +110,7 @@ def _MeanGrad(op, grad): output_shape = array_ops.shape(op.outputs[0]) factor = _safe_shape_div( math_ops.reduce_prod(input_shape), math_ops.reduce_prod(output_shape)) - return sum_grad / math_ops.cast(factor, sum_grad.dtype), None + return math_ops.truediv(sum_grad, math_ops.cast(factor, sum_grad.dtype)), None @ops.RegisterGradient("Prod") @@ -169,8 +173,7 @@ def _SegmentMeanGrad(op, grad): array_ops.shape(op.inputs[1]), array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1) ], 0) - ones = array_ops.fill(ones_shape, - constant_op.constant(1, dtype=grad.dtype)) + ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype)) scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1])) return array_ops.gather(scaled_grad, op.inputs[1]), None @@ -226,16 +229,19 @@ def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad): def _SegmentMinOrMaxGrad(op, grad, is_sorted): - """Gradient for SegmentMin and (unsorted) SegmentMax. 
They share similar code.""" - zeros = array_ops.zeros(array_ops.shape(op.inputs[0]), - dtype=op.inputs[0].dtype) + """Gradient for SegmentMin and (unsorted) SegmentMax. + + They share similar code. + """ + zeros = array_ops.zeros( + array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype) # Get the number of selected (minimum or maximum) elements in each segment. gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1]) is_selected = math_ops.equal(op.inputs[0], gathered_outputs) if is_sorted: - num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype), - op.inputs[1]) + num_selected = math_ops.segment_sum( + math_ops.cast(is_selected, grad.dtype), op.inputs[1]) else: num_selected = math_ops.unsorted_segment_sum( math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2]) @@ -330,7 +336,7 @@ def _SquareGrad(op, grad): # Added control dependencies to prevent 2*x from being computed too early. with ops.control_dependencies([grad]): x = math_ops.conj(x) - return grad * (2.0 * x) + return math_ops.multiply(grad, math_ops.multiply(x, 2.0)) @ops.RegisterGradient("Sqrt") @@ -532,8 +538,8 @@ def _IgammaGrad(op, grad): # and Gamma'(a) can grow large. partial_x = math_ops.exp(-x + (a - 1) * math_ops.log(x) - math_ops.lgamma(a)) # TODO(b/36815900): Mark None return values as NotImplemented - return (None, - array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx)) + return (None, array_ops.reshape( + math_ops.reduce_sum(partial_x * grad, rx), sx)) @ops.RegisterGradient("Igammac") @@ -559,15 +565,17 @@ def _BetaincGrad(op, grad): # Perform operations in log space before summing, because terms # can grow large. - log_beta = (gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b) - - gen_math_ops.lgamma(a + b)) - partial_x = math_ops.exp( - (b - 1) * math_ops.log(1 - x) + (a - 1) * math_ops.log(x) - log_beta) + log_beta = ( + gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b) - + gen_math_ops.lgamma(a + b)) + partial_x = math_ops.exp((b - 1) * math_ops.log(1 - x) + + (a - 1) * math_ops.log(x) - log_beta) # TODO(b/36815900): Mark None return values as NotImplemented - return (None, # da - None, # db - array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx)) + return ( + None, # da + None, # db + array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx)) @ops.RegisterGradient("Zeta") @@ -731,10 +739,8 @@ def _ShapesFullySpecifiedAndEqual(x, y, grad): y_shape = y._shape_tuple() grad_shape = grad._shape_tuple() # pylint: enable=protected-access - return (x_shape == y_shape and - x_shape == grad_shape and - x_shape is not None and - None not in x_shape) + return (x_shape == y_shape and x_shape == grad_shape and + x_shape is not None and None not in x_shape) @ops.RegisterGradient("Add") @@ -756,8 +762,12 @@ def _AddGrad(op, grad): @ops.RegisterGradient("Sub") def _SubGrad(op, grad): + """Gradient for Sub.""" x = op.inputs[0] y = op.inputs[1] + if (isinstance(grad, ops.Tensor) and + _ShapesFullySpecifiedAndEqual(x, y, grad)): + return grad, -grad sx = array_ops.shape(x) sy = array_ops.shape(y) # pylint: disable=protected-access @@ -848,10 +858,10 @@ def _RealDivGrad(op, grad): x = math_ops.conj(x) y = math_ops.conj(y) return (array_ops.reshape( - math_ops.reduce_sum(math_ops.realdiv(grad, y), rx), - sx), array_ops.reshape( - math_ops.reduce_sum(grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), - ry), sy)) + math_ops.reduce_sum(math_ops.realdiv(grad, y), rx), sx), + array_ops.reshape( + math_ops.reduce_sum( + grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), 
ry), sy)) @ops.RegisterGradient("Pow") @@ -946,8 +956,8 @@ def _SelectGrad(op, grad): c = op.inputs[0] x = op.inputs[1] zeros = array_ops.zeros_like(x) - return (None, array_ops.where(c, grad, zeros), - array_ops.where(c, zeros, grad)) + return (None, array_ops.where(c, grad, zeros), array_ops.where( + c, zeros, grad)) @ops.RegisterGradient("MatMul") @@ -1009,21 +1019,20 @@ def _SparseMatMulGrad(op, grad): dtype_a = op.inputs[0].dtype dtype_b = op.inputs[1].dtype if not t_a and not t_b: - return (_SparseMatMul( - grad, op.inputs[1], dtype_a, transpose_b=True), _SparseMatMul( - op.inputs[0], grad, dtype_b, transpose_a=True)) + return (_SparseMatMul(grad, op.inputs[1], dtype_a, transpose_b=True), + _SparseMatMul(op.inputs[0], grad, dtype_b, transpose_a=True)) elif not t_a and t_b: - return (_SparseMatMul(grad, op.inputs[1], dtype_a), _SparseMatMul( - grad, op.inputs[0], dtype_b, transpose_a=True)) + return (_SparseMatMul(grad, op.inputs[1], dtype_a), + _SparseMatMul(grad, op.inputs[0], dtype_b, transpose_a=True)) elif t_a and not t_b: - return (_SparseMatMul( - op.inputs[1], grad, dtype_a, transpose_b=True), + return (_SparseMatMul(op.inputs[1], grad, dtype_a, transpose_b=True), _SparseMatMul(op.inputs[0], grad, dtype_b)) elif t_a and t_b: return (_SparseMatMul( - op.inputs[1], grad, dtype_a, transpose_a=True, - transpose_b=True), _SparseMatMul( - grad, op.inputs[0], dtype_b, transpose_a=True, transpose_b=True)) + op.inputs[1], grad, dtype_a, transpose_a=True, transpose_b=True), + _SparseMatMul( + grad, op.inputs[0], dtype_b, transpose_a=True, + transpose_b=True)) @ops.RegisterGradient("Floor") @@ -1127,8 +1136,8 @@ def _ComplexAbsGrad(op, grad): """Returns the gradient of ComplexAbs.""" # TODO(b/27786104): The cast to complex could be removed once arithmetic # supports mixtures of complex64 and real values. - return (math_ops.complex(grad, array_ops.zeros_like(grad)) * - math_ops.sign(op.inputs[0])) + return (math_ops.complex(grad, array_ops.zeros_like(grad)) * math_ops.sign( + op.inputs[0])) @ops.RegisterGradient("Cast") @@ -1158,8 +1167,8 @@ def _CumsumGrad(op, grad): exclusive = op.get_attr("exclusive") reverse = op.get_attr("reverse") return [ - math_ops.cumsum( - grad, axis, exclusive=exclusive, reverse=not reverse), None + math_ops.cumsum(grad, axis, exclusive=exclusive, reverse=not reverse), + None ] diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index b8e8207bb24..9a8ac93de9d 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1841,12 +1841,11 @@ def reduce_logsumexp(input_tensor, reduce_sum( gen_math_ops.exp(input_tensor - my_max), axis, - keepdims=True, - reduction_indices=reduction_indices)) + my_max + keepdims=keepdims, + reduction_indices=reduction_indices)) if not keepdims: - if isinstance(axis, int): - axis = [axis] - result = array_ops.squeeze(result, axis) + my_max = array_ops.reshape(my_max, array_ops.shape(result)) + result += my_max return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result) diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py index f95cf08de1a..6e5fe74290a 100644 --- a/tensorflow/python/ops/matmul_benchmark.py +++ b/tensorflow/python/ops/matmul_benchmark.py @@ -95,8 +95,8 @@ class MatmulBenchmark(test.Benchmark): num_items = n * m * k * 2 throughput = num_items * num_iters / duration / 1e9 print('%s %s input_info:%s %d %.4fsec, %.4fGitems/s.' 
% - (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:' - + str(transpose_a) + '.tb:' + str(transpose_b), num_iters, + (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + + ',ta:' + str(transpose_a) + '.tb:' + str(transpose_b), num_iters, duration, throughput)) name_template = ('matmul_{device}_{dtype}_input_info_{inputinfo}') @@ -112,7 +112,8 @@ class MatmulBenchmark(test.Benchmark): return duration def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters): - self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype) + self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, + num_iters, dtype) def test_round(self, num_iters): dtypes = [np.float32, np.float64] @@ -124,8 +125,8 @@ class MatmulBenchmark(test.Benchmark): self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters) for n, m, k, (transpose_a, transpose_b) in itertools.product( - [200], [1, 8, 20], [10000], [(False, False), (True, False), (False, - True)]): + [200], [1, 8, 20], [10000], [(False, False), (True, False), + (False, True)]): self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters) for (n, m, k), (transpose_a, transpose_b) in itertools.product( diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py index 5a9c0a7a495..3df0c66ef9c 100644 --- a/tensorflow/python/ops/matmul_benchmark_test.py +++ b/tensorflow/python/ops/matmul_benchmark_test.py @@ -33,11 +33,11 @@ def BuildGraphTest(n, m, k, transpose_a, transpose_b, dtype): def Test(self): if not googletest.is_gpu_available(): - tf_logging.info("Skipping BuildGraphTest %s", (n, m, k, transpose_a, - transpose_b)) + tf_logging.info("Skipping BuildGraphTest %s", + (n, m, k, transpose_a, transpose_b)) return - tf_logging.info("Testing BuildGraphTest %s", (n, m, k, transpose_a, - transpose_b)) + tf_logging.info("Testing BuildGraphTest %s", + (n, m, k, transpose_a, transpose_b)) self._VerifyBuildGraph(n, m, k, transpose_a, transpose_b, dtype) return Test @@ -47,11 +47,11 @@ def RunGraphTest(n, m, k, transpose_a, transpose_b, dtype): def Test(self): if not googletest.is_gpu_available(): - tf_logging.info("Skipping RunGraphTest %s", (n, m, k, transpose_a, - transpose_b)) + tf_logging.info("Skipping RunGraphTest %s", + (n, m, k, transpose_a, transpose_b)) return - tf_logging.info("Testing RunGraphTest %s", (n, m, k, transpose_a, - transpose_b)) + tf_logging.info("Testing RunGraphTest %s", + (n, m, k, transpose_a, transpose_b)) self._VerifyRunGraph(n, m, k, transpose_a, transpose_b, dtype) return Test @@ -71,40 +71,41 @@ class MatmulBenchmarkTest(googletest.TestCase): def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype): graph = ops.Graph() with graph.as_default(): - matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b, - dtype) + matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, + transpose_a, transpose_b, dtype) gd = graph.as_graph_def() - dev=googletest.gpu_device_name() + dev = googletest.gpu_device_name() proto_expected = """ - node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" } - node { name: "random_uniform/sub" op: "Sub" input: 
"random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" } - node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" } - node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" } - node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" } - node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" } - node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" } - node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" } - node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" } - node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" } - node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \""""+ dev +"""\" } - node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" } - node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" } - node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" } + node { name: "random_uniform/shape" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform/min" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform/max" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \"""" + dev + """\" } + node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \"""" + dev + """\" } + node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \"""" + dev + """\" } + node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \"""" + dev + """\" } + node { name: "Variable" op: "VariableV2" device: \"""" + dev + """\" } + node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \"""" + dev + """\" } + node { name: "Variable/read" op: "Identity" input: "Variable" device: \"""" + dev + """\" } + node { name: "random_uniform_1/shape" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform_1/min" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform_1/max" op: "Const" device: \"""" + dev + """\" } + node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \"""" + dev + """\" } + node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \"""" + dev + """\" } + node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: 
"random_uniform_1/sub" device: \"""" + dev + """\" } + node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \"""" + dev + """\" } + node { name: "Variable_1" op: "VariableV2" device: \"""" + dev + """\" } + node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \"""" + dev + """\" } + node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \"""" + dev + """\" } + node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \"""" + dev + """\" } + node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \"""" + dev + """\" } """ self.assertProtoEquals(str(proto_expected), self._StripGraph(gd)) def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype): benchmark_instance = matmul_benchmark.MatmulBenchmark() - duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a, - transpose_b, 1, dtype) + duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, + k, transpose_a, transpose_b, 1, + dtype) self.assertTrue(duration > 1e-6) @@ -113,8 +114,8 @@ if __name__ == "__main__": index = 0 for _dtype in dtypes: for _n, _m, (_transpose_a, _transpose_b) in itertools.product( - [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), (False, - True)]): + [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), + (False, True)]): _k = _n setattr(MatmulBenchmarkTest, "testBuildGraph_" + str(index), BuildGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype)) diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py index 0593ed2cfa6..a08b836025d 100644 --- a/tensorflow/python/ops/nn_fused_batchnorm_test.py +++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py @@ -278,7 +278,8 @@ class BatchNormalizationTest(test.TestCase): epsilon = y.op.get_attr('epsilon') data_format = y.op.get_attr('data_format') grad_vals = sess.run([grad_x, grad_scale, grad_offset]) - grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format) + grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, + pop_var, epsilon, data_format) grad_internal_vals = sess.run(list(grad_internal)) for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals): self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 32b14f86b56..a691e281ee7 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -41,15 +41,19 @@ from tensorflow.python.ops.gen_nn_ops import * from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export - # Aliases for some automatically-generated names. local_response_normalization = gen_nn_ops.lrn # pylint: disable=protected-access -def _non_atrous_convolution(input, filter, padding, data_format=None, # pylint: disable=redefined-builtin - strides=None, name=None): +def _non_atrous_convolution( + input, + filter, + padding, + data_format=None, # pylint: disable=redefined-builtin + strides=None, + name=None): """Computes sums of N-D convolutions (actually cross correlation). It is required that 1 <= N <= 3. 
@@ -94,12 +98,13 @@ def _non_atrous_convolution(input, filter, padding, data_format=None, # pylint: input_shape = input.get_shape() filter = ops.convert_to_tensor(filter, name="filter") filter_shape = filter.get_shape() - op = _NonAtrousConvolution(input_shape, - filter_shape=filter_shape, - padding=padding, - data_format=data_format, - strides=strides, - name=scope) + op = _NonAtrousConvolution( + input_shape, + filter_shape=filter_shape, + padding=padding, + data_format=data_format, + strides=strides, + name=scope) return op(input, filter) @@ -119,11 +124,14 @@ class _NonAtrousConvolution(object): name: see _non_atrous_convolution. """ - def __init__(self, - input_shape, - filter_shape, # pylint: disable=redefined-builtin - padding, data_format=None, - strides=None, name=None): + def __init__( + self, + input_shape, + filter_shape, # pylint: disable=redefined-builtin + padding, + data_format=None, + strides=None, + name=None): filter_shape = filter_shape.with_rank(input_shape.ndims) self.padding = padding self.name = name @@ -137,8 +145,8 @@ class _NonAtrousConvolution(object): if strides is None: strides = [1] * conv_dims elif len(strides) != conv_dims: - raise ValueError("len(strides)=%d, but should be %d" % - (len(strides), conv_dims)) + raise ValueError("len(strides)=%d, but should be %d" % (len(strides), + conv_dims)) if conv_dims == 1: # conv1d uses the 2-d data format names if data_format is None or data_format == "NWC": @@ -177,8 +185,14 @@ class _NonAtrousConvolution(object): # those for gen_nn_ops.conv2d and gen_nn_ops.conv3d. # pylint: disable=redefined-builtin def _conv1d(self, input, filter, strides, padding, data_format, name): - return conv1d(value=input, filters=filter, stride=strides, padding=padding, - data_format=data_format, name=name) + return conv1d( + value=input, + filters=filter, + stride=strides, + padding=padding, + data_format=data_format, + name=name) + # pylint: enable=redefined-builtin def __call__(self, inp, filter): # pylint: disable=redefined-builtin @@ -340,13 +354,14 @@ def with_space_to_batch( def build_op(num_spatial_dims, padding): return lambda inp, _: op(inp, num_spatial_dims, padding) - new_op = _WithSpaceToBatch(input_shape, - dilation_rate, - padding, - build_op, - filter_shape=filter_shape, - spatial_dims=spatial_dims, - data_format=data_format) + new_op = _WithSpaceToBatch( + input_shape, + dilation_rate, + padding, + build_op, + filter_shape=filter_shape, + spatial_dims=spatial_dims, + data_format=data_format) return new_op(input, None) @@ -377,9 +392,8 @@ class _WithSpaceToBatch(object): spatial_dims=None, data_format=None): """Helper class for _with_space_to_batch.""" - dilation_rate = ops.convert_to_tensor(dilation_rate, - dtypes.int32, - name="dilation_rate") + dilation_rate = ops.convert_to_tensor( + dilation_rate, dtypes.int32, name="dilation_rate") try: rate_shape = dilation_rate.get_shape().with_rank(1) except ValueError: @@ -439,9 +453,7 @@ class _WithSpaceToBatch(object): if const_filter_shape is not None: filter_shape = const_filter_shape self.base_paddings = _with_space_to_batch_base_paddings( - const_filter_shape, - num_spatial_dims, - rate_or_const_rate) + const_filter_shape, num_spatial_dims, rate_or_const_rate) else: self.num_spatial_dims = num_spatial_dims self.rate_or_const_rate = rate_or_const_rate @@ -478,9 +490,7 @@ class _WithSpaceToBatch(object): # shape was not fully defined. 
filter_shape = array_ops.shape(filter) base_paddings = _with_space_to_batch_base_paddings( - filter_shape, - self.num_spatial_dims, - self.rate_or_const_rate) + filter_shape, self.num_spatial_dims, self.rate_or_const_rate) paddings, crops = array_ops.required_space_to_batch_paddings( input_shape=input_spatial_shape, base_paddings=base_paddings, @@ -491,9 +501,7 @@ class _WithSpaceToBatch(object): paddings = _with_space_to_batch_adjust(paddings, 0, spatial_dims) crops = _with_space_to_batch_adjust(crops, 0, spatial_dims) input_converted = array_ops.space_to_batch_nd( - input=inp, - block_shape=dilation_rate, - paddings=paddings) + input=inp, block_shape=dilation_rate, paddings=paddings) result = self.op(input_converted, filter) @@ -519,17 +527,17 @@ def _with_space_to_batch_base_paddings(filter_shape, num_spatial_dims, # Spatial dimensions of the filters and the upsampled filters in which we # introduce (rate - 1) zeros between consecutive filter values. filter_spatial_shape = filter_shape[:num_spatial_dims] - dilated_filter_spatial_shape = (filter_spatial_shape + - (filter_spatial_shape - 1) * - (rate_or_const_rate - 1)) + dilated_filter_spatial_shape = ( + filter_spatial_shape + (filter_spatial_shape - 1) * + (rate_or_const_rate - 1)) pad_extra_shape = dilated_filter_spatial_shape - 1 # When full_padding_shape is odd, we pad more at end, following the same # convention as conv2d. pad_extra_start = pad_extra_shape // 2 pad_extra_end = pad_extra_shape - pad_extra_start - base_paddings = array_ops.stack([[pad_extra_start[i], pad_extra_end[i]] - for i in range(num_spatial_dims)]) + base_paddings = array_ops.stack( + [[pad_extra_start[i], pad_extra_end[i]] for i in range(num_spatial_dims)]) return base_paddings @@ -623,8 +631,8 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate): if strides is None: strides = [1] * num_spatial_dims elif len(strides) != num_spatial_dims: - raise ValueError("len(strides)=%d but should be %d" % - (len(strides), num_spatial_dims)) + raise ValueError("len(strides)=%d but should be %d" % (len(strides), + num_spatial_dims)) strides = np.array(strides, dtype=np.int32) if np.any(strides < 1): raise ValueError("all values of strides must be positive") @@ -636,9 +644,14 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate): @tf_export("nn.convolution") -def convolution(input, filter, # pylint: disable=redefined-builtin - padding, strides=None, dilation_rate=None, - name=None, data_format=None): +def convolution( + input, + filter, # pylint: disable=redefined-builtin + padding, + strides=None, + dilation_rate=None, + name=None, + data_format=None): # pylint: disable=line-too-long """Computes sums of N-D convolutions (actually cross-correlation). 
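# Illustrative NumPy sketch (hypothetical shapes, mirroring the base-paddings
# arithmetic a few hunks above): a 3x3 filter at dilation rate 2 behaves like a
# 5x5 filter, so the extra padding is split between the start and end of each
# spatial dimension, padding more at the end when the amount is odd (the same
# convention as conv2d).
import numpy as np

filter_spatial_shape = np.array([3, 3])
rate = np.array([2, 2])

dilated = filter_spatial_shape + (filter_spatial_shape - 1) * (rate - 1)
pad_extra = dilated - 1
pad_start = pad_extra // 2
pad_end = pad_extra - pad_start

print(dilated)                                 # [5 5]
print(np.stack([pad_start, pad_end], axis=1))  # [[2 2] [2 2]]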
@@ -757,12 +770,14 @@ def convolution(input, filter, # pylint: disable=redefined-builtin input_shape = input.get_shape() filter = ops.convert_to_tensor(filter, name="filter") filter_shape = filter.get_shape() - op = Convolution(input_shape, - filter_shape, - padding, - strides=strides, - dilation_rate=dilation_rate, - name=name, data_format=data_format) + op = Convolution( + input_shape, + filter_shape, + padding, + strides=strides, + dilation_rate=dilation_rate, + name=name, + data_format=data_format) return op(input, filter) @@ -786,8 +801,11 @@ class Convolution(object): def __init__(self, input_shape, filter_shape, - padding, strides=None, dilation_rate=None, - name=None, data_format=None): + padding, + strides=None, + dilation_rate=None, + name=None, + data_format=None): """Helper function for convolution.""" num_total_dims = filter_shape.ndims if num_total_dims is None: @@ -809,17 +827,17 @@ class Convolution(object): if data_format is None or not data_format.startswith("NC"): input_channels_dim = input_shape[num_spatial_dims + 1] - spatial_dims = range(1, num_spatial_dims+1) + spatial_dims = range(1, num_spatial_dims + 1) else: input_channels_dim = input_shape[1] - spatial_dims = range(2, num_spatial_dims+2) + spatial_dims = range(2, num_spatial_dims + 2) - if not input_channels_dim.is_compatible_with(filter_shape[ - num_spatial_dims]): + if not input_channels_dim.is_compatible_with( + filter_shape[num_spatial_dims]): raise ValueError( "number of input channels does not match corresponding dimension of " - "filter, {} != {}".format(input_channels_dim, filter_shape[ - num_spatial_dims])) + "filter, {} != {}".format(input_channels_dim, + filter_shape[num_spatial_dims])) strides, dilation_rate = _get_strides_and_dilation_rate( num_spatial_dims, strides, dilation_rate) @@ -852,14 +870,15 @@ class Convolution(object): @tf_export("nn.pool") -def pool(input, # pylint: disable=redefined-builtin - window_shape, - pooling_type, - padding, - dilation_rate=None, - strides=None, - name=None, - data_format=None): +def pool( + input, # pylint: disable=redefined-builtin + window_shape, + pooling_type, + padding, + dilation_rate=None, + strides=None, + name=None, + data_format=None): # pylint: disable=line-too-long """Performs an N-D pooling operation. @@ -941,8 +960,8 @@ def pool(input, # pylint: disable=redefined-builtin """ # pylint: enable=line-too-long - with ops.name_scope(name, "%s_pool" % - (pooling_type.lower()), [input]) as scope: + with ops.name_scope(name, "%s_pool" % (pooling_type.lower()), + [input]) as scope: input = ops.convert_to_tensor(input, name="input") num_spatial_dims = len(window_shape) @@ -963,17 +982,18 @@ def pool(input, # pylint: disable=redefined-builtin "strides > window_shape not supported due to inconsistency between " "CPU and GPU implementations") - pooling_ops = {("MAX", 1): max_pool, - ("MAX", 2): max_pool, - ("MAX", 3): max_pool3d, # pylint: disable=undefined-variable - ("AVG", 1): avg_pool, - ("AVG", 2): avg_pool, - ("AVG", 3): avg_pool3d, # pylint: disable=undefined-variable - } + pooling_ops = { + ("MAX", 1): max_pool, + ("MAX", 2): max_pool, + ("MAX", 3): max_pool3d, # pylint: disable=undefined-variable + ("AVG", 1): avg_pool, + ("AVG", 2): avg_pool, + ("AVG", 3): avg_pool3d, # pylint: disable=undefined-variable + } op_key = (pooling_type, num_spatial_dims) if op_key not in pooling_ops: - raise ValueError("%d-D %s pooling is not supported." % - (op_key[1], op_key[0])) + raise ValueError("%d-D %s pooling is not supported." 
% (op_key[1], + op_key[0])) if data_format is None or not data_format.startswith("NC"): adjusted_window_shape = [1] + list(window_shape) + [1] @@ -1000,12 +1020,13 @@ def pool(input, # pylint: disable=redefined-builtin if num_spatial_dims == 1: converted_input = array_ops.expand_dims(converted_input, spatial_dims[0]) - result = pooling_ops[op_key](converted_input, - adjusted_window_shape, - adjusted_strides, - converted_padding, - name=scope, - **data_format_kwargs) + result = pooling_ops[op_key]( + converted_input, + adjusted_window_shape, + adjusted_strides, + converted_padding, + name=scope, + **data_format_kwargs) if num_spatial_dims == 1: result = array_ops.squeeze(result, [spatial_dims[0]]) return result @@ -1021,7 +1042,9 @@ def pool(input, # pylint: disable=redefined-builtin @tf_export("nn.atrous_conv2d") def atrous_conv2d(value, filters, rate, padding, name=None): - """Atrous convolution (a.k.a. convolution with holes or dilated convolution). + """Atrous convolution (a.k.a. + + convolution with holes or dilated convolution). This function is a simpler wrapper around the more general @{tf.nn.convolution}, and exists only for backwards compatibility. You can @@ -1065,7 +1088,8 @@ def atrous_conv2d(value, filters, rate, padding, name=None): that effectively use atrous convolution in different ways are, among others, [OverFeat: Integrated Recognition, Localization and Detection using Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image - Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700). + Scanning with Deep Max-Pooling Convolutional Neural + Networks](http://arxiv.org/abs/1302.1700). Atrous convolution is also closely related to the so-called noble identities in multi-rate signal processing. @@ -1156,13 +1180,14 @@ def atrous_conv2d(value, filters, rate, padding, name=None): @tf_export("nn.conv2d_transpose") -def conv2d_transpose(value, - filter, # pylint: disable=redefined-builtin - output_shape, - strides, - padding="SAME", - data_format="NHWC", - name=None): +def conv2d_transpose( + value, + filter, # pylint: disable=redefined-builtin + output_shape, + strides, + padding="SAME", + data_format="NHWC", + name=None): """The transpose of `conv2d`. This operation is sometimes called "deconvolution" after [Deconvolutional @@ -1207,15 +1232,16 @@ def conv2d_transpose(value, output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape") if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)): - raise ValueError("output_shape must have shape (4,), got {}" - .format(output_shape_.get_shape())) + raise ValueError("output_shape must have shape (4,), got {}".format( + output_shape_.get_shape())) if isinstance(output_shape, (list, np.ndarray)): # output_shape's shape should be == [4] if reached this point. 
if not filter.get_shape()[2].is_compatible_with(output_shape[axis]): raise ValueError( "output_shape does not match filter's output channels, " - "{} != {}".format(output_shape[axis], filter.get_shape()[2])) + "{} != {}".format(output_shape[axis], + filter.get_shape()[2])) if padding != "VALID" and padding != "SAME": raise ValueError("padding must be either VALID or SAME:" @@ -1281,29 +1307,32 @@ def atrous_conv2d_transpose(value, if not value.get_shape()[3].is_compatible_with(filters.get_shape()[3]): raise ValueError( "value's input channels does not match filters' input channels, " - "{} != {}".format(value.get_shape()[3], filters.get_shape()[3])) + "{} != {}".format(value.get_shape()[3], + filters.get_shape()[3])) if rate < 1: raise ValueError("rate {} cannot be less than one".format(rate)) if rate == 1: - return conv2d_transpose(value, - filters, - output_shape, - strides=[1, 1, 1, 1], - padding=padding, - data_format="NHWC") + return conv2d_transpose( + value, + filters, + output_shape, + strides=[1, 1, 1, 1], + padding=padding, + data_format="NHWC") output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape") if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)): - raise ValueError("output_shape must have shape (4,), got {}" - .format(output_shape_.get_shape())) + raise ValueError("output_shape must have shape (4,), got {}".format( + output_shape_.get_shape())) if isinstance(output_shape, (list, np.ndarray)): # output_shape's shape should be == [4] if reached this point. if not filters.get_shape()[2].is_compatible_with(output_shape[3]): raise ValueError( "output_shape does not match filter's output channels, " - "{} != {}".format(output_shape[3], filters.get_shape()[2])) + "{} != {}".format(output_shape[3], + filters.get_shape()[2])) # We have two padding contributions. The first is used for converting "SAME" # to "VALID". The second is required so that the height and width of the @@ -1352,14 +1381,13 @@ def atrous_conv2d_transpose(value, # component. space_to_batch_pad = [[0, pad_bottom_extra], [0, pad_right_extra]] - value = array_ops.space_to_batch(input=value, - paddings=space_to_batch_pad, - block_size=rate) + value = array_ops.space_to_batch( + input=value, paddings=space_to_batch_pad, block_size=rate) - input_sizes = [rate * rate * output_shape[0], - (in_height + pad_bottom_extra) // rate, - (in_width + pad_right_extra) // rate, - output_shape[3]] + input_sizes = [ + rate * rate * output_shape[0], (in_height + pad_bottom_extra) // rate, + (in_width + pad_right_extra) // rate, output_shape[3] + ] value = gen_nn_ops.conv2d_backprop_input( input_sizes=input_sizes, @@ -1373,19 +1401,19 @@ def atrous_conv2d_transpose(value, batch_to_space_crop = [[pad_top, pad_bottom + pad_bottom_extra], [pad_left, pad_right + pad_right_extra]] - return array_ops.batch_to_space(input=value, - crops=batch_to_space_crop, - block_size=rate) + return array_ops.batch_to_space( + input=value, crops=batch_to_space_crop, block_size=rate) @tf_export("nn.conv3d_transpose") -def conv3d_transpose(value, - filter, # pylint: disable=redefined-builtin - output_shape, - strides, - padding="SAME", - data_format="NDHWC", - name=None): +def conv3d_transpose( + value, + filter, # pylint: disable=redefined-builtin + output_shape, + strides, + padding="SAME", + data_format="NDHWC", + name=None): """The transpose of `conv3d`. 
This operation is sometimes called "deconvolution" after [Deconvolutional @@ -1428,27 +1456,29 @@ def conv3d_transpose(value, output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape") if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)): - raise ValueError("output_shape must have shape (5,), got {}" - .format(output_shape_.get_shape())) + raise ValueError("output_shape must have shape (5,), got {}".format( + output_shape_.get_shape())) if isinstance(output_shape, (list, np.ndarray)): # output_shape's shape should be == [5] if reached this point. if not filter.get_shape()[3].is_compatible_with(output_shape[4]): raise ValueError( "output_shape does not match filter's output channels, " - "{} != {}".format(output_shape[4], filter.get_shape()[3])) + "{} != {}".format(output_shape[4], + filter.get_shape()[3])) if padding != "VALID" and padding != "SAME": raise ValueError("padding must be either VALID or SAME:" " {}".format(padding)) - return gen_nn_ops.conv3d_backprop_input_v2(input_sizes=output_shape_, - filter=filter, - out_backprop=value, - strides=strides, - padding=padding, - data_format=data_format, - name=name) + return gen_nn_ops.conv3d_backprop_input_v2( + input_sizes=output_shape_, + filter=filter, + out_backprop=value, + strides=strides, + padding=padding, + data_format=data_format, + name=name) # pylint: disable=protected-access @@ -1514,7 +1544,9 @@ def crelu(features, name=None, axis=-1): Concatenates a ReLU which selects only the positive part of the activation with a ReLU which selects only the *negative* part of the activation. Note that as a result this non-linearity doubles the depth of the activations. - Source: [Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units. W. Shang, et al.](https://arxiv.org/abs/1603.05201) + Source: [Understanding and Improving Convolutional Neural Networks via + Concatenated Rectified Linear Units. W. Shang, et + al.](https://arxiv.org/abs/1603.05201) Args: features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`, @@ -1534,7 +1566,9 @@ def crelu(features, name=None, axis=-1): @tf_export("nn.relu6") def relu6(features, name=None): """Computes Rectified Linear 6: `min(max(features, 0), 6)`. - Source: [Convolutional Deep Belief Networks on CIFAR-10. A. Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf) + + Source: [Convolutional Deep Belief Networks on CIFAR-10. A. + Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf) Args: features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`, @@ -1622,14 +1656,16 @@ def _softmax(logits, compute_op, dim=-1, name=None): InvalidArgumentError: if `logits` is empty or `dim` is beyond the last dimension of `logits`. 
""" + def _swap_axis(logits, dim_index, last_index, name=None): """Swaps logits's dim_index and last_index.""" - return array_ops.transpose(logits, - array_ops.concat([ - math_ops.range(dim_index), [last_index], - math_ops.range(dim_index + 1, last_index), - [dim_index] - ], 0), name=name) + return array_ops.transpose( + logits, + array_ops.concat([ + math_ops.range(dim_index), [last_index], + math_ops.range(dim_index + 1, last_index), [dim_index] + ], 0), + name=name) logits = ops.convert_to_tensor(logits) @@ -1746,9 +1782,12 @@ def _ensure_xent_args(name, sentinel, labels, logits): @tf_export("nn.softmax_cross_entropy_with_logits_v2") -def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=invalid-name - labels=None, logits=None, - dim=-1, name=None): +def softmax_cross_entropy_with_logits_v2( + _sentinel=None, # pylint: disable=invalid-name + labels=None, + logits=None, + dim=-1, + name=None): """Computes softmax cross entropy between `logits` and `labels`. Measures the probability error in discrete classification tasks in which the @@ -1790,19 +1829,19 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=inva A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the softmax cross entropy loss. """ - _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, - labels, logits) + _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels, + logits) # TODO(pcmurray) Raise an error when the labels do not sum to 1. Note: This # could break users who call this with bad labels, but disregard the bad # results. - with ops.name_scope( - name, "softmax_cross_entropy_with_logits", [logits, labels]) as name: + with ops.name_scope(name, "softmax_cross_entropy_with_logits", + [logits, labels]) as name: logits = ops.convert_to_tensor(logits, name="logits") labels = ops.convert_to_tensor(labels, name="labels") - precise_logits = math_ops.cast(logits, dtypes.float32) if ( - logits.dtype == dtypes.float16) else logits + precise_logits = math_ops.cast( + logits, dtypes.float32) if (logits.dtype == dtypes.float16) else logits # labels and logits must be of the same type labels = math_ops.cast(labels, precise_logits.dtype) input_rank = array_ops.rank(precise_logits) @@ -1811,13 +1850,14 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None, # pylint: disable=inva # Move the dim to the end if dim is not the last dimension. if dim is not -1: + def _move_dim_to_end(tensor, dim_index, rank): - return array_ops.transpose(tensor, - array_ops.concat([ - math_ops.range(dim_index), - math_ops.range(dim_index + 1, rank), - [dim_index] - ], 0)) + return array_ops.transpose( + tensor, + array_ops.concat([ + math_ops.range(dim_index), + math_ops.range(dim_index + 1, rank), [dim_index] + ], 0)) precise_logits = _move_dim_to_end(precise_logits, dim, input_rank) labels = _move_dim_to_end(labels, dim, input_rank) @@ -1862,9 +1902,12 @@ See tf.nn.softmax_cross_entropy_with_logits_v2. @tf_export("nn.softmax_cross_entropy_with_logits") @deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION) -def softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid-name - labels=None, logits=None, - dim=-1, name=None): +def softmax_cross_entropy_with_logits( + _sentinel=None, # pylint: disable=invalid-name + labels=None, + logits=None, + dim=-1, + name=None): """Computes softmax cross entropy between `logits` and `labels`. 
Measures the probability error in discrete classification tasks in which the @@ -1906,11 +1949,11 @@ def softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the softmax cross entropy loss. """ - _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, - labels, logits) + _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels, + logits) - with ops.name_scope( - name, "softmax_cross_entropy_with_logits_sg", [logits, labels]) as name: + with ops.name_scope(name, "softmax_cross_entropy_with_logits_sg", + [logits, labels]) as name: labels = array_ops.stop_gradient(labels, name="labels_stop_gradient") return softmax_cross_entropy_with_logits_v2( @@ -1918,9 +1961,11 @@ def softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid @tf_export("nn.sparse_softmax_cross_entropy_with_logits") -def sparse_softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable=invalid-name - labels=None, logits=None, - name=None): +def sparse_softmax_cross_entropy_with_logits( + _sentinel=None, # pylint: disable=invalid-name + labels=None, + logits=None, + name=None): """Computes sparse softmax cross entropy between `logits` and `labels`. Measures the probability error in discrete classification tasks in which the @@ -1976,15 +2021,15 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None, # pylint: disable= [labels, logits]): labels = ops.convert_to_tensor(labels) logits = ops.convert_to_tensor(logits) - precise_logits = math_ops.cast(logits, dtypes.float32) if ( - dtypes.as_dtype(logits.dtype) == dtypes.float16) else logits + precise_logits = math_ops.cast(logits, dtypes.float32) if (dtypes.as_dtype( + logits.dtype) == dtypes.float16) else logits # Store label shape for result later. labels_static_shape = labels.get_shape() labels_shape = array_ops.shape(labels) if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0: - raise ValueError("Logits cannot be scalars - received shape %s." % - logits.get_shape()) + raise ValueError( + "Logits cannot be scalars - received shape %s." 
% logits.get_shape()) if logits.get_shape().ndims is not None and ( labels_static_shape.ndims is not None and labels_static_shape.ndims != logits.get_shape().ndims - 1): @@ -2041,12 +2086,13 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None): """ with ops.name_scope(name, "AvgPool", [value]) as name: value = ops.convert_to_tensor(value, name="input") - return gen_nn_ops._avg_pool(value, - ksize=ksize, - strides=strides, - padding=padding, - data_format=data_format, - name=name) + return gen_nn_ops._avg_pool( + value, + ksize=ksize, + strides=strides, + padding=padding, + data_format=data_format, + name=name) @tf_export("nn.max_pool") @@ -2083,8 +2129,8 @@ def _calc_conv_flops(graph, node): """Calculates the compute resources needed for Conv2D.""" input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0]) input_shape.assert_is_fully_defined() - filter_shape = graph_util.tensor_shape_from_node_def_name(graph, - node.input[1]) + filter_shape = graph_util.tensor_shape_from_node_def_name( + graph, node.input[1]) filter_shape.assert_is_fully_defined() output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name) output_shape.assert_is_fully_defined() @@ -2092,8 +2138,9 @@ def _calc_conv_flops(graph, node): filter_width = int(filter_shape[1]) filter_in_depth = int(filter_shape[2]) output_count = np.prod(output_shape.as_list()) - return ops.OpStats("flops", (output_count * filter_in_depth * filter_height * - filter_width * 2)) + return ops.OpStats( + "flops", + (output_count * filter_in_depth * filter_height * filter_width * 2)) @ops.RegisterStatistics("DepthwiseConv2dNative", "flops") @@ -2101,8 +2148,8 @@ def _calc_depthwise_conv_flops(graph, node): """Calculates the compute resources needed for DepthwiseConv2dNative.""" input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0]) input_shape.assert_is_fully_defined() - filter_shape = graph_util.tensor_shape_from_node_def_name(graph, - node.input[1]) + filter_shape = graph_util.tensor_shape_from_node_def_name( + graph, node.input[1]) filter_shape.assert_is_fully_defined() output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name) output_shape.assert_is_fully_defined() @@ -2210,9 +2257,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: di if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1: raise ValueError("keep_prob must be a scalar tensor or a float in the " "range (0, 1], got %g" % keep_prob) - keep_prob = ops.convert_to_tensor(keep_prob, - dtype=x.dtype, - name="keep_prob") + keep_prob = ops.convert_to_tensor( + keep_prob, dtype=x.dtype, name="keep_prob") keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar()) # Do nothing if we know keep_prob == 1 @@ -2222,9 +2268,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: di noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x) # uniform [keep_prob, 1.0 + keep_prob) random_tensor = keep_prob - random_tensor += random_ops.random_uniform(noise_shape, - seed=seed, - dtype=x.dtype) + random_tensor += random_ops.random_uniform( + noise_shape, seed=seed, dtype=x.dtype) # 0. if [keep_prob, 1.0) and 1. 
if [1.0, 1.0 + keep_prob) binary_tensor = math_ops.floor(random_tensor) ret = math_ops.div(x, keep_prob) * binary_tensor @@ -2293,13 +2338,21 @@ def nth_element(input, n, reverse=False, name=None): @tf_export("nn.conv1d") @deprecation.deprecated_arg_values( - None, "`NCHW` for data_format is deprecated, use `NCW` instead", - warn_once=True, data_format="NCHW") + None, + "`NCHW` for data_format is deprecated, use `NCW` instead", + warn_once=True, + data_format="NCHW") @deprecation.deprecated_arg_values( - None, "`NHWC` for data_format is deprecated, use `NWC` instead", - warn_once=True, data_format="NHWC") -def conv1d(value, filters, stride, padding, - use_cudnn_on_gpu=None, data_format=None, + None, + "`NHWC` for data_format is deprecated, use `NWC` instead", + warn_once=True, + data_format="NHWC") +def conv1d(value, + filters, + stride, + padding, + use_cudnn_on_gpu=None, + data_format=None, name=None): r"""Computes a 1-D convolution given 3-D input and filter tensors. @@ -2358,9 +2411,13 @@ def conv1d(value, filters, stride, padding, raise ValueError("data_format must be \"NWC\" or \"NCW\".") value = array_ops.expand_dims(value, spatial_start_dim) filters = array_ops.expand_dims(filters, 0) - result = gen_nn_ops.conv2d(value, filters, strides, padding, - use_cudnn_on_gpu=use_cudnn_on_gpu, - data_format=data_format) + result = gen_nn_ops.conv2d( + value, + filters, + strides, + padding, + use_cudnn_on_gpu=use_cudnn_on_gpu, + data_format=data_format) return array_ops.squeeze(result, [spatial_start_dim]) @@ -2466,8 +2523,8 @@ def _calc_dilation2d_flops(graph, node): """Calculates the compute resources needed for Dilation2D.""" input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0]) input_shape.assert_is_fully_defined() - filter_shape = graph_util.tensor_shape_from_node_def_name(graph, - node.input[1]) + filter_shape = graph_util.tensor_shape_from_node_def_name( + graph, node.input[1]) filter_shape.assert_is_fully_defined() output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name) output_shape.assert_is_fully_defined() @@ -2527,12 +2584,13 @@ def erosion2d(value, kernel, strides, rates, padding, name=None): with ops.name_scope(name, "erosion2d", [value, kernel]) as name: # Reduce erosion to dilation by duality. return math_ops.negative( - gen_nn_ops.dilation2d(input=math_ops.negative(value), - filter=array_ops.reverse_v2(kernel, [0, 1]), - strides=strides, - rates=rates, - padding=padding, - name=name)) + gen_nn_ops.dilation2d( + input=math_ops.negative(value), + filter=array_ops.reverse_v2(kernel, [0, 1]), + strides=strides, + rates=rates, + padding=padding, + name=name)) @tf_export("nn.in_top_k") @@ -2565,5 +2623,5 @@ def in_top_k(predictions, targets, k, name=None): Returns: A `Tensor` of type `bool`. Computed Precision at `k` as a `bool Tensor`. 
""" - with ops.name_scope(name, 'in_top_k'): + with ops.name_scope(name, "in_top_k"): return gen_nn_ops._in_top_kv2(predictions, targets, k, name=name) diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py index 5e9e7100270..4ac2a8f634b 100644 --- a/tensorflow/python/ops/quantized_conv_ops_test.py +++ b/tensorflow/python/ops/quantized_conv_ops_test.py @@ -93,7 +93,8 @@ class Conv2DTest(test.TestCase): quantized_range = ((quantized_max - quantized_min) * range_adjust) range_scale = (quantized_range / number_of_steps) lowest_quantized = -(1 << (number_of_bits - 1)) - result = np.array([(quantized_min + ((float(x) - lowest_quantized) * range_scale)) + result = np.array([(quantized_min + + ((float(x) - lowest_quantized) * range_scale)) for x in quantized.flatten()]) return result diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py index 4bf3b35e138..d590bc4be6d 100644 --- a/tensorflow/python/ops/quantized_ops_test.py +++ b/tensorflow/python/ops/quantized_ops_test.py @@ -34,7 +34,10 @@ class QuantizedOpsTest(test.TestCase): def testQuantizeOp(self): expected_output = [1, 1, 2, 127, 255, 255] with self.test_session(use_gpu=False) as sess: - x = constant_op.constant([1.0, 1.25, 1.75, 127.0, 255.0, 500.0], shape=[6], dtype=dtypes.float32) + x = constant_op.constant( + [1.0, 1.25, 1.75, 127.0, 255.0, 500.0], + shape=[6], + dtype=dtypes.float32) x_min = 0.0 x_max = 255.0 op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST") diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index bdf41cd75d6..cc9f7981e41 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -348,9 +348,9 @@ class ResourceVariable(variables.Variable): if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES] self._save_slice_info = None - # Save the graph's container prefix for error checking. Reading the value of - # the ResourceVariable from another Graph in Eager mode is an error. - self._container_prefix = ops.get_default_graph()._container_prefix # pylint: disable=protected-access + # Store the graph key so optimizers know how to only retrieve variables from + # this graph. + self._graph_key = ops.get_default_graph()._graph_key # pylint: disable=protected-access with ops.init_scope(): self._in_graph_mode = context.in_graph_mode() with ops.name_scope(name, "Variable", [] @@ -662,15 +662,7 @@ class ResourceVariable(variables.Variable): Returns: the read operation. - Raises: - ValueError: if the ResourceVariable was created in another isolation - environment or graph. """ - if (not self._in_graph_mode and - self._container_prefix != ops.get_default_graph()._container_prefix): # pylint: disable=protected-access - raise ValueError( - "Attempted to read a variable from another isolation environment" - " or Graph") with ops.name_scope("Read"): # Ensure we read the variable in the same device as the handle. 
with ops.device(self._handle_device): diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py index a10e1963d1f..da80e72071c 100644 --- a/tensorflow/python/ops/rnn.py +++ b/tensorflow/python/ops/rnn.py @@ -171,11 +171,11 @@ def _rnn_step( return (final_output, final_state) Args: - time: Python int, the current time step - sequence_length: int32 `Tensor` vector of size [batch_size] - min_sequence_length: int32 `Tensor` scalar, min of sequence_length - max_sequence_length: int32 `Tensor` scalar, max of sequence_length - zero_output: `Tensor` vector of shape [output_size] + time: int32 `Tensor` scalar. + sequence_length: int32 `Tensor` vector of size [batch_size]. + min_sequence_length: int32 `Tensor` scalar, min of sequence_length. + max_sequence_length: int32 `Tensor` scalar, max of sequence_length. + zero_output: `Tensor` vector of shape [output_size]. state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`, or a list/tuple of such tensors. call_cell: lambda returning tuple of (new_output, new_state) where @@ -202,6 +202,9 @@ def _rnn_step( flat_state = nest.flatten(state) flat_zero_output = nest.flatten(zero_output) + # Vector describing which batch entries are finished. + copy_cond = time >= sequence_length + def _copy_one_through(output, new_output): # TensorArray and scalar get passed through. if isinstance(output, tensor_array_ops.TensorArray): @@ -209,7 +212,6 @@ def _rnn_step( if output.shape.ndims == 0: return new_output # Otherwise propagate the old or the new value. - copy_cond = (time >= sequence_length) with ops.colocate_with(new_output): return array_ops.where(copy_cond, output, new_output) @@ -1125,6 +1127,12 @@ def raw_rnn(cell, loop_fn, def _copy_some_through(current, candidate): """Copy some tensors through via array_ops.where.""" def copy_fn(cur_i, cand_i): + # TensorArray and scalar get passed through. + if isinstance(cur_i, tensor_array_ops.TensorArray): + return cand_i + if cur_i.shape.ndims == 0: + return cand_i + # Otherwise propagate the old or the new value. with ops.colocate_with(cand_i): return array_ops.where(elements_finished, cur_i, cand_i) return nest.map_structure(copy_fn, current, candidate) diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index 4b5072fd679..1b9071ee93c 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -50,19 +50,21 @@ class EagerFunc(object): self._func = func self._out_dtypes = Tout - def __call__(self, *args, **kwargs): - """Passes args, kwargs to `self._func`, which is executed eagerly.""" + def __call__(self, on_gpu, args): + """Passes `args` to `self._func`, which is executed eagerly.""" with context.eager_mode(): - ret = self._func(*args, **kwargs) + ret = self._func(*args) + maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu() if isinstance(ret, (tuple, list)): return [ - ops.convert_to_tensor(x, dtype=dtype) + maybe_copy_to_gpu(ops.convert_to_tensor(x, dtype=dtype)) for (x, dtype) in zip(ret, self._out_dtypes) ] elif ret is None: return ret else: - return ops.convert_to_tensor(ret, dtype=self._out_dtypes[0]) + return maybe_copy_to_gpu( + ops.convert_to_tensor(ret, dtype=self._out_dtypes[0])) class FuncRegistry(object): @@ -116,16 +118,29 @@ class FuncRegistry(object): else: return result - def __call__(self, token, args): - """Calls the registered function for `token` with args.""" + def __call__(self, token, on_gpu, args): + """Calls the registered function for `token` with args. 
+ + Args: + token: A key into this `FuncRegistry` identifying which function to call. + on_gpu: A boolean indicating whether or not `token`'s corresponding + operation was placed on GPU; only used if the function registered for + `token` is an `EagerPyFunc`. + args: The arguments to pass to the function registered for `token`. + + Returns: + The output of the function registered for `token`. + + Raises: + ValueError: if no function is registered for `token`. + """ func = self._funcs[token] if func is None: raise ValueError("callback %s is not found" % token) - ret = func(*args) - if isinstance(func, EagerFunc): - return ret + return func(on_gpu, args) else: + ret = func(*args) # Strings seem to lead to a memory leak here if they're not wrapped in a # list. if isinstance(ret, six.binary_type): @@ -302,8 +317,5 @@ def py_func(func, inp, Tout, stateful=True, name=None): func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name) -# TODO(akshayka): PyFuncs where the 'eager' attribute is set to True should be -# differentiable, i.e., the gradient of PyFunc should propagate Nones if the -# eager attribute is not set, and otherwise, it should return the gradient. ops.NotDifferentiable("PyFunc") ops.NotDifferentiable("PyFuncStateless") diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 3224856d7be..0fbbf5a805f 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -227,13 +227,14 @@ def sparse_concat(axis, [array_ops.reshape(shape, [1, -1]) for shape in shapes], 0), 0) shapes = [ array_ops.concat([ - max_shape[:axis], shape[-1:] if axis == -1 else - shape[axis:axis + 1], [] if axis == -1 else max_shape[axis + 1:] + max_shape[:axis], shape[-1:] + if axis == -1 else shape[axis:axis + 1], [] + if axis == -1 else max_shape[axis + 1:] ], 0) for shape in shapes ] - output_ind, output_val, output_shape = (gen_sparse_ops._sparse_concat( - inds, vals, shapes, axis, name=name)) + output_ind, output_val, output_shape = ( + gen_sparse_ops._sparse_concat(inds, vals, shapes, axis, name=name)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) @@ -300,15 +301,14 @@ def sparse_add(a, b, thresh=0): b = _convert_to_sparse_tensor(b) thresh = ops.convert_to_tensor( thresh, dtype=a.values.dtype.real_dtype.base_dtype, name="thresh") - output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add( - a.indices, a.values, a.dense_shape, - b.indices, b.values, b.dense_shape, - thresh)) + output_ind, output_val, output_shape = ( + gen_sparse_ops._sparse_add(a.indices, a.values, a.dense_shape, + b.indices, b.values, b.dense_shape, thresh)) # Attempt to get output_shape statically. a.get_shape().assert_is_compatible_with(b.get_shape()) - static_shape = array_ops.broadcast_static_shape( - a.get_shape(), b.get_shape()) + static_shape = array_ops.broadcast_static_shape(a.get_shape(), + b.get_shape()) if static_shape.is_fully_defined(): output_shape = static_shape.as_list() @@ -317,8 +317,8 @@ def sparse_add(a, b, thresh=0): # swap to make `a` the SparseTensor. 
if isinstance(b, sparse_classes): a, b = b, a - return gen_sparse_ops._sparse_tensor_dense_add( - a.indices, a.values, a.dense_shape, b) + return gen_sparse_ops._sparse_tensor_dense_add(a.indices, a.values, + a.dense_shape, b) def _sparse_cross(inputs, name=None): @@ -397,19 +397,25 @@ def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None): _DEFAULT_HASH_KEY = 0xDECAFCAFFE -def _sparse_cross_internal( - inputs, hashed_output=False, num_buckets=0, hash_key=None, name=None): +def _sparse_cross_internal(inputs, + hashed_output=False, + num_buckets=0, + hash_key=None, + name=None): """See gen_sparse_ops._sparse_cross.""" if not isinstance(inputs, list): raise TypeError("Inputs must be a list") - if not all(isinstance(i, sparse_tensor.SparseTensor) or - isinstance(i, ops.Tensor) for i in inputs): + if not all( + isinstance(i, sparse_tensor.SparseTensor) or isinstance(i, ops.Tensor) + for i in inputs): raise TypeError("All inputs must be SparseTensors") - sparse_inputs = [i for i in inputs - if isinstance(i, sparse_tensor.SparseTensor)] - dense_inputs = [i for i in inputs - if not isinstance(i, sparse_tensor.SparseTensor)] + sparse_inputs = [ + i for i in inputs if isinstance(i, sparse_tensor.SparseTensor) + ] + dense_inputs = [ + i for i in inputs if not isinstance(i, sparse_tensor.SparseTensor) + ] indices = [sp_input.indices for sp_input in sparse_inputs] values = [sp_input.values for sp_input in sparse_inputs] @@ -504,8 +510,9 @@ def sparse_reorder(sp_input, name=None): """ sp_input = _convert_to_sparse_tensor(sp_input) - reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder( - sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)) + reordered_ind, reordered_val = ( + gen_sparse_ops._sparse_reorder( + sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)) if sp_input.get_shape().is_fully_defined(): dense_shape = sp_input.get_shape().as_list() @@ -572,8 +579,8 @@ def sparse_reshape(sp_input, shape, name=None): sp_input.indices, sp_input.dense_shape, shape, name=name) reshaped_shape_const = tensor_util.constant_value(shape) - if (reshaped_shape_const is not None - and sp_input.get_shape().is_fully_defined()): + if (reshaped_shape_const is not None and + sp_input.get_shape().is_fully_defined()): num_implied = sum((dim == -1) for dim in reshaped_shape_const) if num_implied > 1: raise ValueError("At most one dimension can be inferred (-1). Found: %s" @@ -589,15 +596,15 @@ def sparse_reshape(sp_input, shape, name=None): in_shape_size // np.prod(non_implied_idx)) reshaped_size = np.prod(reshaped_shape_const) if reshaped_size != in_shape_size: - raise ValueError( - "Cannot reshape a tensor with %d elements to shape %s " - "(%d elements)." - % (in_shape_size, original_reshaped_shape, reshaped_size)) + raise ValueError("Cannot reshape a tensor with %d elements to shape %s " + "(%d elements)." 
% + (in_shape_size, original_reshaped_shape, + reshaped_size)) reshaped_shape = reshaped_shape_const - return sparse_tensor.SparseTensor( - reshaped_ind, array_ops.identity(sp_input.values), - reshaped_shape) + return sparse_tensor.SparseTensor(reshaped_ind, + array_ops.identity(sp_input.values), + reshaped_shape) # TODO(aselle): Remove keyword required once for 1.0 final @@ -610,8 +617,11 @@ class KeywordRequired(object): @tf_export("sparse_split") def sparse_split(keyword_required=KeywordRequired(), - sp_input=None, num_split=None, axis=None, - name=None, split_dim=None): + sp_input=None, + num_split=None, + axis=None, + name=None, + split_dim=None): """Split a `SparseTensor` into `num_split` tensors along `axis`. If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split` @@ -660,18 +670,19 @@ def sparse_split(keyword_required=KeywordRequired(), split_dim) sp_input = _convert_to_sparse_tensor(sp_input) - output_inds, output_vals, output_shapes = (gen_sparse_ops._sparse_split( - axis, - sp_input.indices, - sp_input.values, - sp_input.dense_shape, - num_split, - name=name)) + output_inds, output_vals, output_shapes = ( + gen_sparse_ops._sparse_split( + axis, + sp_input.indices, + sp_input.values, + sp_input.dense_shape, + num_split, + name=name)) sparse_tensors = [] for i in range(0, num_split): sparse_tensors.append( - sparse_tensor.SparseTensor( - output_inds[i], output_vals[i], output_shapes[i])) + sparse_tensor.SparseTensor(output_inds[i], output_vals[i], + output_shapes[i])) return sparse_tensors @@ -713,12 +724,15 @@ def sparse_slice(sp_input, start, size, name=None): with ops.name_scope(name, "SparseSlice", [sp_input]) as name: output_indices, output_values, output_shape = gen_sparse_ops.sparse_slice( - sp_input.indices, sp_input.values, sp_input.dense_shape, start, size, name=name) + sp_input.indices, + sp_input.values, + sp_input.dense_shape, + start, + size, + name=name) - return sparse_tensor.SparseTensor( - output_indices, - output_values, - output_shape) + return sparse_tensor.SparseTensor(output_indices, output_values, + output_shape) @tf_export("sparse_to_dense") @@ -819,14 +833,14 @@ def sparse_reduce_max(sp_input, axis=None, keep_dims=False, The reduced Tensor. """ return gen_sparse_ops.sparse_reduce_max( - sp_input.indices, sp_input.values, - sp_input.dense_shape, - math_ops._ReductionDims(sp_input, axis, reduction_axes), - keep_dims) + sp_input.indices, sp_input.values, sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims) @tf_export("sparse_reduce_max_sparse") -def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False, +def sparse_reduce_max_sparse(sp_input, + axis=None, + keep_dims=False, reduction_axes=None): """Computes the max of elements across dimensions of a SparseTensor. @@ -855,10 +869,8 @@ def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False, """ output_ind, output_val, output_shape = ( gen_sparse_ops.sparse_reduce_max_sparse( - sp_input.indices, sp_input.values, - sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis, - reduction_axes), - keep_dims)) + sp_input.indices, sp_input.values, sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) @@ -905,14 +917,14 @@ def sparse_reduce_sum(sp_input, axis=None, keep_dims=False, The reduced Tensor. 
""" return gen_sparse_ops.sparse_reduce_sum( - sp_input.indices, sp_input.values, - sp_input.dense_shape, - math_ops._ReductionDims(sp_input, axis, reduction_axes), - keep_dims) + sp_input.indices, sp_input.values, sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims) @tf_export("sparse_reduce_sum_sparse") -def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False, +def sparse_reduce_sum_sparse(sp_input, + axis=None, + keep_dims=False, reduction_axes=None): """Computes the sum of elements across dimensions of a SparseTensor. @@ -941,10 +953,8 @@ def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False, """ output_ind, output_val, output_shape = ( gen_sparse_ops.sparse_reduce_sum_sparse( - sp_input.indices, sp_input.values, - sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis, - reduction_axes), - keep_dims)) + sp_input.indices, sp_input.values, sp_input.dense_shape, + math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)) return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) @@ -1053,8 +1063,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None): with ops.name_scope(name, "SparseToIndicator", [sp_input]) as name: num_entries = array_ops.shape(sp_input.indices)[0] new_values = array_ops.fill(array_ops.expand_dims(num_entries, 0), True) - sp_values = sparse_tensor.SparseTensor( - sp_input.indices, new_values, sp_input.dense_shape) + sp_values = sparse_tensor.SparseTensor(sp_input.indices, new_values, + sp_input.dense_shape) sp_new = sparse_merge(sp_input, sp_values, vocab_size, name) @@ -1174,8 +1184,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None, raise TypeError("vocab_size has to be a list of Tensors or Python ints. " "Found %s" % type(vocab_size)) for dim in vocab_size: - if not (isinstance(dim, ops.Tensor) or - isinstance(dim, numbers.Integral)): + if not (isinstance(dim, ops.Tensor) or isinstance(dim, numbers.Integral)): raise TypeError( "vocab_size has to be a list of Tensors or Python ints. Found %s" % type(dim)) @@ -1326,24 +1335,23 @@ def sparse_reset_shape(sp_input, new_shape=None): # error before the sparse_tensor.SparseTensor catches it. output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0]) - output_shape_tensor_const = tensor_util.constant_value( - output_shape_tensor) + output_shape_tensor_const = tensor_util.constant_value(output_shape_tensor) # For cases where all shapes are known during graph construction - if (output_shape_tensor_const is not None - and sp_input.get_shape().is_fully_defined()): + if (output_shape_tensor_const is not None and + sp_input.get_shape().is_fully_defined()): in_shape_const = np.array(sp_input.get_shape().as_list()) if not np.all(in_shape_const <= output_shape_tensor_const): raise ValueError( "Requested new_shape should have dimension sizes >= sp_input.shape." - " Found new_shape (%s), sp_input.shape (%s)." - % (in_shape_const, output_shape_tensor_const)) + " Found new_shape (%s), sp_input.shape (%s)." % + (in_shape_const, output_shape_tensor_const)) output_shape_tensor = output_shape_tensor_const else: # For cases where shape is not known during graph construction. 
- output_shape_tensor = control_flow_ops.with_dependencies( - [check_ops.assert_equal( - array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))], - output_shape_tensor) + output_shape_tensor = control_flow_ops.with_dependencies([ + check_ops.assert_equal( + array_ops.shape(in_shape), array_ops.shape(output_shape_tensor)) + ], output_shape_tensor) output_shape_tensor = control_flow_ops.with_dependencies( [check_ops.assert_less_equal(in_shape, output_shape_tensor)], output_shape_tensor) @@ -1409,10 +1417,10 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None): values=sp_input.values, dense_shape=sp_input.dense_shape, default_value=default_value) - return (sparse_tensor.SparseTensor(indices=output_indices, - values=output_values, - dense_shape=sp_input.dense_shape), - empty_row_indicator) + return (sparse_tensor.SparseTensor( + indices=output_indices, + values=output_values, + dense_shape=sp_input.dense_shape), empty_row_indicator) @tf_export("serialize_sparse") @@ -1880,8 +1888,8 @@ def sparse_softmax(sp_input, name=None): [sp_input.indices, sp_input.values]) as name: out_vals = gen_sparse_ops.sparse_softmax(sp_input.indices, sp_input.values, sp_input.dense_shape) - return sparse_tensor.SparseTensor( - sp_input.indices, out_vals, sp_input.dense_shape) + return sparse_tensor.SparseTensor(sp_input.indices, out_vals, + sp_input.dense_shape) @tf_export("sparse_maximum") @@ -1907,9 +1915,9 @@ def sparse_maximum(sp_a, sp_b, name=None): Returns: output: the output SparseTensor. """ - with ops.name_scope(name, "SparseSparseMaximum", [sp_a.indices, sp_a.values, - sp_b.indices, - sp_b.values]) as name: + with ops.name_scope( + name, "SparseSparseMaximum", + [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name: out_indices, out_values = gen_sparse_ops.sparse_sparse_maximum( sp_a.indices, sp_a.values, @@ -1944,9 +1952,9 @@ def sparse_minimum(sp_a, sp_b, name=None): Returns: output: the output SparseTensor. """ - with ops.name_scope(name, "SparseSparseMinimum", [sp_a.indices, sp_a.values, - sp_b.indices, - sp_b.values]) as name: + with ops.name_scope( + name, "SparseSparseMinimum", + [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name: out_indices, out_values = gen_sparse_ops.sparse_sparse_minimum( sp_a.indices, sp_a.values, @@ -2010,14 +2018,15 @@ def sparse_transpose(sp_input, perm=None, name=None): dense_shape = sp_input.dense_shape transposed_dense_shape = array_ops.gather(dense_shape, perm) transposed_st = sparse_tensor.SparseTensor( - transposed_indices, sp_input.values, - transposed_dense_shape) + transposed_indices, sp_input.values, transposed_dense_shape) transposed_st = sparse_reorder(transposed_st) return transposed_st -def _add_sparse_to_tensors_map(sp_input, container=None, - shared_name=None, name=None): +def _add_sparse_to_tensors_map(sp_input, + container=None, + shared_name=None, + name=None): """Add a `SparseTensor` to a `SparseTensorsMap` and return its handle. 
Args: @@ -2038,12 +2047,18 @@ def _add_sparse_to_tensors_map(sp_input, container=None, sp_input = _convert_to_sparse_tensor(sp_input) return gen_sparse_ops._add_sparse_to_tensors_map( - sp_input.indices, sp_input.values, sp_input.dense_shape, - container=container, shared_name=shared_name, name=name) + sp_input.indices, + sp_input.values, + sp_input.dense_shape, + container=container, + shared_name=shared_name, + name=name) -def _add_many_sparse_to_tensors_map(sp_input, container=None, - shared_name=None, name=None): +def _add_many_sparse_to_tensors_map(sp_input, + container=None, + shared_name=None, + name=None): """Add a minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles. The `SparseTensor` must have rank `R` greater than 1, and the first dimension @@ -2072,12 +2087,18 @@ def _add_many_sparse_to_tensors_map(sp_input, container=None, sp_input = _convert_to_sparse_tensor(sp_input) return gen_sparse_ops._add_many_sparse_to_tensors_map( - sp_input.indices, sp_input.values, sp_input.dense_shape, - container=container, shared_name=shared_name, name=name) + sp_input.indices, + sp_input.values, + sp_input.dense_shape, + container=container, + shared_name=shared_name, + name=name) -def _take_many_sparse_from_tensors_map( - sparse_map_op, sparse_handles, rank=None, name=None): +def _take_many_sparse_from_tensors_map(sparse_map_op, + sparse_handles, + rank=None, + name=None): """Read `SparseTensors` from a `SparseTensorsMap` and concatenate them. The input `sparse_handles` must be a string matrix of shape `[N, 1]` where @@ -2140,16 +2161,18 @@ def _take_many_sparse_from_tensors_map( raise TypeError("sparse_map_op be an Operation") if sparse_map_op.type not in ("AddSparseToTensorsMap", "AddManySparseToTensorsMap"): - raise TypeError("sparse_map_op must be one of AddSparseToTensorsMap or " - "AddSparseToTensorsMap. Instead, found `%s`." % - sparse_map_op.type) + raise TypeError( + "sparse_map_op must be one of AddSparseToTensorsMap or " + "AddSparseToTensorsMap. Instead, found `%s`." 
% sparse_map_op.type) with ops.colocate_with(sparse_map_op): shared_name = sparse_map_op.get_attr("shared_name") or sparse_map_op.name output_indices, output_values, output_shape = ( gen_sparse_ops._take_many_sparse_from_tensors_map( - sparse_handles, dtype=sparse_map_op.get_attr("T"), + sparse_handles, + dtype=sparse_map_op.get_attr("T"), container=sparse_map_op.get_attr("container"), - shared_name=shared_name, name=name)) + shared_name=shared_name, + name=name)) # Feed rank data back in, if available output_indices.set_shape([None, rank]) diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py index 15127862a4e..6d7eaababcd 100644 --- a/tensorflow/python/ops/special_math_ops.py +++ b/tensorflow/python/ops/special_math_ops.py @@ -192,8 +192,8 @@ def einsum(equation, *inputs, **kwargs): input_count = sum(1 for s in input_axis_labels if a in s) if input_count > 2 and a not in output_axis_labels: logging.warn( - 'Falling back to exponential-space implementation of einsum() because' - ' index "%s" is summed over more than two inputs.', a) + 'Falling back to exponential-space implementation of einsum()' + ' because index "%s" is summed over more than two inputs.', a) return _exponential_space_einsum(equation, *inputs) temp = inputs[0] diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py index 30bf4e4ef1b..737b923415f 100644 --- a/tensorflow/python/ops/standard_ops.py +++ b/tensorflow/python/ops/standard_ops.py @@ -26,6 +26,7 @@ import sys as _sys from tensorflow.python.ops import array_grad from tensorflow.python.ops import data_flow_grad from tensorflow.python.ops import math_grad +from tensorflow.python.ops import manip_grad from tensorflow.python.ops import sparse_grad from tensorflow.python.ops import spectral_grad from tensorflow.python.ops import state_grad @@ -59,6 +60,7 @@ from tensorflow.python.ops.logging_ops import Print from tensorflow.python.ops.logging_ops import get_summary_op from tensorflow.python.ops.lookup_ops import initialize_all_tables from tensorflow.python.ops.lookup_ops import tables_initializer +from tensorflow.python.ops.manip_ops import * from tensorflow.python.ops.math_ops import * from tensorflow.python.ops.numerics import * from tensorflow.python.ops.parsing_ops import * @@ -105,6 +107,7 @@ from tensorflow.python.ops import init_ops as _init_ops from tensorflow.python.ops import io_ops as _io_ops from tensorflow.python.ops import linalg_ops as _linalg_ops from tensorflow.python.ops import logging_ops as _logging_ops +from tensorflow.python.ops import manip_ops as _manip_ops from tensorflow.python.ops import math_ops as _math_ops from tensorflow.python.ops import numerics as _numerics from tensorflow.python.ops import parsing_ops as _parsing_ops @@ -280,6 +283,7 @@ remove_undocumented(__name__, _allowed_symbols, _io_ops, _linalg_ops, _logging_ops, + _manip_ops, _math_ops, _numerics, _parsing_ops, diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py index 3cc76fdbf34..f00213eb88d 100644 --- a/tensorflow/python/ops/state_ops.py +++ b/tensorflow/python/ops/state_ops.py @@ -278,7 +278,7 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None): return gen_state_ops.assign( ref, value, use_locking=use_locking, name=name, validate_shape=validate_shape) - return ref.assign(value) + return ref.assign(value, name=name) @tf_export("count_up_to") diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py index 
84449e00beb..806fdd3da7a 100644 --- a/tensorflow/python/ops/template.py +++ b/tensorflow/python/ops/template.py @@ -140,7 +140,7 @@ def make_template(name_, func_, create_scope_now_=False, unique_name_=None, re-enter the scope and reuse those variables. Raises: - ValueError: if the name is None. + ValueError: if `name_` is None. """ return make_template_internal( name_, @@ -176,16 +176,14 @@ def make_template_internal(name_, custom_getter_: Optional custom getter for variables used in `func_`. See the @{tf.get_variable} `custom_getter` documentation for more information. - create_graph_function_: When True, the first invocation of the template will - execute `func_` as is, to allow for variable creation; however, the second - invocation and every invocation thereafter will execute func as a graph - function. In particular, this implies that `func_` must satisfy the - properties that `function.defun` requires of functions: See the - documentation of `function.defun` for details. When executing eagerly, - setting this flag to True can improve performance. Regardless of whether - eager execution is enabled, enabling this flag gives the caller access to - graph-function semantics, i.e., accesses to variables are totally ordered - and side-effecting ops are not pruned. + create_graph_function_: When True, `func_` will be executed as a graph + function. This implies that `func_` must satisfy the properties that + `function.defun` requires of functions: See the documentation of + `function.defun` for details. When executing eagerly, setting this flag to + True can improve performance. Regardless of whether eager execution is + enabled, enabling this flag gives the caller access to graph-function + semantics, i.e., accesses to variables are totally ordered and + side-effecting ops are not pruned. **kwargs: Keyword arguments to apply to `func_`. Returns: @@ -198,8 +196,8 @@ def make_template_internal(name_, re-enter the scope and reuse those variables. Raises: - ValueError: if the name is None. - ValueError: if unique_name_ is not None and eager execution is enabled. + ValueError: if `name_` is None. + ValueError: if `unique_name_` is not None and eager execution is enabled. """ if kwargs: @@ -266,18 +264,18 @@ class Template(object): template of the same scope/unique_name already exists and reuse is false, an error is raised. Defaults to None. custom_getter: optional custom getter to pass to `variable_scope()` - create_graph_function: When True, the first invocation of the template - will execute `func` as is, to allow for variable creation; however, the - second invocation and every invocation thereafter will execute `func` as - a graph function. Enabling this flag gives the caller access to - graph-function semantics, i.e., accesses to variables are totally - ordered and side-effecting ops are not pruned. - + create_graph_function: When True, `func` will be executed as a graph + function. Enabling this flag gives the caller access to graph-function + semantics, i.e., accesses to variables are totally ordered and + side-effecting ops are not pruned. Raises: - ValueError: if the name is None. + ValueError: if `name` is None. 
""" - self._func = func + if create_graph_function: + self._func = function.defun(func) + else: + self._func = func self._stacktrace = traceback.format_stack()[:-2] self._name = name self._unique_name = unique_name @@ -295,19 +293,13 @@ class Template(object): # This variable keeps track of whether the template has been called yet, # which is not the same as whether the scope has been created. self._variables_created = False - self._create_graph_function = create_graph_function def _call_func(self, args, kwargs): try: vars_at_start = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) trainable_at_start = len( ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) - result = self._func(*args, **kwargs) - if self._create_graph_function and not self._variables_created: - # Only execute self._func as a graph function once variables are - # created. - self._func = function.defun(self._func) if self._variables_created: # Variables were previously created, implying this is not the first @@ -542,14 +534,11 @@ class EagerTemplate(Template): names of all created Tensors. If set to False, the scope will be created at the first call location. custom_getter: optional custom getter to pass to `variable_scope()` - create_graph_function: When True, the first invocation of the template - will execute `func` as is, to allow for variable creation; however, the - second invocation and every invocation thereafter will execute `func` as - a graph function. Enabling this flag allows the caller to reap the - performance benefits associated with executing graphs, at the cost of - sacrificing debuggability; however, not all functions can be compiled - into graph functions. See the documentation for `function.defun` for - details. + create_graph_function: When True, `func` will be executed as a graph + function. Enabling this flag allows the caller to reap the performance + benefits associated with executing graphs, at the cost of sacrificing + debuggability; however, not all Python functions can be compiled into + graph functions. See the documentation for `function.defun` for details. Raises: RuntimeError: if eager execution is not enabled. @@ -573,12 +562,7 @@ class EagerTemplate(Template): try: vars_at_start = self._template_store.variables() trainable_at_start = self._template_store.trainable_variables() - result = self._func(*args, **kwargs) - if self._create_graph_function and not self._variables_created: - # Only execute self._func as a graph function once variables are - # created. 
- self._func = function.defun(self._func) if self._variables_created: # Variables were previously created, implying this is not the first diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py index 9b92d9a1800..cce64c0ccaf 100644 --- a/tensorflow/python/platform/app.py +++ b/tensorflow/python/platform/app.py @@ -23,6 +23,7 @@ import sys as _sys from tensorflow.python.platform import flags from tensorflow.python.util.all_util import remove_undocumented +from tensorflow.python.util.tf_export import tf_export def _usage(shorthelp): @@ -108,6 +109,7 @@ def _define_help_flags(): _define_help_flags_called = True +@tf_export('app.run') def run(main=None, argv=None): """Runs the program with an optional 'main' function and 'argv' list.""" diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py index 2455acb4c0c..8f7b12e2b2b 100644 --- a/tensorflow/python/platform/resource_loader.py +++ b/tensorflow/python/platform/resource_loader.py @@ -29,8 +29,10 @@ import sys as _sys from tensorflow.python.util import tf_inspect as _inspect from tensorflow.python.util.all_util import remove_undocumented +from tensorflow.python.util.tf_export import tf_export +@tf_export('resource_loader.load_resource') def load_resource(path): """Load the resource at given path, where path is relative to tensorflow/. @@ -52,6 +54,7 @@ def load_resource(path): # pylint: disable=protected-access +@tf_export('resource_loader.get_data_files_path') def get_data_files_path(): """Get a direct path to the data files colocated with the script. @@ -62,6 +65,7 @@ def get_data_files_path(): return _os.path.dirname(_inspect.getfile(_sys._getframe(1))) +@tf_export('resource_loader.get_root_dir_with_all_resources') def get_root_dir_with_all_resources(): """Get a root directory containing all the data attributes in the build rule. @@ -101,6 +105,7 @@ def get_root_dir_with_all_resources(): return data_files_dir or script_dir +@tf_export('resource_loader.get_path_to_datafile') def get_path_to_datafile(path): """Get the path to the specified file in the data dependencies. @@ -120,6 +125,7 @@ def get_path_to_datafile(path): return _os.path.join(data_files_path, path) +@tf_export('resource_loader.readahead_file_path') def readahead_file_path(path, readahead='128M'): # pylint: disable=unused-argument """Readahead files not implemented; simply returns given path.""" return path diff --git a/tensorflow/python/platform/stacktrace_handler_test.py b/tensorflow/python/platform/stacktrace_handler_test.py index 3f0e534f4cb..f2071f9d54c 100644 --- a/tensorflow/python/platform/stacktrace_handler_test.py +++ b/tensorflow/python/platform/stacktrace_handler_test.py @@ -57,7 +57,8 @@ class StacktraceHandlerTest(test.TestCase): # Capture its output. capture both stdout and stderr and append them. # We are not worried about timing or order of messages in this test. - child_output = child_process.stdout.read() + child_process.stderr.read() + child_stdout, child_stderr = child_process.communicate() + child_output = child_stdout + child_stderr # Make sure the child process is dead before we proceed. 
child_process.wait() diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py index 85ed4f071c7..22aabfd7121 100644 --- a/tensorflow/python/platform/tf_logging.py +++ b/tensorflow/python/platform/tf_logging.py @@ -35,6 +35,7 @@ import threading import six from tensorflow.python.util.all_util import remove_undocumented +from tensorflow.python.util.tf_export import tf_export # Don't use this directly. Use _get_logger() instead. @@ -90,30 +91,37 @@ def _get_logger(): _logger_lock.release() +@tf_export('logging.log') def log(level, msg, *args, **kwargs): _get_logger().log(level, msg, *args, **kwargs) +@tf_export('logging.debug') def debug(msg, *args, **kwargs): _get_logger().debug(msg, *args, **kwargs) +@tf_export('logging.error') def error(msg, *args, **kwargs): _get_logger().error(msg, *args, **kwargs) +@tf_export('logging.fatal') def fatal(msg, *args, **kwargs): _get_logger().fatal(msg, *args, **kwargs) +@tf_export('logging.info') def info(msg, *args, **kwargs): _get_logger().info(msg, *args, **kwargs) +@tf_export('logging.warn') def warn(msg, *args, **kwargs): _get_logger().warn(msg, *args, **kwargs) +@tf_export('logging.warning') def warning(msg, *args, **kwargs): _get_logger().warning(msg, *args, **kwargs) @@ -136,15 +144,18 @@ _log_prefix = None # later set to google2_log_prefix _log_counter_per_token = {} +@tf_export('logging.TaskLevelStatusMessage') def TaskLevelStatusMessage(msg): error(msg) +@tf_export('logging.flush') def flush(): raise NotImplementedError() # Code below is taken from pyglib/logging +@tf_export('logging.vlog') def vlog(level, msg, *args, **kwargs): _get_logger().log(level, msg, *args, **kwargs) @@ -164,6 +175,7 @@ def _GetNextLogCountPerToken(token): return _log_counter_per_token[token] +@tf_export('logging.log_every_n') def log_every_n(level, msg, n, *args): """Log 'msg % args' at level 'level' once per 'n' times. @@ -180,6 +192,7 @@ def log_every_n(level, msg, n, *args): log_if(level, msg, not (count % n), *args) +@tf_export('logging.log_first_n') def log_first_n(level, msg, n, *args): # pylint: disable=g-bad-name """Log 'msg % args' at level 'level' only first 'n' times. 
@@ -195,6 +208,7 @@ def log_first_n(level, msg, n, *args): # pylint: disable=g-bad-name log_if(level, msg, count < n, *args) +@tf_export('logging.log_if') def log_if(level, msg, condition, *args): """Log 'msg % args' at level 'level' only if condition is fulfilled.""" if condition: @@ -251,11 +265,13 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None): return s +@tf_export('logging.get_verbosity') def get_verbosity(): """Return how much logging output will be produced.""" return _get_logger().getEffectiveLevel() +@tf_export('logging.set_verbosity') def set_verbosity(v): """Sets the threshold for what messages will be logged.""" _get_logger().setLevel(v) @@ -296,4 +312,10 @@ _allowed_symbols = [ 'warning', ] +tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG') +tf_export('logging.ERROR').export_constant(__name__, 'ERROR') +tf_export('logging.FATAL').export_constant(__name__, 'FATAL') +tf_export('logging.INFO').export_constant(__name__, 'INFO') +tf_export('logging.WARN').export_constant(__name__, 'WARN') + remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py index 8f780545607..0e20ca35bba 100644 --- a/tensorflow/python/profiler/model_analyzer.py +++ b/tensorflow/python/profiler/model_analyzer.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.profiler import option_builder from tensorflow.python.profiler import tfprof_logger +from tensorflow.python.util.tf_export import tf_export _DEFAULT_PROFILE_OPTIONS = 0 _DEFAULT_ADVISE_OPTIONS = 0 @@ -121,6 +122,7 @@ def _build_advisor_options(options): return opts +@tf_export('profiler.Profiler') class Profiler(object): """TensorFlow multi-step profiler. @@ -304,6 +306,7 @@ class Profiler(object): print_mdl.WriteProfile(filename) +@tf_export('profiler.profile') def profile(graph=None, run_meta=None, op_log=None, @@ -378,6 +381,7 @@ def profile(graph=None, return tfprof_node +@tf_export('profiler.advise') def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS): """Auto profile and advise. diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py index 13942ad6a2a..957ebe6dddc 100644 --- a/tensorflow/python/profiler/option_builder.py +++ b/tensorflow/python/profiler/option_builder.py @@ -20,8 +20,10 @@ from __future__ import print_function import copy from tensorflow.python.profiler import tfprof_logger +from tensorflow.python.util.tf_export import tf_export +@tf_export('profiler.ProfileOptionBuilder') class ProfileOptionBuilder(object): # pylint: disable=line-too-long """Option Builder for Profiling API. 
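The tf_logging.py hunks above attach `@tf_export('logging.*')` decorators to the existing logging helpers (`log_every_n`, `log_first_n`, `log_if`, `set_verbosity`, and the DEBUG/INFO/WARN/ERROR/FATAL constants). Below is a minimal usage sketch of those helpers; it assumes they surface under the `tf.logging` namespace that the decorators name, which is where these symbols already live in the 1.x API. The step values and messages are illustrative only.

```python
import tensorflow as tf

# Only messages at INFO level or above will be emitted.
tf.logging.set_verbosity(tf.logging.INFO)

for step in range(1, 501):
    # Logs 'msg % args' at INFO level once every 100 calls.
    tf.logging.log_every_n(tf.logging.INFO, 'processed step %d', 100, step)
    # Logs only for the first 3 calls.
    tf.logging.log_first_n(tf.logging.WARN, 'warm-up step %d', 3, step)
    # Logs only when the condition is fulfilled.
    tf.logging.log_if(tf.logging.ERROR, 'loss diverged at step %d', step == 404, step)
```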
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py index ffda7ddad75..8d121064967 100644 --- a/tensorflow/python/profiler/tfprof_logger.py +++ b/tensorflow/python/profiler/tfprof_logger.py @@ -30,6 +30,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.platform import gfile from tensorflow.python.profiler.internal import flops_registry # pylint: disable=unused-import +from tensorflow.python.util.tf_export import tf_export TRAINABLE_VARIABLES = '_trainable_variables' REGISTERED_FLOP_STATS = 'flops' @@ -187,6 +188,7 @@ def merge_default_with_oplog(graph, op_log=None, run_meta=None, return tmp_op_log +@tf_export('profiler.write_op_log') def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True): """Log provided 'op_log', and add additional model information below. diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py index 5ff954fd9f8..6e85df0cbf5 100644 --- a/tensorflow/python/saved_model/loader_impl.py +++ b/tensorflow/python/saved_model/loader_impl.py @@ -232,13 +232,9 @@ def load(sess, tags, export_dir, **saver_kwargs): asset_tensors_dictionary = _get_asset_tensors(export_dir, meta_graph_def_to_load) - main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load) + main_op_tensor = (_get_main_op_tensor(meta_graph_def_to_load) or + (_get_legacy_init_op_tensor(meta_graph_def_to_load))) if main_op_tensor is not None: sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary) - else: - legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load) - if legacy_init_op_tensor is not None: - sess.run( - fetches=[legacy_init_op_tensor], feed_dict=asset_tensors_dictionary) return meta_graph_def_to_load diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py index 1ea619ff55d..f92247d52e4 100644 --- a/tensorflow/python/saved_model/saved_model_test.py +++ b/tensorflow/python/saved_model/saved_model_test.py @@ -54,8 +54,14 @@ def tearDownModule(): file_io.delete_recursively(test.get_temp_dir()) +@test_util.with_c_api class SavedModelTest(test.TestCase): + def _get_export_dir(self, label): + if ops._USE_C_API: + label += "_c_api" + return os.path.join(test.get_temp_dir(), label) + def _init_and_validate_variable(self, sess, variable_name, variable_value): v = variables.Variable(variable_value, name=variable_name) sess.run(variables.global_variables_initializer()) @@ -123,8 +129,7 @@ class SavedModelTest(test.TestCase): self.assertFalse(loader.maybe_saved_model_directory(base_path)) def testBadSavedModelFileFormat(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_bad_saved_model_file_format") + export_dir = self._get_export_dir("test_bad_saved_model_file_format") # Attempt to load a SavedModel from an export directory that does not exist. 
with self.test_session(graph=ops.Graph()) as sess: with self.assertRaisesRegexp(IOError, @@ -157,8 +162,7 @@ class SavedModelTest(test.TestCase): loader.load(sess, ["foo"], export_dir) def testVerifySessionGraphUsage(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_verify_session_graph_usage") + export_dir = self._get_export_dir("test_verify_session_graph_usage") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -178,7 +182,7 @@ class SavedModelTest(test.TestCase): 42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval()) def testSequence(self): - export_dir = os.path.join(test.get_temp_dir(), "test_sequence") + export_dir = self._get_export_dir("test_sequence") builder = saved_model_builder.SavedModelBuilder(export_dir) # Expect an assertion error since add_meta_graph_and_variables() should be @@ -195,7 +199,7 @@ class SavedModelTest(test.TestCase): sess, ["baz"]) def testTags(self): - export_dir = os.path.join(test.get_temp_dir(), "test_tags") + export_dir = self._get_export_dir("test_tags") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable. SavedModel invoked to: @@ -284,7 +288,7 @@ class SavedModelTest(test.TestCase): export_dir) def testVariables(self): - export_dir = os.path.join(test.get_temp_dir(), "test_variables") + export_dir = self._get_export_dir("test_variables") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with two variables. SavedModel invoked to: @@ -336,7 +340,7 @@ class SavedModelTest(test.TestCase): export_dir) def testGraphWithoutVariables(self): - export_dir = os.path.join(test.get_temp_dir(), "test_graph_has_variables") + export_dir = self._get_export_dir("test_graph_has_variables") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with no variables. @@ -371,7 +375,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(30.0, sess.run(c)) def testNoOverwrite(self): - export_dir = os.path.join(test.get_temp_dir(), "test_no_overwrite") + export_dir = self._get_export_dir("test_no_overwrite") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable. SavedModel invoked to: @@ -395,7 +399,7 @@ class SavedModelTest(test.TestCase): export_dir) def testSaveAsText(self): - export_dir = os.path.join(test.get_temp_dir(), "test_astext") + export_dir = self._get_export_dir("test_astext") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable. SavedModel invoked to: @@ -426,7 +430,7 @@ class SavedModelTest(test.TestCase): 42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval()) def testCollections(self): - export_dir = os.path.join(test.get_temp_dir(), "test_collections") + export_dir = self._get_export_dir("test_collections") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable added to a collection. SavedModel invoked to: @@ -476,7 +480,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(len(ops.get_collection("foo_vars")), 0) def testSignatureDefs(self): - export_dir = os.path.join(test.get_temp_dir(), "test_signature_defs") + export_dir = self._get_export_dir("test_signature_defs") builder = saved_model_builder.SavedModelBuilder(export_dir) # Graph with a single variable and a single entry in the signature def map. 
@@ -536,8 +540,7 @@ class SavedModelTest(test.TestCase): self.assertEqual("foo_new", bar_signature["foo_key"].method_name) def testSignatureDefValidation(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_signature_def_validation") + export_dir = self._get_export_dir("test_signature_def_validation") builder = saved_model_builder.SavedModelBuilder(export_dir) tensor_without_name = meta_graph_pb2.TensorInfo() @@ -555,7 +558,7 @@ class SavedModelTest(test.TestCase): self._validate_outputs_tensor_info(builder, tensor_empty) def testAssets(self): - export_dir = os.path.join(test.get_temp_dir(), "test_assets") + export_dir = self._get_export_dir("test_assets") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -588,7 +591,7 @@ class SavedModelTest(test.TestCase): self.assertFalse(file_io.file_exists(ignored_asset_path)) def testCustomMainOp(self): - export_dir = os.path.join(test.get_temp_dir(), "test_main_op") + export_dir = self._get_export_dir("test_main_op") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -623,7 +626,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(3, ops.get_collection("v")[2].eval()) def testLegacyInitOp(self): - export_dir = os.path.join(test.get_temp_dir(), "test_legacy_init_op") + export_dir = self._get_export_dir("test_legacy_init_op") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -657,8 +660,8 @@ class SavedModelTest(test.TestCase): self.assertEqual(3, ops.get_collection("v")[2].eval()) def testLegacyInitOpWithNonEmptyCollection(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_legacy_init_op_with_non_empty_collection") + export_dir = self._get_export_dir( + "test_legacy_init_op_with_non_empty_collection") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -685,7 +688,7 @@ class SavedModelTest(test.TestCase): sess, ["foo"], legacy_init_op=legacy_init_op) def testMultipleAssets(self): - export_dir = os.path.join(test.get_temp_dir(), "test_multiple_assets") + export_dir = self._get_export_dir("test_multiple_assets") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -727,7 +730,7 @@ class SavedModelTest(test.TestCase): "asset_file_tensor:0") def testDuplicateAssets(self): - export_dir = os.path.join(test.get_temp_dir(), "test_duplicate_assets") + export_dir = self._get_export_dir("test_duplicate_assets") builder = saved_model_builder.SavedModelBuilder(export_dir) with self.test_session(graph=ops.Graph()) as sess: @@ -775,7 +778,7 @@ class SavedModelTest(test.TestCase): "asset_file_tensor:0") def testOp(self): - export_dir = os.path.join(test.get_temp_dir(), "test_op") + export_dir = self._get_export_dir("test_op") builder = saved_model_builder.SavedModelBuilder(export_dir) with session.Session( @@ -818,7 +821,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(3, ops.get_collection("v")[2].eval()) def testCustomSaveable(self): - export_dir = os.path.join(test.get_temp_dir(), "custom_saveable") + export_dir = self._get_export_dir("custom_saveable") builder = saved_model_builder.SavedModelBuilder(export_dir) with session.Session( @@ -847,7 +850,7 @@ class SavedModelTest(test.TestCase): self.assertEqual(3.0, v1.values().eval()) def testClearDevices(self): - export_dir = os.path.join(test.get_temp_dir(), 
"test_clear_devices") + export_dir = self._get_export_dir("test_clear_devices") builder = saved_model_builder.SavedModelBuilder(export_dir) # Specify a device and save a variable. @@ -871,7 +874,9 @@ class SavedModelTest(test.TestCase): 42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval()) def testStripDefaultAttrs(self): - export_dir = os.path.join(test.get_temp_dir(), "test_strip_default_attrs") + if ops._USE_C_API: return # TODO(skyewm): get this working + + export_dir = self._get_export_dir("test_strip_default_attrs") builder = saved_model_builder.SavedModelBuilder(export_dir) # Add a graph with two float32 variables and a Complex Op composing them @@ -941,8 +946,10 @@ class SavedModelTest(test.TestCase): self.assertIn("Tout", node_def.attr) def testStripDefaultAttrsInconsistentConsumerDefaults(self): - export_dir = os.path.join(test.get_temp_dir(), - "test_strip_default_attrs_no_consumer_defaults") + if ops._USE_C_API: return # TODO(skyewm): get this working + + export_dir = self._get_export_dir( + "test_strip_default_attrs_no_consumer_defaults") builder = saved_model_builder.SavedModelBuilder(export_dir) # Add a graph with two float32 variables and a Complex Op composing them diff --git a/tensorflow/python/saved_model/simple_save.py b/tensorflow/python/saved_model/simple_save.py index 9a81e5cd807..1e4cc733706 100644 --- a/tensorflow/python/saved_model/simple_save.py +++ b/tensorflow/python/saved_model/simple_save.py @@ -40,17 +40,20 @@ def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None): - It will be treated as a graph for inference / serving (i.e. uses the tag `tag_constants.SERVING`) - The SavedModel will load in TensorFlow Serving and supports the - [Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto). + [Predict + API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto). To use the Classify, Regress, or MultiInference APIs, please use either [tf.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator) or the lower level - [SavedModel APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md). + [SavedModel + APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md). - Some TensorFlow ops depend on information on disk or other information called "assets". These are generally handled automatically by adding the assets to the `GraphKeys.ASSET_FILEPATHS` collection. Only assets in that collection are exported; if you need more custom behavior, you'll need to - use the [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py). + use the + [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py). More information about SavedModel and signatures can be found here: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md. 
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py index 12f120116f4..1f3f2287043 100644 --- a/tensorflow/python/summary/writer/writer.py +++ b/tensorflow/python/summary/writer/writer.py @@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import plugin_asset from tensorflow.python.summary.writer.event_file_writer import EventFileWriter +from tensorflow.python.util.tf_export import tf_export _PLUGINS_DIR = "plugins" @@ -276,6 +277,7 @@ class SummaryToEventTransformer(object): self.event_writer.add_event(event) +@tf_export("summary.FileWriter") class FileWriter(SummaryToEventTransformer): """Writes `Summary` protocol buffers to event files. diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py index bad289303c0..645fa28a37f 100644 --- a/tensorflow/python/summary/writer/writer_cache.py +++ b/tensorflow/python/summary/writer/writer_cache.py @@ -22,8 +22,10 @@ import threading from tensorflow.python.framework import ops from tensorflow.python.summary.writer.writer import FileWriter +from tensorflow.python.util.tf_export import tf_export +@tf_export('summary.FileWriterCache') class FileWriterCache(object): """Cache for file writers. diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py index a2e86a1c43a..fd78f44c999 100644 --- a/tensorflow/python/tools/freeze_graph.py +++ b/tensorflow/python/tools/freeze_graph.py @@ -251,7 +251,7 @@ def main(unused_args): FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes, FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist, FLAGS.input_meta_graph, FLAGS.input_saved_model_dir, - FLAGS.saved_model_tags, checkpoint_version=checkpoint_version) + FLAGS.saved_model_tags, FLAGS.checkpoint_version) if __name__ == "__main__": diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py index c2687bf557b..9c192712225 100644 --- a/tensorflow/python/tools/optimize_for_inference_lib.py +++ b/tensorflow/python/tools/optimize_for_inference_lib.py @@ -349,6 +349,7 @@ def fold_batch_norms(input_graph_def): bias_add_op.op = "BiasAdd" bias_add_op.name = node.name bias_add_op.attr["T"].CopyFrom(conv_op.attr["T"]) + bias_add_op.attr["data_format"].CopyFrom(conv_op.attr["data_format"]) bias_add_op.input.extend([new_conv_op.name, offset_op.name]) new_ops.extend([scaled_weights_op, new_conv_op, offset_op, bias_add_op]) diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py index 6dd24c0dca1..2ef612473b4 100644 --- a/tensorflow/python/tools/optimize_for_inference_test.py +++ b/tensorflow/python/tools/optimize_for_inference_test.py @@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import importer from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import image_ops @@ -38,6 +39,7 @@ from tensorflow.python.platform import test from tensorflow.python.tools import optimize_for_inference_lib +@test_util.with_c_api class OptimizeForInferenceTest(test.TestCase): def create_node_def(self, op, name, inputs): @@ -145,7 +147,7 @@ class 
OptimizeForInferenceTest(test.TestCase): np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32) gamma_op = constant_op.constant( np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32) - ops.get_default_graph().graph_def_versions.producer = 8 + test_util.set_producer_version(ops.get_default_graph(), 8) gen_nn_ops._batch_norm_with_global_normalization( conv_op, mean_op, @@ -171,48 +173,53 @@ class OptimizeForInferenceTest(test.TestCase): self.assertNotEqual("BatchNormWithGlobalNormalization", node.op) def testFoldFusedBatchNorms(self): - with self.test_session() as sess: - inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6] - input_op = constant_op.constant( - np.array(inputs), shape=[1, 1, 6, 2], dtype=dtypes.float32) - weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4] - weights_op = constant_op.constant( - np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32) - conv_op = nn_ops.conv2d( - input_op, weights_op, [1, 1, 1, 1], padding="SAME", name="conv_op") - mean_op = constant_op.constant( - np.array([10, 20]), shape=[2], dtype=dtypes.float32) - variance_op = constant_op.constant( - np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32) - beta_op = constant_op.constant( - np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32) - gamma_op = constant_op.constant( - np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32) - ops.get_default_graph().graph_def_versions.producer = 9 - gen_nn_ops._fused_batch_norm( - conv_op, - gamma_op, - beta_op, - mean_op, - variance_op, - 0.00001, - is_training=False, - name="output") - original_graph_def = sess.graph_def - original_result = sess.run(["output:0"]) - optimized_graph_def = optimize_for_inference_lib.fold_batch_norms( - original_graph_def) + for data_format, use_gpu in [("NHWC", False), ("NCHW", True)]: + with self.test_session(use_gpu=use_gpu) as sess: + inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6] + input_op = constant_op.constant( + np.array(inputs), + shape=[1, 1, 6, 2] if data_format == "NHWC" else [1, 2, 1, 6], + dtype=dtypes.float32) + weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4] + weights_op = constant_op.constant( + np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32) + conv_op = nn_ops.conv2d( + input_op, weights_op, [1, 1, 1, 1], padding="SAME", + data_format=data_format, name="conv_op") + mean_op = constant_op.constant( + np.array([10, 20]), shape=[2], dtype=dtypes.float32) + variance_op = constant_op.constant( + np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32) + beta_op = constant_op.constant( + np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32) + gamma_op = constant_op.constant( + np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32) + ops.get_default_graph().graph_def_versions.producer = 9 + gen_nn_ops._fused_batch_norm( + conv_op, + gamma_op, + beta_op, + mean_op, + variance_op, + 0.00001, + is_training=False, + data_format=data_format, + name="output") + original_graph_def = sess.graph_def + original_result = sess.run(["output:0"]) + optimized_graph_def = optimize_for_inference_lib.fold_batch_norms( + original_graph_def) - with self.test_session() as sess: - _ = importer.import_graph_def( - optimized_graph_def, input_map={}, name="optimized") - optimized_result = sess.run(["optimized/output:0"]) + with self.test_session(use_gpu=use_gpu) as sess: + _ = importer.import_graph_def( + optimized_graph_def, input_map={}, name="optimized") + optimized_result = sess.run(["optimized/output:0"]) - self.assertAllClose( - original_result, optimized_result, rtol=1e-04, atol=1e-06) + self.assertAllClose( + original_result, 
optimized_result, rtol=1e-04, atol=1e-06) - for node in optimized_graph_def.node: - self.assertNotEqual("FusedBatchNorm", node.op) + for node in optimized_graph_def.node: + self.assertNotEqual("FusedBatchNorm", node.op) def testFuseResizePadAndConv(self): with self.test_session() as sess: diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 21e8e803fcb..5b0a584c10e 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -31,6 +31,7 @@ import warnings import numpy as np +from six import integer_types from tensorflow.contrib.saved_model.python.saved_model import reader from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils from tensorflow.core.example import example_pb2 @@ -440,7 +441,7 @@ def _create_example_string(example_dict): elif isinstance(feature_list[0], str): example.features.feature[feature_name].bytes_list.value.extend( feature_list) - elif isinstance(feature_list[0], (int, long)): + elif isinstance(feature_list[0], integer_types): example.features.feature[feature_name].int64_list.value.extend( feature_list) else: diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py index 13c07cfd7bf..c08e3cca007 100644 --- a/tensorflow/python/training/adadelta.py +++ b/tensorflow/python/training/adadelta.py @@ -22,8 +22,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.AdadeltaOptimizer") class AdadeltaOptimizer(optimizer.Optimizer): """Optimizer that implements the Adadelta algorithm. diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py index afa192f7cc6..deb4e6f5463 100644 --- a/tensorflow/python/training/adagrad.py +++ b/tensorflow/python/training/adagrad.py @@ -25,8 +25,10 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.AdagradOptimizer") class AdagradOptimizer(optimizer.Optimizer): """Optimizer that implements the Adagrad algorithm. diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py index b3f9ea323c2..5ba403554f5 100644 --- a/tensorflow/python/training/adagrad_da.py +++ b/tensorflow/python/training/adagrad_da.py @@ -23,8 +23,10 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.AdagradDAOptimizer") class AdagradDAOptimizer(optimizer.Optimizer): """Adagrad Dual Averaging algorithm for sparse linear models. 
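Note on the `fold_batch_norms` test above: with a frozen batch norm, the normalization collapses into a per-channel scale and offset that can be folded into the preceding convolution. A minimal NumPy sketch of that algebra (the function name and the HWIO weight layout are illustrative, not the optimizer's actual code):

```python
import numpy as np

def fold_batch_norm(weights, mean, variance, beta, gamma, epsilon=1e-5):
    """Fold a frozen batch norm into conv weights laid out as [H, W, in, out]."""
    scale = gamma / np.sqrt(variance + epsilon)   # one factor per output channel
    folded_weights = weights * scale              # broadcasts over the last axis
    folded_bias = beta - mean * scale
    return folded_weights, folded_bias

# conv(x, W) followed by the frozen batch norm equals conv(x, folded_W) + folded_bias.
```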
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index 0c69f8bf399..c92f6fc3015 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -26,8 +26,10 @@ from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import state_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.AdamOptimizer") class AdamOptimizer(optimizer.Optimizer): """Optimizer that implements the Adam algorithm. diff --git a/tensorflow/python/training/basic_loops.py b/tensorflow/python/training/basic_loops.py index 52b0f421061..7af821c8192 100644 --- a/tensorflow/python/training/basic_loops.py +++ b/tensorflow/python/training/basic_loops.py @@ -18,8 +18,10 @@ from __future__ import division from __future__ import print_function from tensorflow.python.framework import errors +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.basic_train_loop") def basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None, master=""): """Basic loop to train a model. diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py index 752d585cd17..aae757b99aa 100644 --- a/tensorflow/python/training/basic_session_run_hooks.py +++ b/tensorflow/python/training/basic_session_run_hooks.py @@ -47,6 +47,7 @@ from tensorflow.python.training import session_run_hook from tensorflow.python.training import training_util from tensorflow.python.training.session_run_hook import SessionRunArgs from tensorflow.python.training.summary_io import SummaryWriterCache +from tensorflow.python.util.tf_export import tf_export class _HookTimer(object): @@ -85,6 +86,7 @@ class _HookTimer(object): raise NotImplementedError +@tf_export("train.SecondOrStepTimer") class SecondOrStepTimer(_HookTimer): """Timer that triggers at most once every N seconds or once every N steps. """ @@ -164,6 +166,7 @@ class NeverTriggerTimer(_HookTimer): return None +@tf_export("train.LoggingTensorHook") class LoggingTensorHook(session_run_hook.SessionRunHook): """Prints the given tensors every N local steps, every N seconds, or at end. @@ -262,6 +265,7 @@ class LoggingTensorHook(session_run_hook.SessionRunHook): self._log_tensors(values) +@tf_export("train.StopAtStepHook") class StopAtStepHook(session_run_hook.SessionRunHook): """Hook that requests stop at a specified step.""" @@ -317,6 +321,7 @@ class StopAtStepHook(session_run_hook.SessionRunHook): run_context.request_stop() +@tf_export("train.CheckpointSaverListener") class CheckpointSaverListener(object): """Interface for listeners that take action before or after checkpoint save. @@ -331,7 +336,7 @@ class CheckpointSaverListener(object): `CheckpointSaverHook`, as in this example: ```python - class ExampleCheckpointSaverListerner(CheckpointSaverListener): + class ExampleCheckpointSaverListener(CheckpointSaverListener): def begin(self): # You can add ops to the graph here. print('Starting the session.') @@ -347,7 +352,7 @@ class CheckpointSaverListener(object): print('Done with the session.') ... 
- listener = ExampleCheckpointSaverListerner() + listener = ExampleCheckpointSaverListener() saver_hook = tf.train.CheckpointSaverHook( checkpoint_dir, listeners=[listener]) with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]): @@ -375,6 +380,7 @@ class CheckpointSaverListener(object): pass +@tf_export("train.CheckpointSaverHook") class CheckpointSaverHook(session_run_hook.SessionRunHook): """Saves checkpoints every N steps or seconds.""" @@ -497,6 +503,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook): return savers[0] +@tf_export("train.StepCounterHook") class StepCounterHook(session_run_hook.SessionRunHook): """Hook that counts steps per second.""" @@ -575,12 +582,14 @@ class StepCounterHook(session_run_hook.SessionRunHook): self._last_global_step = stale_global_step +@tf_export("train.NanLossDuringTrainingError") class NanLossDuringTrainingError(RuntimeError): def __str__(self): return "NaN loss during training." +@tf_export("train.NanTensorHook") class NanTensorHook(session_run_hook.SessionRunHook): """Monitors the loss tensor and stops training if loss is NaN. @@ -612,6 +621,7 @@ class NanTensorHook(session_run_hook.SessionRunHook): run_context.request_stop() +@tf_export("train.SummarySaverHook") class SummarySaverHook(session_run_hook.SessionRunHook): """Saves summaries every N steps.""" @@ -720,6 +730,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook): return summary_op +@tf_export("train.GlobalStepWaiterHook") class GlobalStepWaiterHook(session_run_hook.SessionRunHook): """Delays execution until global step reaches `wait_until_step`. @@ -767,6 +778,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook): time.sleep(0.5) +@tf_export("train.FinalOpsHook") class FinalOpsHook(session_run_hook.SessionRunHook): """A hook which evaluates `Tensors` at the end of a session.""" @@ -793,6 +805,7 @@ class FinalOpsHook(session_run_hook.SessionRunHook): feed_dict=self._final_ops_feed_dict) +@tf_export("train.FeedFnHook") class FeedFnHook(session_run_hook.SessionRunHook): """Runs `feed_fn` and sets the `feed_dict` accordingly.""" @@ -810,6 +823,7 @@ class FeedFnHook(session_run_hook.SessionRunHook): fetches=None, feed_dict=self.feed_fn()) +@tf_export("train.ProfilerHook") class ProfilerHook(session_run_hook.SessionRunHook): """Captures CPU/GPU profiling information every N steps or seconds. diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py index b5d3e787971..fa3de6fad27 100644 --- a/tensorflow/python/training/checkpoint_utils.py +++ b/tensorflow/python/training/checkpoint_utils.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import saver +from tensorflow.python.util.tf_export import tf_export __all__ = [ @@ -36,6 +37,7 @@ __all__ = [ ] +@tf_export("train.load_checkpoint") def load_checkpoint(ckpt_dir_or_file): """Returns `CheckpointReader` for checkpoint found in `ckpt_dir_or_file`. @@ -60,6 +62,7 @@ def load_checkpoint(ckpt_dir_or_file): return pywrap_tensorflow.NewCheckpointReader(filename) +@tf_export("train.load_variable") def load_variable(ckpt_dir_or_file, name): """Returns the tensor value of the given variable in the checkpoint. 
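Most hunks in this part of the change do one thing: annotate a symbol with `@tf_export(...)` so it is registered under a public `tf.train.*` (or `tf.compat.*`) name. A toy registry illustrating the decorator pattern, not TensorFlow's actual `tf_export` implementation:

```python
# Hypothetical registry sketch; TensorFlow's real tf_export does more bookkeeping.
_API_SYMBOLS = {}

def api_export(*names):
    """Record the decorated symbol under one or more public API names."""
    def decorator(symbol):
        for name in names:
            _API_SYMBOLS[name] = symbol
        return symbol  # the symbol itself is left untouched
    return decorator

@api_export("train.ExampleOptimizer")
class ExampleOptimizer(object):
    pass

assert _API_SYMBOLS["train.ExampleOptimizer"] is ExampleOptimizer
```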
@@ -77,6 +80,7 @@ def load_variable(ckpt_dir_or_file, name): return reader.get_tensor(name) +@tf_export("train.list_variables") def list_variables(ckpt_dir_or_file): """Returns list of all variables in the checkpoint. @@ -95,6 +99,7 @@ def list_variables(ckpt_dir_or_file): return result +@tf_export("train.init_from_checkpoint") def init_from_checkpoint(ckpt_dir_or_file, assignment_map): """Initializes current variables with tensors loaded from given checkpoint. @@ -242,6 +247,9 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map): full_tensor_name = full_tensor_name[1:] if tensor_name_in_ckpt != "/": full_tensor_name = tensor_name_in_ckpt + full_tensor_name + # Remove trailing '/', if any, in the full_tensor_name + if full_tensor_name.endswith("/"): + full_tensor_name = full_tensor_name[:-1] if full_tensor_name not in variable_map: raise ValueError( "Tensor %s (%s in %s) is not found in %s checkpoint" % ( diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py index 0e31255b74f..0ff97d85e37 100644 --- a/tensorflow/python/training/coordinator.py +++ b/tensorflow/python/training/coordinator.py @@ -27,8 +27,10 @@ import six from tensorflow.python.framework import errors from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.Coordinator") class Coordinator(object): """A coordinator for threads. @@ -406,6 +408,7 @@ class Coordinator(object): # Threads for the standard services. +@tf_export("train.LooperThread") class LooperThread(threading.Thread): """A thread that runs code repeatedly, optionally on a timer. diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py index 37ab625779f..689088bb41e 100644 --- a/tensorflow/python/training/device_setter.py +++ b/tensorflow/python/training/device_setter.py @@ -23,6 +23,7 @@ from tensorflow.core.framework import node_def_pb2 from tensorflow.python.framework import device as pydev from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import server_lib +from tensorflow.python.util.tf_export import tf_export class _RoundRobinStrategy(object): @@ -121,6 +122,7 @@ class _ReplicaDeviceChooser(object): return worker_device.to_string() +@tf_export("train.replica_device_setter") def replica_device_setter(ps_tasks=0, ps_device="/job:ps", worker_device="/job:worker", merge_devices=True, cluster=None, ps_ops=None, ps_strategy=None): diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py index c64a1b3f799..9d02e694db1 100644 --- a/tensorflow/python/training/ftrl.py +++ b/tensorflow/python/training/ftrl.py @@ -22,8 +22,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.FtrlOptimizer") class FtrlOptimizer(optimizer.Optimizer): """Optimizer that implements the FTRL algorithm. 
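The `checkpoint_utils` functions exported above (`load_checkpoint`, `load_variable`, `list_variables`) are the usual way to inspect an existing checkpoint. A small usage sketch, assuming a checkpoint already exists at the illustrative path `/tmp/demo_ckpt`:

```python
import tensorflow as tf

ckpt_dir = "/tmp/demo_ckpt"  # illustrative; any directory containing a checkpoint

# List every variable stored in the checkpoint, then read one back as a numpy array.
for name, shape in tf.train.list_variables(ckpt_dir):
    print(name, shape)

reader = tf.train.load_checkpoint(ckpt_dir)              # CheckpointReader
value = tf.train.load_variable(ckpt_dir, "global_step")  # variable name is illustrative
```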
@@ -265,4 +267,3 @@ class FtrlOptimizer(optimizer.Optimizer): grad.dtype), math_ops.cast(self._learning_rate_power_tensor, grad.dtype), use_locking=self._use_locking) - diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py index 5a536e27297..380e14e0249 100644 --- a/tensorflow/python/training/gradient_descent.py +++ b/tensorflow/python/training/gradient_descent.py @@ -23,8 +23,10 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.GradientDescentOptimizer") class GradientDescentOptimizer(optimizer.Optimizer): """Optimizer that implements the gradient descent algorithm. """ diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py index 331a51e8bc8..bd9985a7c5c 100644 --- a/tensorflow/python/training/input.py +++ b/tensorflow/python/training/input.py @@ -44,6 +44,7 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.summary import summary from tensorflow.python.training import queue_runner +from tensorflow.python.util.tf_export import tf_export # pylint: disable=protected-access @@ -53,9 +54,12 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map # pylint: enable=protected-access +@tf_export("train.match_filenames_once") def match_filenames_once(pattern, name=None): """Save the list of files matching pattern, so it is only computed once. + NOTE: The order of the files returned can be non-deterministic. + Args: pattern: A file pattern (glob), or 1D tensor of file patterns. name: A name for the operations (optional). @@ -70,6 +74,7 @@ def match_filenames_once(pattern, name=None): collections=[ops.GraphKeys.LOCAL_VARIABLES]) +@tf_export("train.limit_epochs") def limit_epochs(tensor, num_epochs=None, name=None): """Returns tensor `num_epochs` times and then raises an `OutOfRange` error. @@ -102,6 +107,7 @@ def limit_epochs(tensor, num_epochs=None, name=None): return array_ops.identity(tensor, name=name) +@tf_export("train.input_producer") def input_producer(input_tensor, element_shape=None, num_epochs=None, @@ -184,6 +190,7 @@ def input_producer(input_tensor, return q +@tf_export("train.string_input_producer") def string_input_producer(string_tensor, num_epochs=None, shuffle=True, @@ -253,6 +260,7 @@ def string_input_producer(string_tensor, cancel_op=cancel_op) +@tf_export("train.range_input_producer") def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None, capacity=32, shared_name=None, name=None): """Produces the integers from 0 to limit-1 in a queue. @@ -290,6 +298,7 @@ def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None, shared_name, "fraction_of_%d_full" % capacity, name) +@tf_export("train.slice_input_producer") def slice_input_producer(tensor_list, num_epochs=None, shuffle=True, seed=None, capacity=32, shared_name=None, name=None): """Produces a slice of each `Tensor` in `tensor_list`. 
@@ -885,6 +894,7 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity, # Batching functions ---------------------------------------------------------- +@tf_export("train.batch") def batch(tensors, batch_size, num_threads=1, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -979,6 +989,7 @@ def batch(tensors, batch_size, num_threads=1, capacity=32, name=name) +@tf_export("train.maybe_batch") def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -1031,6 +1042,7 @@ def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32, name=name) +@tf_export("train.batch_join") def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -1136,6 +1148,7 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False, name=name) +@tf_export("train.maybe_batch_join") def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32, enqueue_many=False, shapes=None, dynamic_pad=False, allow_smaller_final_batch=False, shared_name=None, @@ -1188,6 +1201,7 @@ def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32, name=name) +@tf_export("train.shuffle_batch") def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, num_threads=1, seed=None, enqueue_many=False, shapes=None, allow_smaller_final_batch=False, shared_name=None, name=None): @@ -1287,6 +1301,7 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, name=name) +@tf_export("train.maybe_shuffle_batch") def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, keep_input, num_threads=1, seed=None, enqueue_many=False, shapes=None, @@ -1346,6 +1361,7 @@ def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, name=name) +@tf_export("train.shuffle_batch_join") def shuffle_batch_join(tensors_list, batch_size, capacity, min_after_dequeue, seed=None, enqueue_many=False, shapes=None, allow_smaller_final_batch=False, @@ -1439,6 +1455,7 @@ def shuffle_batch_join(tensors_list, batch_size, capacity, name=name) +@tf_export("train.maybe_shuffle_batch_join") def maybe_shuffle_batch_join(tensors_list, batch_size, capacity, min_after_dequeue, keep_input, seed=None, enqueue_many=False, shapes=None, diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py index 3ee49650e01..10ab4c1137f 100644 --- a/tensorflow/python/training/learning_rate_decay.py +++ b/tensorflow/python/training/learning_rate_decay.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
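The `input.py` hunks above export the classic queue-based input pipeline (`string_input_producer`, `batch`, `shuffle_batch`, ...). A minimal graph-construction sketch of how these pieces are usually combined (file names are illustrative; this is the TF 1.x queue-runner style, not `tf.data`):

```python
import tensorflow as tf

filenames = ["/tmp/data-00000.tfrecords", "/tmp/data-00001.tfrecords"]  # illustrative
filename_queue = tf.train.string_input_producer(filenames, shuffle=True)

reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)

# Shuffle and group records into batches of 32.
example_batch = tf.train.shuffle_batch(
    [serialized_example], batch_size=32, capacity=1000, min_after_dequeue=100)

# At run time one would create a tf.train.Coordinator(), call
# tf.train.start_queue_runners(sess, coord), and sess.run(example_batch) in a loop.
```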
# ============================================================================== - """Various learning rate decay functions.""" from __future__ import absolute_import from __future__ import division @@ -26,10 +25,16 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.util.tf_export import tf_export -def exponential_decay(learning_rate, global_step, decay_steps, decay_rate, - staircase=False, name=None): +@tf_export("train.exponential_decay") +def exponential_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): """Applies exponential decay to the learning rate. When training a model, it is often recommended to lower the learning rate as @@ -85,9 +90,9 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate, """ if global_step is None: raise ValueError("global_step is required for exponential_decay.") - with ops.name_scope(name, "ExponentialDecay", - [learning_rate, global_step, - decay_steps, decay_rate]) as name: + with ops.name_scope( + name, "ExponentialDecay", + [learning_rate, global_step, decay_steps, decay_rate]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype global_step = math_ops.cast(global_step, dtype) @@ -96,10 +101,11 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate, p = global_step / decay_steps if staircase: p = math_ops.floor(p) - return math_ops.multiply(learning_rate, math_ops.pow(decay_rate, p), - name=name) + return math_ops.multiply( + learning_rate, math_ops.pow(decay_rate, p), name=name) +@tf_export("train.piecewise_constant") def piecewise_constant(x, boundaries, values, name=None): """Piecewise constant from boundaries and interval values. @@ -156,15 +162,15 @@ def piecewise_constant(x, boundaries, values, name=None): boundaries[i] = b else: raise ValueError( - "Boundaries (%s) must have the same dtype as x (%s)." % ( - b.dtype.base_dtype, x.dtype.base_dtype)) + "Boundaries (%s) must have the same dtype as x (%s)." % + (b.dtype.base_dtype, x.dtype.base_dtype)) # TODO(rdipietro): Ensure that boundaries' elements are strictly increasing. values = ops.convert_n_to_tensor(values) for v in values[1:]: if v.dtype.base_dtype != values[0].dtype.base_dtype: raise ValueError( - "Values must have elements all with the same dtype (%s vs %s)." % ( - values[0].dtype.base_dtype, v.dtype.base_dtype)) + "Values must have elements all with the same dtype (%s vs %s)." % + (values[0].dtype.base_dtype, v.dtype.base_dtype)) pred_fn_pairs = [] pred_fn_pairs.append((x <= boundaries[0], lambda: values[0])) pred_fn_pairs.append((x > boundaries[-1], lambda: values[-1])) @@ -179,9 +185,14 @@ def piecewise_constant(x, boundaries, values, name=None): return control_flow_ops.case(pred_fn_pairs, default, exclusive=True) -def polynomial_decay(learning_rate, global_step, decay_steps, - end_learning_rate=0.0001, power=1.0, - cycle=False, name=None): +@tf_export("train.polynomial_decay") +def polynomial_decay(learning_rate, + global_step, + decay_steps, + end_learning_rate=0.0001, + power=1.0, + cycle=False, + name=None): """Applies a polynomial decay to the learning rate. 
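For reference, the `exponential_decay` hunk above reduces to a one-liner; a plain-Python sketch of the same arithmetic (`staircase` simply floors the exponent):

```python
import math

def exponential_decay(lr, global_step, decay_steps, decay_rate, staircase=False):
    p = global_step / float(decay_steps)
    if staircase:
        p = math.floor(p)
    return lr * decay_rate ** p

# After exactly one decay period the rate is multiplied by decay_rate once.
assert abs(exponential_decay(0.1, 100, 100, 0.96) - 0.096) < 1e-12
```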
It is commonly observed that a monotonically decreasing learning rate, whose @@ -255,9 +266,10 @@ def polynomial_decay(learning_rate, global_step, decay_steps, """ if global_step is None: raise ValueError("global_step is required for polynomial_decay.") - with ops.name_scope(name, "PolynomialDecay", - [learning_rate, global_step, - decay_steps, end_learning_rate, power]) as name: + with ops.name_scope( + name, "PolynomialDecay", + [learning_rate, global_step, decay_steps, end_learning_rate, power + ]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype global_step = math_ops.cast(global_step, dtype) @@ -267,23 +279,29 @@ def polynomial_decay(learning_rate, global_step, decay_steps, if cycle: # Find the first multiple of decay_steps that is bigger than global_step. # If global_step is zero set the multiplier to 1 - multiplier = control_flow_ops.cond(math_ops.equal(global_step, 0), - lambda: 1.0, - lambda: math_ops.ceil( - global_step / decay_steps)) + multiplier = control_flow_ops.cond( + math_ops.equal(global_step, 0), lambda: 1.0, + lambda: math_ops.ceil(global_step / decay_steps)) decay_steps = math_ops.multiply(decay_steps, multiplier) else: # Make sure that the global_step used is not bigger than decay_steps. global_step = math_ops.minimum(global_step, decay_steps) p = math_ops.div(global_step, decay_steps) - return math_ops.add(math_ops.multiply(learning_rate - end_learning_rate, - math_ops.pow(1 - p, power)), - end_learning_rate, name=name) + return math_ops.add( + math_ops.multiply(learning_rate - end_learning_rate, + math_ops.pow(1 - p, power)), + end_learning_rate, + name=name) -def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate, - staircase=False, name=None): +@tf_export("train.natural_exp_decay") +def natural_exp_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): """Applies natural exponential decay to the initial learning rate. When training a model, it is often recommended to lower the learning rate as @@ -349,8 +367,13 @@ def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate, return math_ops.multiply(learning_rate, exponent, name=name) -def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, - staircase=False, name=None): +@tf_export("train.inverse_time_decay") +def inverse_time_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False, + name=None): """Applies inverse time decay to the initial learning rate. When training a model, it is often recommended to lower the learning rate as @@ -362,13 +385,15 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, The function returns the decayed learning rate. 
It is computed as: ```python - decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) + decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / + decay_step) ``` or, if `staircase` is `True`, as: ```python - decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) + decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / + decay_step)) ``` Example: decay 1/t with a rate of 0.5: @@ -379,7 +404,8 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, learning_rate = 0.1 decay_steps = 1.0 decay_rate = 0.5 - learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate) + learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, + decay_steps, decay_rate) # Passing global_step to minimize() will increment it at each step. learning_step = ( @@ -424,8 +450,8 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, return math_ops.div(learning_rate, denom, name=name) -def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, - name=None): +@tf_export("train.cosine_decay") +def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): """Applies cosine decay to the learning rate. See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent @@ -484,8 +510,14 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, return math_ops.multiply(learning_rate, decayed) -def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, - t_mul=2.0, m_mul=1.0, alpha=0.0, name=None): +@tf_export("train.cosine_decay_restarts") +def cosine_decay_restarts(learning_rate, + global_step, + first_decay_steps, + t_mul=2.0, + m_mul=1.0, + alpha=0.0, + name=None): """Applies cosine decay with restarts to the learning rate. 
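The reflowed `inverse_time_decay` docstring above spells out the formula; in plain Python it is simply:

```python
import math

def inverse_time_decay(lr, global_step, decay_steps, decay_rate, staircase=False):
    p = global_step / float(decay_steps)
    if staircase:
        p = math.floor(p)
    return lr / (1.0 + decay_rate * p)

# Matches the docstring example: lr=0.1, decay_steps=1.0, decay_rate=0.5.
assert inverse_time_decay(0.1, 4, 1.0, 0.5) == 0.1 / 3.0
```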
See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent @@ -532,10 +564,9 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, """ if global_step is None: raise ValueError("cosine decay restarts requires global_step") - with ops.name_scope(name, "SGDRDecay", - [learning_rate, global_step]) as name: - learning_rate = ops.convert_to_tensor(learning_rate, - name="initial_learning_rate") + with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name: + learning_rate = ops.convert_to_tensor( + learning_rate, name="initial_learning_rate") dtype = learning_rate.dtype global_step = math_ops.cast(global_step, dtype) first_decay_steps = math_ops.cast(first_decay_steps, dtype) @@ -547,11 +578,12 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, def compute_step(completed_fraction, geometric=False): if geometric: - i_restart = math_ops.floor(math_ops.log(1.0 - completed_fraction * ( - 1.0 - t_mul)) / math_ops.log(t_mul)) + i_restart = math_ops.floor( + math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / + math_ops.log(t_mul)) - sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul) - completed_fraction = (completed_fraction - sum_r) / t_mul ** i_restart + sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) + completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart else: i_restart = math_ops.floor(completed_fraction) @@ -564,16 +596,21 @@ def cosine_decay_restarts(learning_rate, global_step, first_decay_steps, lambda: compute_step(completed_fraction, geometric=False), lambda: compute_step(completed_fraction, geometric=True)) - m_fac = m_mul ** i_restart - cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( - constant_op.constant(math.pi) * completed_fraction)) + m_fac = m_mul**i_restart + cosine_decayed = 0.5 * m_fac * ( + 1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(learning_rate, decayed, name=name) -def linear_cosine_decay(learning_rate, global_step, decay_steps, - num_periods=0.5, alpha=0.0, beta=0.001, +@tf_export("train.linear_cosine_decay") +def linear_cosine_decay(learning_rate, + global_step, + decay_steps, + num_periods=0.5, + alpha=0.0, + beta=0.001, name=None): """Applies linear cosine decay to the learning rate. @@ -651,9 +688,15 @@ def linear_cosine_decay(learning_rate, global_step, decay_steps, return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name) -def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps, - initial_variance=1.0, variance_decay=0.55, - num_periods=0.5, alpha=0.0, beta=0.001, +@tf_export("train.noisy_linear_cosine_decay") +def noisy_linear_cosine_decay(learning_rate, + global_step, + decay_steps, + initial_variance=1.0, + variance_decay=0.55, + num_periods=0.5, + alpha=0.0, + beta=0.001, name=None): """Applies noisy linear cosine decay to the learning rate. 
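The `compute_step` helper in the `cosine_decay_restarts` hunk above decides which restart period a step falls into. When `t_mul != 1` the period lengths form a geometric series, which is why the restart index comes from a logarithm. A small plain-Python sketch of that bookkeeping with a worked check:

```python
import math

def restart_position(completed_fraction, t_mul):
    """Restart index and fraction completed within that restart (t_mul != 1)."""
    i_restart = math.floor(
        math.log(1.0 - completed_fraction * (1.0 - t_mul)) / math.log(t_mul))
    sum_r = (1.0 - t_mul ** i_restart) / (1.0 - t_mul)          # periods already finished
    within = (completed_fraction - sum_r) / t_mul ** i_restart  # progress in current period
    return i_restart, within

# With t_mul=2 the periods span fractions [0,1), [1,3), [3,7), ...; a completed
# fraction of 1.5 is therefore a quarter of the way through the second period.
assert restart_position(1.5, 2.0) == (1, 0.25)
```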
@@ -734,8 +777,8 @@ def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps, math_ops.pow(1.0 + global_step, variance_decay)) std = math_ops.sqrt(variance) noisy_linear_decayed = ( - linear_decayed + random_ops.random_normal( - linear_decayed.shape, stddev=std)) + linear_decayed + + random_ops.random_normal(linear_decayed.shape, stddev=std)) completed_fraction = global_step / decay_steps fraction = 2.0 * num_periods * completed_fraction diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py index cf9530d87c4..bd9fa79d8fe 100644 --- a/tensorflow/python/training/momentum.py +++ b/tensorflow/python/training/momentum.py @@ -22,8 +22,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.MomentumOptimizer") class MomentumOptimizer(optimizer.Optimizer): """Optimizer that implements the Momentum algorithm. diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index fa3517db27b..6c5c9e01a76 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -41,6 +41,7 @@ from tensorflow.python.training import queue_runner from tensorflow.python.training import saver as training_saver from tensorflow.python.training import session_manager as sm from tensorflow.python.training import session_run_hook +from tensorflow.python.util.tf_export import tf_export # The list of exceptions that we should recover from. Exceptions not in this @@ -52,6 +53,7 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError) USE_DEFAULT = object() +@tf_export('train.Scaffold') class Scaffold(object): """Structure to create or gather pieces commonly needed to train a model. @@ -272,6 +274,7 @@ class Scaffold(object): resources.initialize_resources(resources.local_resources())) +@tf_export('train.MonitoredTrainingSession') def MonitoredTrainingSession(master='', # pylint: disable=invalid-name is_chief=True, checkpoint_dir=None, @@ -381,6 +384,7 @@ def MonitoredTrainingSession(master='', # pylint: disable=invalid-name stop_grace_period_secs=stop_grace_period_secs) +@tf_export('train.SessionCreator') class SessionCreator(object): """A factory for tf.Session.""" @@ -390,6 +394,7 @@ class SessionCreator(object): 'create_session is not implemented for {}.'.format(self)) +@tf_export('train.ChiefSessionCreator') class ChiefSessionCreator(SessionCreator): """Creates a tf.Session for a chief.""" @@ -441,6 +446,7 @@ class ChiefSessionCreator(SessionCreator): init_fn=self._scaffold.init_fn) +@tf_export('train.WorkerSessionCreator') class WorkerSessionCreator(SessionCreator): """Creates a tf.Session for a worker.""" @@ -706,6 +712,7 @@ class _MonitoredSession(object): return self._coordinated_creator.tf_sess +@tf_export('train.MonitoredSession') class MonitoredSession(_MonitoredSession): """Session-like object that handles initialization, recovery and hooks. @@ -788,6 +795,7 @@ class MonitoredSession(_MonitoredSession): stop_grace_period_secs=stop_grace_period_secs) +@tf_export('train.SingularMonitoredSession') class SingularMonitoredSession(_MonitoredSession): """Session-like object that handles initialization, restoring, and hooks. 
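`MonitoredTrainingSession` and the hooks exported earlier in this change are typically combined as below; a minimal TF 1.x-style sketch with a toy loss and no checkpoint directory (all names are illustrative):

```python
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
weights = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(weights - 3.0))
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
    loss, global_step=global_step)

hooks = [tf.train.StopAtStepHook(last_step=100),   # stop after 100 global steps
         tf.train.NanTensorHook(loss)]             # abort if the loss becomes NaN

with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    while not sess.should_stop():
        sess.run(train_op)
```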
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py index 43ed1ac170d..2d89082ad75 100644 --- a/tensorflow/python/training/moving_averages.py +++ b/tensorflow/python/training/moving_averages.py @@ -26,6 +26,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.training import slot_creator +from tensorflow.python.util.tf_export import tf_export # TODO(touts): switch to variables.Variable. @@ -230,6 +231,7 @@ def _zero_debias(unbiased_var, value, decay): return unbiased_ema_delta +@tf_export("train.ExponentialMovingAverage") class ExponentialMovingAverage(object): """Maintains moving averages of variables by employing an exponential decay. diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 719b83e5ca5..9ec588bac96 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -36,6 +36,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.training import slot_creator from tensorflow.python.util import nest +from tensorflow.python.util.tf_export import tf_export def _get_variable_for(v): @@ -187,6 +188,7 @@ def _get_processor(v): raise NotImplementedError("Trying to optimize unsupported type ", v) +@tf_export("train.Optimizer") class Optimizer(object): """Base class for optimizers. @@ -533,7 +535,15 @@ class Optimizer(object): else: with ops.control_dependencies([self._finish(update_ops, "update")]): with ops.colocate_with(global_step): - apply_updates = state_ops.assign_add(global_step, 1, name=name) + if isinstance(global_step, resource_variable_ops.ResourceVariable): + # TODO(apassos): the implicit read in assign_add is slow; consider + # making it less so. + apply_updates = resource_variable_ops.assign_add_variable_op( + global_step.handle, + ops.convert_to_tensor(1, dtype=global_step.dtype), + name=name) + else: + apply_updates = state_ops.assign_add(global_step, 1, name=name) if context.in_graph_mode(): if isinstance(apply_updates, ops.Tensor): @@ -592,7 +602,7 @@ class Optimizer(object): if executing_eagerly: # No variable.op in eager mode. We don't expect lots of eager graphs, # but behavior should be consistent with graph mode. - return variable._container_prefix == current_graph._container_prefix # pylint: disable=protected-access + return variable._graph_key == current_graph._graph_key # pylint: disable=protected-access else: return variable.op.graph is current_graph diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py index da31ab325d5..9bd677b8efc 100644 --- a/tensorflow/python/training/proximal_adagrad.py +++ b/tensorflow/python/training/proximal_adagrad.py @@ -23,8 +23,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.ProximalAdagradOptimizer") class ProximalAdagradOptimizer(optimizer.Optimizer): # pylint: disable=line-too-long """Optimizer that implements the Proximal Adagrad algorithm. 
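`ExponentialMovingAverage` (exported above) keeps one shadow value per variable with the standard update rule; a plain-Python sketch of the rule and its warm-up behaviour toward a constant target:

```python
def ema_update(shadow, value, decay=0.999):
    # shadow <- decay * shadow + (1 - decay) * value
    return decay * shadow + (1.0 - decay) * value

shadow = 0.0
for _ in range(3):
    shadow = ema_update(shadow, 1.0, decay=0.9)
# After three updates toward a constant 1.0 the shadow is 1 - 0.9**3 = 0.271.
assert abs(shadow - 0.271) < 1e-12
```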
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py index 53e9dc2ef2c..369b6cbb50e 100644 --- a/tensorflow/python/training/proximal_gradient_descent.py +++ b/tensorflow/python/training/proximal_gradient_descent.py @@ -24,8 +24,10 @@ from tensorflow.python.ops import math_ops # pylint: enable=unused-import from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.ProximalGradientDescentOptimizer") class ProximalGradientDescentOptimizer(optimizer.Optimizer): # pylint: disable=line-too-long """Optimizer that implements the proximal gradient descent algorithm. diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py index 4e7c81d7b29..07afba79abf 100644 --- a/tensorflow/python/training/queue_runner_impl.py +++ b/tensorflow/python/training/queue_runner_impl.py @@ -27,8 +27,10 @@ from tensorflow.python.eager import context from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.queue_runner.QueueRunner", "train.QueueRunner") class QueueRunner(object): """Holds a list of enqueue operations for a queue, each to be run in a thread. @@ -384,6 +386,7 @@ class QueueRunner(object): import_scope=import_scope) +@tf_export("train.queue_runner.add_queue_runner", "train.add_queue_runner") def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS): """Adds a `QueueRunner` to a collection in the graph. @@ -402,6 +405,8 @@ def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS): ops.add_to_collection(collection, qr) +@tf_export("train.queue_runner.start_queue_runners", + "train.start_queue_runners") def start_queue_runners(sess=None, coord=None, daemon=True, start=True, collection=ops.GraphKeys.QUEUE_RUNNERS): """Starts all queue runners collected in the graph. diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py index ebec725b7b9..89d1099a49f 100644 --- a/tensorflow/python/training/rmsprop.py +++ b/tensorflow/python/training/rmsprop.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """One-line documentation for rmsprop module. rmsprop algorithm [tieleman2012rmsprop] @@ -47,12 +46,15 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import optimizer from tensorflow.python.training import training_ops +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.RMSPropOptimizer") class RMSPropOptimizer(optimizer.Optimizer): """Optimizer that implements the RMSProp algorithm. - See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). + See the + [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf). 
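For context on the `RMSPropOptimizer` hunk above, a plain-Python sketch of the conventional RMSProp update for a single scalar weight; the epsilon placement here follows the common formulation and is illustrative rather than a transcription of the kernels:

```python
import math

def rmsprop_step(w, g, ms, mom, lr=0.001, decay=0.9, momentum=0.0, epsilon=1e-10):
    """One RMSProp update for scalar weight w with gradient g."""
    ms = decay * ms + (1.0 - decay) * g * g           # running mean of squared grads
    mom = momentum * mom + lr * g / math.sqrt(ms + epsilon)
    return w - mom, ms, mom
```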
""" def __init__(self, @@ -113,13 +115,12 @@ class RMSPropOptimizer(optimizer.Optimizer): self._zeros_slot(v, "momentum", self._name) def _prepare(self): - self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate, - name="learning_rate") + self._learning_rate_tensor = ops.convert_to_tensor( + self._learning_rate, name="learning_rate") self._decay_tensor = ops.convert_to_tensor(self._decay, name="decay") - self._momentum_tensor = ops.convert_to_tensor(self._momentum, - name="momentum") - self._epsilon_tensor = ops.convert_to_tensor(self._epsilon, - name="epsilon") + self._momentum_tensor = ops.convert_to_tensor( + self._momentum, name="momentum") + self._epsilon_tensor = ops.convert_to_tensor(self._epsilon, name="epsilon") def _apply_dense(self, grad, var): rms = self.get_slot(var, "rms") diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py index 4f3773c0fc7..764f8400122 100644 --- a/tensorflow/python/training/saver.py +++ b/tensorflow/python/training/saver.py @@ -53,6 +53,7 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import training_util from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export # Op names which identify variable reads which should be saved. @@ -889,6 +890,7 @@ def _GetCheckpointFilename(save_dir, latest_filename): return os.path.join(save_dir, latest_filename) +@tf_export("train.generate_checkpoint_state_proto") def generate_checkpoint_state_proto(save_dir, model_checkpoint_path, all_model_checkpoint_paths=None): @@ -933,6 +935,7 @@ def generate_checkpoint_state_proto(save_dir, return coord_checkpoint_proto +@tf_export("train.update_checkpoint_state") def update_checkpoint_state(save_dir, model_checkpoint_path, all_model_checkpoint_paths=None, @@ -1025,6 +1028,7 @@ def _update_checkpoint_state(save_dir, text_format.MessageToString(ckpt)) +@tf_export("train.get_checkpoint_state") def get_checkpoint_state(checkpoint_dir, latest_filename=None): """Returns CheckpointState proto from the "checkpoint" file. @@ -1082,6 +1086,7 @@ def get_checkpoint_state(checkpoint_dir, latest_filename=None): return ckpt +@tf_export("train.Saver") class Saver(object): """Saves and restores variables. @@ -1229,7 +1234,7 @@ class Saver(object): The `saver_def` proto should be the one returned by the `as_saver_def()` call of the `Saver` that was created for that `Graph`. builder: Optional `SaverBuilder` to use if a `saver_def` was not provided. - Defaults to `BaseSaverBuilder()`. + Defaults to `BulkSaverBuilder()`. defer_build: If `True`, defer adding the save and restore ops to the `build()` call. In that case `build()` should be called before finalizing the graph or using the saver. @@ -1309,7 +1314,7 @@ class Saver(object): if not self.saver_def or context.in_eager_mode(): if self._builder is None: - self._builder = BaseSaverBuilder(self._write_version) + self._builder = BulkSaverBuilder(self._write_version) if self._var_list is None: # pylint: disable=protected-access @@ -1788,6 +1793,7 @@ def _prefix_to_checkpoint_path(prefix, format_version): return prefix # Just the data file. +@tf_export("train.latest_checkpoint") def latest_checkpoint(checkpoint_dir, latest_filename=None): """Finds the filename of latest saved checkpoint file. 
@@ -1817,6 +1823,7 @@ def latest_checkpoint(checkpoint_dir, latest_filename=None): return None +@tf_export("train.import_meta_graph") def import_meta_graph(meta_graph_or_file, clear_devices=False, import_scope=None, **kwargs): """Recreates a Graph saved in a `MetaGraphDef` proto. @@ -1918,6 +1925,7 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False, return None +@tf_export("train.export_meta_graph") def export_meta_graph(filename=None, meta_info_def=None, graph_def=None, @@ -1994,6 +2002,7 @@ def export_meta_graph(filename=None, return meta_graph_def +@tf_export("train.checkpoint_exists") def checkpoint_exists(checkpoint_prefix): """Checks whether a V1 or V2 checkpoint exists with the specified prefix. @@ -2018,6 +2027,7 @@ def checkpoint_exists(checkpoint_prefix): return False +@tf_export("train.get_checkpoint_mtimes") def get_checkpoint_mtimes(checkpoint_prefixes): """Returns the mtimes (modification timestamps) of the checkpoints. diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py index 29da67a30a5..2f421d1cc0a 100644 --- a/tensorflow/python/training/server_lib.py +++ b/tensorflow/python/training/server_lib.py @@ -23,6 +23,7 @@ from tensorflow.core.protobuf import tensorflow_server_pb2 from tensorflow.python import pywrap_tensorflow from tensorflow.python.framework import errors from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import tf_export def _make_server_def(server_or_cluster_def, job_name, task_index, protocol, @@ -92,6 +93,7 @@ def _make_server_def(server_or_cluster_def, job_name, task_index, protocol, return server_def +@tf_export("train.Server") class Server(object): """An in-process TensorFlow server, for use in distributed training. @@ -221,6 +223,7 @@ class Server(object): start=start) +@tf_export("train.ClusterSpec") class ClusterSpec(object): """Represents a cluster as a set of "tasks", organized into "jobs". diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py index b396a1e7d0a..360e02fb44c 100644 --- a/tensorflow/python/training/session_manager.py +++ b/tensorflow/python/training/session_manager.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import saver as saver_mod +from tensorflow.python.util.tf_export import tf_export def _maybe_name(obj): @@ -44,6 +45,7 @@ def _maybe_name(obj): return "" % type(obj) +@tf_export("train.SessionManager") class SessionManager(object): """Training helper that restores from checkpoint and creates session. 
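`tf.train.ClusterSpec` and `tf.train.Server` (exported above) describe and join a cluster; a two-job sketch with illustrative host names, created with `start=False` so nothing binds a port here:

```python
import tensorflow as tf

cluster = tf.train.ClusterSpec({
    "ps": ["ps0.example.com:2222"],
    "worker": ["worker0.example.com:2222", "worker1.example.com:2222"],
})

# This process would act as the first worker; call server.start() / join() for real use.
server = tf.train.Server(cluster, job_name="worker", task_index=0, start=False)
```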
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py index 5b023d8a267..89f40300650 100644 --- a/tensorflow/python/training/session_run_hook.py +++ b/tensorflow/python/training/session_run_hook.py @@ -96,8 +96,10 @@ from __future__ import division from __future__ import print_function import collections +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.SessionRunHook") class SessionRunHook(object): """Hook to extend calls to MonitoredSession.run().""" @@ -189,6 +191,7 @@ class SessionRunHook(object): pass +@tf_export("train.SessionRunArgs") class SessionRunArgs( collections.namedtuple("SessionRunArgs", ["fetches", "feed_dict", "options"])): @@ -213,6 +216,7 @@ class SessionRunArgs( return super(SessionRunArgs, cls).__new__(cls, fetches, feed_dict, options) +@tf_export("train.SessionRunContext") class SessionRunContext(object): """Provides information about the `session.run()` call being made. @@ -264,6 +268,7 @@ class SessionRunContext(object): self._stop_requested = True +@tf_export("train.SessionRunValues") class SessionRunValues( collections.namedtuple("SessionRunValues", ["results", "options", "run_metadata"])): diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py index e4514aaea22..d2ad34773e0 100644 --- a/tensorflow/python/training/supervisor.py +++ b/tensorflow/python/training/supervisor.py @@ -37,8 +37,10 @@ from tensorflow.python.training import saver as saver_mod from tensorflow.python.training import session_manager as session_manager_mod from tensorflow.python.training import training_util from tensorflow.python.util import deprecation +from tensorflow.python.util.tf_export import tf_export +@tf_export("train.Supervisor") class Supervisor(object): """A training helper that checkpoints models and computes summaries. diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py index 47702fdad05..0c6cf910d1a 100644 --- a/tensorflow/python/training/sync_replicas_optimizer.py +++ b/tensorflow/python/training/sync_replicas_optimizer.py @@ -31,6 +31,7 @@ from tensorflow.python.training import optimizer from tensorflow.python.training import queue_runner from tensorflow.python.training import session_manager from tensorflow.python.training import session_run_hook +from tensorflow.python.util.tf_export import tf_export # Please note that the gradients from replicas are averaged instead of summed @@ -38,6 +39,7 @@ from tensorflow.python.training import session_run_hook # rate according to the number of replicas. This change is introduced to be # consistent with how gradients are aggregated (averaged) within a batch in a # replica. +@tf_export("train.SyncReplicasOptimizer") class SyncReplicasOptimizer(optimizer.Optimizer): """Class to synchronize, aggregate gradients and pass them to the optimizer. 
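The `SessionRunHook` protocol exported above is the extension point behind every hook in this change; a minimal custom hook sketch (the class name and behaviour are illustrative):

```python
import tensorflow as tf

class StepLoggerHook(tf.train.SessionRunHook):
    """Fetches the global step on every run and prints it afterwards."""

    def begin(self):
        self._global_step = tf.train.get_or_create_global_step()

    def before_run(self, run_context):
        # Ask the wrapped session.run() to also fetch the global step.
        return tf.train.SessionRunArgs(self._global_step)

    def after_run(self, run_context, run_values):
        print("global step:", run_values.results)
```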
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py index 89a9e129328..499f1feb2db 100644 --- a/tensorflow/python/training/training_util.py +++ b/tensorflow/python/training/training_util.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util.tf_export import tf_export # Picked a long key value to minimize the chance of collision with user defined @@ -40,6 +41,7 @@ GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache' write_graph = graph_io.write_graph +@tf_export('train.global_step') def global_step(sess, global_step_tensor): """Small helper to get the global step. @@ -67,6 +69,7 @@ def global_step(sess, global_step_tensor): return int(sess.run(global_step_tensor)) +@tf_export('train.get_global_step') def get_global_step(graph=None): """Get the global step tensor. @@ -101,6 +104,7 @@ def get_global_step(graph=None): return global_step_tensor +@tf_export('train.create_global_step') def create_global_step(graph=None): """Create global step tensor in graph. @@ -139,6 +143,7 @@ def create_global_step(graph=None): ops.GraphKeys.GLOBAL_STEP]) +@tf_export('train.get_or_create_global_step') def get_or_create_global_step(graph=None): """Returns and create (if necessary) the global step tensor. @@ -156,6 +161,7 @@ def get_or_create_global_step(graph=None): return global_step_tensor +@tf_export('train.assert_global_step') def assert_global_step(global_step_tensor): """Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`. diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py index 270d96a3c7c..7e5f192b8f1 100644 --- a/tensorflow/python/util/compat.py +++ b/tensorflow/python/util/compat.py @@ -41,8 +41,10 @@ import numpy as _np import six as _six from tensorflow.python.util.all_util import remove_undocumented +from tensorflow.python.util.tf_export import tf_export +@tf_export('compat.as_bytes', 'compat.as_str') def as_bytes(bytes_or_text, encoding='utf-8'): """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text. @@ -65,6 +67,7 @@ def as_bytes(bytes_or_text, encoding='utf-8'): (bytes_or_text,)) +@tf_export('compat.as_text') def as_text(bytes_or_text, encoding='utf-8'): """Returns the given argument as a unicode string. @@ -93,6 +96,7 @@ else: as_str = as_text +@tf_export('compat.as_str_any') def as_str_any(value): """Converts to `str` as `str(value)`, but use `as_str` for `bytes`. @@ -125,11 +129,16 @@ def path_to_str(path): # Numpy 1.8 scalars don't inherit from numbers.Integral in Python 3, so we # need to check them specifically. The same goes from Real and Complex. integral_types = (_numbers.Integral, _np.integer) +tf_export('compat.integral_types').export_constant(__name__, 'integral_types') real_types = (_numbers.Real, _np.integer, _np.floating) +tf_export('compat.real_types').export_constant(__name__, 'real_types') complex_types = (_numbers.Complex, _np.number) +tf_export('compat.complex_types').export_constant(__name__, 'complex_types') # Either bytes or text. 
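`compat.as_bytes`, `as_text`, and `as_str_any` (now exported under `tf.compat`) are the small bytes/text shims used throughout the codebase; a quick sketch of what they return:

```python
from tensorflow.python.util import compat

assert compat.as_bytes(u"tensor") == b"tensor"
assert compat.as_text(b"tensor") == u"tensor"
assert compat.as_str_any(42) == "42"
```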
bytes_or_text_types = (bytes, _six.text_type) +tf_export('compat.bytes_or_text_types').export_constant(__name__, + 'bytes_or_text_types') _allowed_symbols = [ 'as_str', diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py index 8a66f0435a8..fbec8fd2d8e 100644 --- a/tensorflow/python/util/deprecation.py +++ b/tensorflow/python/util/deprecation.py @@ -22,6 +22,7 @@ import collections import functools import re +from tensorflow.python.eager import context from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import decorator_utils from tensorflow.python.util import tf_contextlib @@ -38,13 +39,14 @@ _PRINTED_WARNING = {} def _add_deprecated_function_notice_to_docstring(doc, date, instructions): """Adds a deprecation notice to a docstring for deprecated functions.""" + main_text = ['THIS FUNCTION IS DEPRECATED. It will be removed %s.' % + ('in a future version' if date is None else ('after %s' % date))] + if instructions: + main_text.append('Instructions for updating:') return decorator_utils.add_notice_to_docstring( doc, instructions, 'DEPRECATED FUNCTION', - '(deprecated)', [ - 'THIS FUNCTION IS DEPRECATED. It will be removed %s.' % ( - 'in a future version' if date is None else ('after %s' % date)), - 'Instructions for updating:']) + '(deprecated)', main_text) def _add_deprecated_arg_notice_to_docstring(doc, date, instructions): @@ -66,23 +68,135 @@ def _validate_deprecation_args(date, instructions): raise ValueError('Don\'t deprecate things without conversion instructions!') -def _call_location(): +def _call_location(outer=False): """Returns call location given level up from current call.""" frame = tf_inspect.currentframe() if frame: # CPython internals are available, use them for performance. # walk back two frames to get to deprecated function caller. - first_frame = frame.f_back - second_frame = first_frame.f_back - frame = second_frame if second_frame else first_frame + frame = frame.f_back + if frame.f_back: + frame = frame.f_back + if outer and frame.f_back: + frame = frame.f_back return '%s:%d' % (frame.f_code.co_filename, frame.f_lineno) else: # Slow fallback path stack = tf_inspect.stack(0) # 0 avoids generating unused context - entry = stack[2] + entry = stack[3 if outer else 2] return '%s:%d' % (entry[1], entry[2]) +def deprecated_alias(deprecated_name, name, func_or_class, warn_once=True): + """Deprecate a symbol in favor of a new name with identical semantics. + + This function is meant to be used when defining a backwards-compatibility + alias for a symbol which has been moved. For example: + + module1.py: + ```python + class NewNameForClass: pass + ``` + + module2.py: + ```python + import module1 + + DeprecatedNameForClass = deprecated_alias( + deprecated_name='module2.DeprecatedNameForClass', + name='module1.NewNameForClass', + module1.NewNameForClass) + ``` + + This function works for classes and functions. + + For classes, it creates a new class which is functionally identical (it + inherits from the original, and overrides its constructor), but which prints + a deprecation warning when an instance is created. It also adds a deprecation + notice to the class' docstring. + + For functions, it returns a function wrapped by `tf_decorator.make_decorator`. + That function prints a warning when used, and has a deprecation notice in its + docstring. 
This is more or less equivalent (the deprecation warning has + slightly different text) to writing: + + ```python + @deprecated + def deprecated_alias(original_args): + real_function(original_args) + ``` + + Args: + deprecated_name: The name of the symbol that is being deprecated, to be used + in the warning message. This should be its fully qualified name to avoid + confusion. + name: The name of the symbol that is to be used instead of the deprecated + name. This should be a fully qualified name to avoid confusion. + func_or_class: The (non-deprecated) class or function for which a deprecated + alias should be created. + warn_once: If True (the default), only print a deprecation warning the first + time this function is used, or the class is instantiated. + + Returns: + A wrapped version of `func_or_class` which prints a deprecation warning on + use and has a modified docstring. + """ + if tf_inspect.isclass(func_or_class): + + # Make a new class with __init__ wrapped in a warning. + class NewClass(func_or_class): # pylint: disable=missing-docstring + __doc__ = decorator_utils.add_notice_to_docstring( + func_or_class.__doc__, 'Please use %s instead.' % name, + 'DEPRECATED CLASS', + '(deprecated)', ['THIS CLASS IS DEPRECATED. ' + 'It will be removed in a future version. ']) + __name__ = func_or_class.__name__ + __module__ = _call_location(outer=True) + + def __init__(self, *args, **kwargs): + if hasattr(NewClass.__init__, '__func__'): + # Python 2 + NewClass.__init__.__func__.__doc__ = func_or_class.__init__.__doc__ + else: + # Python 3 + NewClass.__init__.__doc__ = func_or_class.__init__.__doc__ + + if _PRINT_DEPRECATION_WARNINGS: + # We're making the alias as we speak. The original may have other + # aliases, so we cannot use it to check for whether it's already been + # warned about. + if NewClass.__init__ not in _PRINTED_WARNING: + if warn_once: + _PRINTED_WARNING[NewClass.__init__] = True + logging.warning( + 'From %s: The name %s is deprecated. Please use %s instead.\n', + _call_location(), deprecated_name, name) + super(NewClass, self).__init__(*args, **kwargs) + + return NewClass + else: + decorator_utils.validate_callable(func_or_class, 'deprecated') + + # Make a wrapper for the original + @functools.wraps(func_or_class) + def new_func(*args, **kwargs): # pylint: disable=missing-docstring + if _PRINT_DEPRECATION_WARNINGS: + # We're making the alias as we speak. The original may have other + # aliases, so we cannot use it to check for whether it's already been + # warned about. + if new_func not in _PRINTED_WARNING: + if warn_once: + _PRINTED_WARNING[new_func] = True + logging.warning( + 'From %s: The name %s is deprecated. Please use %s instead.\n', + _call_location(), deprecated_name, name) + return func_or_class(*args, **kwargs) + return tf_decorator.make_decorator( + func_or_class, new_func, 'deprecated', + _add_deprecated_function_notice_to_docstring( + func_or_class.__doc__, None, 'Please use %s instead.' % name)) + + def deprecated(date, instructions, warn_once=True): """Decorator for marking functions or methods deprecated. @@ -284,7 +398,9 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples, @functools.wraps(func) def new_func(*args, **kwargs): """Deprecation wrapper.""" - if _PRINT_DEPRECATION_WARNINGS: + # TODO(apassos) figure out a way to have reasonable performance with + # deprecation warnings and eager mode. 
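Usage sketch for the new `deprecated_alias` helper above, mirroring its test; the alias names are illustrative, and `func_or_class` is the third positional argument:

```python
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import deprecation

# Re-export an existing function under its old name; the first call through the
# alias logs a "name is deprecated" warning, then forwards to the real function.
old_error = deprecation.deprecated_alias("deprecated.old_error",
                                         "tf.logging.error",
                                         logging.error)
old_error("this message is logged after a one-time deprecation warning")
```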
+ if context.in_graph_mode() and _PRINT_DEPRECATION_WARNINGS: invalid_args = [] named_args = tf_inspect.getcallargs(func, *args, **kwargs) for arg_name, spec in iter(deprecated_positions.items()): diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py index e61edb5cfa3..bdd0bc48d29 100644 --- a/tensorflow/python/util/deprecation_test.py +++ b/tensorflow/python/util/deprecation_test.py @@ -24,6 +24,56 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import deprecation +class DeprecatedAliasTest(test.TestCase): + + @test.mock.patch.object(logging, "warning", autospec=True) + def test_function_alias(self, mock_warning): + deprecated_func = deprecation.deprecated_alias("deprecated.func", + "real.func", + logging.error) + + logging.error("fake error logged") + self.assertEqual(0, mock_warning.call_count) + deprecated_func("FAKE ERROR!") + self.assertEqual(1, mock_warning.call_count) + # Make sure the error points to the right file. + self.assertRegexpMatches(mock_warning.call_args[0][1], + r"deprecation_test\.py:") + deprecated_func("ANOTHER FAKE ERROR!") + self.assertEqual(1, mock_warning.call_count) + + @test.mock.patch.object(logging, "warning", autospec=True) + def test_class_alias(self, mock_warning): + class MyClass(object): + """My docstring.""" + + init_args = [] + + def __init__(self, arg): + MyClass.init_args.append(arg) + + deprecated_cls = deprecation.deprecated_alias("deprecated.cls", + "real.cls", + MyClass) + + print(deprecated_cls.__name__) + print(deprecated_cls.__module__) + print(deprecated_cls.__doc__) + + MyClass("test") + self.assertEqual(0, mock_warning.call_count) + deprecated_cls("deprecated") + self.assertEqual(1, mock_warning.call_count) + # Make sure the error points to the right file. + self.assertRegexpMatches(mock_warning.call_args[0][1], + r"deprecation_test\.py:") + deprecated_cls("deprecated again") + self.assertEqual(1, mock_warning.call_count) + + self.assertEqual(["test", "deprecated", "deprecated again"], + MyClass.init_args) + + class DeprecationTest(test.TestCase): @test.mock.patch.object(logging, "warning", autospec=True) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 874df3d1087..c8525ed4203 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -532,8 +532,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True): (list(_six.iterkeys(input_tree)), list(_six.iterkeys(shallow_tree)))) - input_tree = list(_six.iteritems(input_tree)) - shallow_tree = list(_six.iteritems(shallow_tree)) + input_tree = list(sorted(_six.iteritems(input_tree))) + shallow_tree = list(sorted(_six.iteritems(shallow_tree))) for shallow_branch, input_branch in zip(shallow_tree, input_tree): assert_shallow_structure(shallow_branch, input_branch, diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py index 6bec397db57..8aaf799fd05 100644 --- a/tensorflow/python/util/nest_test.py +++ b/tensorflow/python/util/nest_test.py @@ -425,6 +425,10 @@ class NestTest(test.TestCase): with self.assertRaisesRegexp(ValueError, expected_message): nest.assert_shallow_structure(inp_ab2, inp_ab1) + inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))]) + inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)]) + nest.assert_shallow_structure(inp_ab, inp_ba) + def testFlattenUpTo(self): # Shallow tree ends at scalar. 
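The `nest.py` change above sorts the `iteritems()` of both dictionaries before zipping them, so two mappings with the same keys but different insertion order are walked key-by-key in the same order, which is exactly what the new `OrderedDict` test case (`inp_ab` vs. `inp_ba`) exercises. A minimal illustration of why the sort matters, outside of `nest` itself and with made-up values:

```python
# Standalone illustration (not the nest implementation): pairing dict items by
# sorted key instead of insertion order makes the structure walk independent of
# how the OrderedDicts were built.
from collections import OrderedDict

inp_ab = OrderedDict([("a", 1), ("b", (2, 3))])
inp_ba = OrderedDict([("b", (2, 3)), ("a", 1)])

# Zipping in insertion order pairs "a" with "b", whose values have different
# shapes, so a structural comparison would fail.
unsorted_pairs = list(zip(inp_ab.items(), inp_ba.items()))
print(unsorted_pairs[0])  # (('a', 1), ('b', (2, 3)))

# Sorting by key first pairs like with like, which is what the patched
# assert_shallow_structure now does before recursing.
sorted_pairs = list(zip(sorted(inp_ab.items()), sorted(inp_ba.items())))
print(sorted_pairs[0])  # (('a', 1), ('a', 1))
```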
input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]] diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc index a23d6a70ba2..d1a8aae1674 100644 --- a/tensorflow/stream_executor/executor_cache.cc +++ b/tensorflow/stream_executor/executor_cache.cc @@ -23,6 +23,14 @@ namespace gputools { port::StatusOr ExecutorCache::GetOrCreate( const StreamExecutorConfig& config, const std::function& factory) { + // In the fast path case, the cache already has an entry and we can just + // return after Get() which only takes a shared lock and not a unique lock. + // If we need to create, we take a unique lock on cache_. + auto fast_result = Get(config); + if (fast_result.ok()) { + return fast_result; + } + Entry* entry = nullptr; { mutex_lock lock{mutex_}; @@ -59,12 +67,17 @@ port::StatusOr ExecutorCache::Get( const StreamExecutorConfig& config) { Entry* entry = nullptr; { - mutex_lock lock{mutex_}; - entry = &cache_[config.ordinal]; - // Release the map lock; the address of 'entry' is stable because - // std::map guarantees reference stability. + tf_shared_lock lock{mutex_}; + auto it = cache_.find(config.ordinal); + if (it != cache_.end()) { + entry = &it->second; + } else { + return port::Status(port::error::NOT_FOUND, + port::Printf("No executors registered for ordinal %d", + config.ordinal)); + } } - mutex_lock lock{entry->configurations_mutex}; + tf_shared_lock lock{entry->configurations_mutex}; if (entry->configurations.empty()) { return port::Status( port::error::NOT_FOUND, diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc index cc32a6beaa5..f23224ae772 100644 --- a/tensorflow/stream_executor/multi_platform_manager.cc +++ b/tensorflow/stream_executor/multi_platform_manager.cc @@ -45,7 +45,7 @@ namespace gputools { /* static */ port::StatusOr MultiPlatformManager::PlatformWithName( const string& target) { - mutex_lock lock(GetPlatformsMutex()); + tf_shared_lock lock(GetPlatformsMutex()); auto it = GetPlatformMap()->find(port::Lowercase(target)); if (it == GetPlatformMap()->end()) { @@ -59,7 +59,7 @@ namespace gputools { /* static */ port::StatusOr MultiPlatformManager::PlatformWithId( const Platform::Id& id) { - mutex_lock lock(GetPlatformsMutex()); + tf_shared_lock lock(GetPlatformsMutex()); auto it = GetPlatformByIdMap()->find(id); if (it == GetPlatformByIdMap()->end()) { return port::Status( diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD index d1103163959..66bbd572a67 100644 --- a/tensorflow/tools/api/generator/BUILD +++ b/tensorflow/tools/api/generator/BUILD @@ -77,6 +77,16 @@ genrule( "api/nn/rnn_cell/__init__.py", "api/sets/__init__.py", "api/summary/__init__.py", + "api/train/queue_runner/__init__.py", + "api/compat/__init__.py", + "api/data/__init__.py", + "api/estimator/__init__.py", + "api/estimator/export/__init__.py", + "api/estimator/inputs/__init__.py", + "api/feature_column/__init__.py", + "api/losses/__init__.py", + "api/profiler/__init__.py", + "api/python_io/__init__.py", ], cmd = "$(location create_python_api) $(OUTS)", tools = ["create_python_api"], diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt index ab697b1b95b..874a73f661d 100644 --- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt @@ -21,7 
+21,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\'], " + argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], " } member_method { name: "evaluate" diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt index b73f6433e22..8da2a2b6867 100644 --- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt @@ -21,7 +21,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\'], " + argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], " } member_method { name: "evaluate" diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt index dbcc187f945..aa6ac46613f 100644 --- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt @@ -23,6 +23,10 @@ tf_class { name: "mode" mtype: "" } + member { + name: "prediction_hooks" + mtype: "" + } member { name: "predictions" mtype: "" diff --git a/tensorflow/tools/api/golden/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt new file mode 100644 index 00000000000..0b841652851 --- /dev/null +++ b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.manip" +tf_module { + member_method { + name: "roll" + argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index db1ed421851..e8890e9cc0a 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -396,6 +396,10 @@ tf_module { name: "losses" mtype: "" } + member { + name: "manip" + mtype: "" + } member { name: "metrics" mtype: "" @@ -2044,6 +2048,10 @@ tf_module { name: "unique_with_counts" argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "unravel_index" + argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "unsorted_segment_max" argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD index 8fb6b1cdfd8..608a34ab7b3 100644 --- a/tensorflow/tools/api/tests/BUILD +++ b/tensorflow/tools/api/tests/BUILD @@ -17,10 +17,6 @@ py_test( name = "api_compatibility_test", srcs = 
["api_compatibility_test.py"], data = [ - ":convert_from_multiline", - "//tensorflow/core/api_def:base_api_def", - "//tensorflow/core/api_def:python_api_def", - "//tensorflow/python:hidden_ops", "//tensorflow/tools/api/golden:api_golden", "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt", "//tensorflow/tools/api/tests:README.txt", @@ -29,7 +25,6 @@ py_test( deps = [ "//tensorflow:tensorflow_py", "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_test_lib", "//tensorflow/python:lib", "//tensorflow/python:platform", "//tensorflow/tools/api/lib:python_object_to_proto_visitor", diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py index afcbf50944c..c1e09cc531e 100644 --- a/tensorflow/tools/api/tests/api_compatibility_test.py +++ b/tensorflow/tools/api/tests/api_compatibility_test.py @@ -28,10 +28,8 @@ from __future__ import division from __future__ import print_function import argparse -from collections import defaultdict import os import re -import subprocess import sys import unittest @@ -39,7 +37,6 @@ import tensorflow as tf from google.protobuf import text_format -from tensorflow.core.framework import api_def_pb2 from tensorflow.python.lib.io import file_io from tensorflow.python.platform import resource_loader from tensorflow.python.platform import test @@ -67,11 +64,6 @@ _API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden' _TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt' _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt' -_CONVERT_FROM_MULTILINE_SCRIPT = 'tensorflow/tools/api/tests/convert_from_multiline' -_BASE_API_DIR = 'tensorflow/core/api_def/base_api' -_PYTHON_API_DIR = 'tensorflow/core/api_def/python_api' -_HIDDEN_OPS_FILE = 'tensorflow/python/ops/hidden_ops.txt' - def _KeyToFilePath(key): """From a given key, construct a filepath.""" @@ -96,55 +88,6 @@ def _FileNameToKey(filename): return api_object_key -def _GetSymbol(symbol_id): - """Get TensorFlow symbol based on the given identifier. - - Args: - symbol_id: Symbol identifier in the form module1.module2. ... .sym. - - Returns: - Symbol corresponding to the given id. - """ - # Ignore first module which should be tensorflow - symbol_id_split = symbol_id.split('.')[1:] - symbol = tf - for sym in symbol_id_split: - symbol = getattr(symbol, sym) - return symbol - - -def _IsGenModule(module_name): - if not module_name: - return False - module_name_split = module_name.split('.') - return module_name_split[-1].startswith('gen_') - - -def _GetHiddenOps(): - hidden_ops_file = file_io.FileIO(_HIDDEN_OPS_FILE, 'r') - hidden_ops = set() - for line in hidden_ops_file: - line = line.strip() - if not line: - continue - if line[0] == '#': # comment line - continue - # If line is of the form "op_name # comment", only keep the op_name. 
- line_split = line.split('#') - hidden_ops.add(line_split[0].strip()) - return hidden_ops - - -def _GetGoldenApiDefs(): - old_api_def_files = file_io.get_matching_files(_GetApiDefFilePath('*')) - return {file_path: file_io.read_file_to_string(file_path) - for file_path in old_api_def_files} - - -def _GetApiDefFilePath(graph_op_name): - return os.path.join(_PYTHON_API_DIR, 'api_def_%s.pbtxt' % graph_op_name) - - class ApiCompatibilityTest(test.TestCase): def __init__(self, *args, **kwargs): @@ -287,188 +230,6 @@ class ApiCompatibilityTest(test.TestCase): update_goldens=FLAGS.update_goldens) -class ApiDefTest(test.TestCase): - - def __init__(self, *args, **kwargs): - super(ApiDefTest, self).__init__(*args, **kwargs) - self._first_cap_pattern = re.compile('(.)([A-Z][a-z]+)') - self._all_cap_pattern = re.compile('([a-z0-9])([A-Z])') - - def _GenerateLowerCaseOpName(self, op_name): - lower_case_name = self._first_cap_pattern.sub(r'\1_\2', op_name) - return self._all_cap_pattern.sub(r'\1_\2', lower_case_name).lower() - - def _CreatePythonApiDef(self, base_api_def, endpoint_names): - """Creates Python ApiDef that overrides base_api_def if needed. - - Args: - base_api_def: (api_def_pb2.ApiDef) base ApiDef instance. - endpoint_names: List of Python endpoint names. - - Returns: - api_def_pb2.ApiDef instance with overrides for base_api_def - if module.name endpoint is different from any existing - endpoints in base_api_def. Otherwise, returns None. - """ - endpoint_names_set = set(endpoint_names) - - # If the only endpoint is equal to graph_op_name then - # it is equivalent to having no endpoints. - if (not base_api_def.endpoint and len(endpoint_names) == 1 - and endpoint_names[0] == - self._GenerateLowerCaseOpName(base_api_def.graph_op_name)): - return None - - base_endpoint_names_set = { - self._GenerateLowerCaseOpName(endpoint.name) - for endpoint in base_api_def.endpoint} - - if endpoint_names_set == base_endpoint_names_set: - return None # All endpoints are the same - - api_def = api_def_pb2.ApiDef() - api_def.graph_op_name = base_api_def.graph_op_name - - for endpoint_name in sorted(endpoint_names): - new_endpoint = api_def.endpoint.add() - new_endpoint.name = endpoint_name - - return api_def - - def _GetBaseApiMap(self): - """Get a map from graph op name to its base ApiDef. - - Returns: - Dictionary mapping graph op name to corresponding ApiDef. - """ - # Convert base ApiDef in Multiline format to Proto format. - converted_base_api_dir = os.path.join( - test.get_temp_dir(), 'temp_base_api_defs') - subprocess.check_call( - [os.path.join(resource_loader.get_root_dir_with_all_resources(), - _CONVERT_FROM_MULTILINE_SCRIPT), - _BASE_API_DIR, converted_base_api_dir]) - - name_to_base_api_def = {} - base_api_files = file_io.get_matching_files( - os.path.join(converted_base_api_dir, 'api_def_*.pbtxt')) - for base_api_file in base_api_files: - if file_io.file_exists(base_api_file): - api_defs = api_def_pb2.ApiDefs() - text_format.Merge( - file_io.read_file_to_string(base_api_file), api_defs) - for api_def in api_defs.op: - name_to_base_api_def[api_def.graph_op_name] = api_def - return name_to_base_api_def - - def _AddHiddenOpOverrides(self, name_to_base_api_def, api_def_map): - """Adds ApiDef overrides to api_def_map for hidden Python ops. - - Args: - name_to_base_api_def: Map from op name to base api_def_pb2.ApiDef. - api_def_map: Map from file path to api_def_pb2.ApiDefs for Python API - overrides. 
- """ - hidden_ops = _GetHiddenOps() - for hidden_op in hidden_ops: - if hidden_op not in name_to_base_api_def: - logging.warning('Unexpected hidden op name: %s' % hidden_op) - continue - - base_api_def = name_to_base_api_def[hidden_op] - if base_api_def.visibility != api_def_pb2.ApiDef.HIDDEN: - api_def = api_def_pb2.ApiDef() - api_def.graph_op_name = base_api_def.graph_op_name - api_def.visibility = api_def_pb2.ApiDef.HIDDEN - - file_path = _GetApiDefFilePath(base_api_def.graph_op_name) - api_def_map[file_path].op.extend([api_def]) - - @unittest.skipUnless( - sys.version_info.major == 2 and os.uname()[0] == 'Linux', - 'API compabitility test goldens are generated using python2 on Linux.') - def testAPIDefCompatibility(self): - # Get base ApiDef - name_to_base_api_def = self._GetBaseApiMap() - snake_to_camel_graph_op_names = { - self._GenerateLowerCaseOpName(name): name - for name in name_to_base_api_def.keys()} - # Extract Python API - visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor() - public_api_visitor = public_api.PublicAPIVisitor(visitor) - public_api_visitor.do_not_descend_map['tf'].append('contrib') - traverse.traverse(tf, public_api_visitor) - proto_dict = visitor.GetProtos() - - # Map from file path to Python ApiDefs. - new_api_defs_map = defaultdict(api_def_pb2.ApiDefs) - # We need to override all endpoints even if 1 endpoint differs from base - # ApiDef. So, we first create a map from an op to all its endpoints. - op_to_endpoint_name = defaultdict(list) - - # Generate map from generated python op to endpoint names. - for public_module, value in proto_dict.items(): - module_obj = _GetSymbol(public_module) - for sym in value.tf_module.member_method: - obj = getattr(module_obj, sym.name) - - # Check if object is defined in gen_* module. That is, - # the object has been generated from OpDef. - if hasattr(obj, '__module__') and _IsGenModule(obj.__module__): - if obj.__name__ not in snake_to_camel_graph_op_names: - # Symbol might be defined only in Python and not generated from - # C++ api. - continue - relative_public_module = public_module[len('tensorflow.'):] - full_name = (relative_public_module + '.' + sym.name - if relative_public_module else sym.name) - op_to_endpoint_name[obj].append(full_name) - - # Generate Python ApiDef overrides. - for op, endpoint_names in op_to_endpoint_name.items(): - graph_op_name = snake_to_camel_graph_op_names[op.__name__] - api_def = self._CreatePythonApiDef( - name_to_base_api_def[graph_op_name], endpoint_names) - - if api_def: - file_path = _GetApiDefFilePath(graph_op_name) - api_defs = new_api_defs_map[file_path] - api_defs.op.extend([api_def]) - - self._AddHiddenOpOverrides(name_to_base_api_def, new_api_defs_map) - - old_api_defs_map = _GetGoldenApiDefs() - for file_path, new_api_defs in new_api_defs_map.items(): - # Get new ApiDef string. - new_api_defs_str = str(new_api_defs) - - # Get current ApiDef for the given file. - old_api_defs_str = ( - old_api_defs_map[file_path] if file_path in old_api_defs_map else '') - - if old_api_defs_str == new_api_defs_str: - continue - - if FLAGS.update_goldens: - logging.info('Updating %s...' % file_path) - file_io.write_string_to_file(file_path, new_api_defs_str) - else: - self.assertMultiLineEqual( - old_api_defs_str, new_api_defs_str, - 'To update golden API files, run api_compatibility_test locally ' - 'with --update_goldens=True flag.') - - for file_path in set(old_api_defs_map) - set(new_api_defs_map): - if FLAGS.update_goldens: - logging.info('Deleting %s...' 
% file_path) - file_io.delete_file(file_path) - else: - self.fail( - '%s file is no longer needed and should be removed.' - 'To update golden API files, run api_compatibility_test locally ' - 'with --update_goldens=True flag.' % file_path) - - if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh index 27fa1b89ceb..b3a8ff2ac72 100755 --- a/tensorflow/tools/ci_build/ci_sanity.sh +++ b/tensorflow/tools/ci_build/ci_sanity.sh @@ -183,7 +183,9 @@ do_pylint() { # W0311 bad-indentation # W0312 mixed-indentation # C0330 bad-continuation - grep -E '(\[E|\[W0311|\[W0312|\[C0330)' ${OUTPUT_FILE} > ${ERRORS_FILE} + # C0301 line-too-long + # C0326 bad-whitespace + grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326)' ${OUTPUT_FILE} > ${ERRORS_FILE} N_ERRORS=0 while read -r LINE; do @@ -319,7 +321,7 @@ do_external_licenses_check(){ EXTRA_LICENSES_FILE="$(mktemp)_extra_licenses.log" echo "Getting external dependencies for ${BUILD_TARGET}" - bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \ + bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --keep_going \ | grep -E -v "^//tensorflow" \ | sed -e 's|:.*||' \ | sort \ @@ -328,7 +330,7 @@ do_external_licenses_check(){ echo echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}." - bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \ + bazel query "deps(${LICENSES_TARGET})" --keep_going \ | grep -E -v "^//tensorflow" \ | sed -e 's|:.*||' \ | sort \ @@ -342,6 +344,18 @@ do_external_licenses_check(){ EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s') + # Blacklist + echo ${MISSING_LICENSES_FILE} + grep -e "@bazel_tools//third_party/" -e "@com_google_absl//absl" -e "@org_tensorflow//" -v ${MISSING_LICENSES_FILE} > temp.txt + mv temp.txt ${MISSING_LICENSES_FILE} + + # Whitelist + echo ${EXTRA_LICENSE_FILE} + grep -e "@bazel_tools//src/" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -v ${EXTRA_LICENSES_FILE} > temp.txt + mv temp.txt ${EXTRA_LICENSES_FILE} + + + echo echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s" echo @@ -515,9 +529,14 @@ do_check_futures_test() { python check_futures_test.py } +do_check_file_name_test() { + cd "$ROOT_DIR/tensorflow/tools/test" + python file_name_test.py +} + # Supply all sanity step commands and descriptions -SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity") -SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency") +SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" 
"do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test") +SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases") INCREMENTAL_FLAG="" DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp" diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 71744c04f2f..d406b83a624 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -43,8 +43,8 @@ pip2 install --upgrade werkzeug==0.11.10 pip3 install --upgrade werkzeug==0.11.10 # Install bleach. html5lib will be picked up as a dependency. -pip2 install --upgrade bleach==1.5.0 -pip3 install --upgrade bleach==1.5.0 +pip2 install --upgrade bleach==2.0.0 +pip3 install --upgrade bleach==2.0.0 # Install markdown. pip2 install --upgrade markdown==2.6.8 diff --git a/tensorflow/tools/ci_build/pylintrc b/tensorflow/tools/ci_build/pylintrc index e71017e621c..68fdb617166 100644 --- a/tensorflow/tools/ci_build/pylintrc +++ b/tensorflow/tools/ci_build/pylintrc @@ -180,7 +180,17 @@ docstring-min-length=10 max-line-length=80 # Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ +ignore-long-lines=(?x) + (^\s*(import|from)\s + |\$Id:\s\/\/depot\/.+#\d+\s\$ + |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+') + |^\s*\#\ LINT\.ThenChange + |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$ + |pylint + |""" + |\# + |lambda + |(https?|ftp):) # Allow the body of an if to be on the same line as the test if there is no # else. diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat index 957729bb37d..c1bc7185075 100644 --- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat +++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat @@ -36,7 +36,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe" :: Run cmake to create Visual Studio Project files. 
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% +%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX :: Run msbuild in the resulting VS project files to build a pip package. %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat index 5a362de3992..b87e4a9bec4 100644 --- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat +++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat @@ -37,7 +37,7 @@ SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe" :: Run cmake to create Visual Studio Project files. -%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% +%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX :: Run msbuild in the resulting VS project files to build a pip package. %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh index fa28e3d79ca..583d1d5f095 100755 --- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh +++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh @@ -41,7 +41,7 @@ run_configure_for_cpu_build # build_libtensorflow_tarball in ../builds/libtensorflow.sh # cannot be used on Windows since it relies on pkg_tar rules. # So we do something special here -bazel build -c opt \ +bazel build -c opt --copt=/arch:AVX \ tensorflow:libtensorflow.so \ tensorflow/tools/lib_package:clicenses_generate \ tensorflow/java:libtensorflow_jni.so \ diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh index 573c926203f..94276c6c5c9 100644 --- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh +++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh @@ -41,7 +41,7 @@ run_configure_for_gpu_build # build_libtensorflow_tarball in ../builds/libtensorflow.sh # cannot be used on Windows since it relies on pkg_tar rules. 
# So we do something special here -bazel build -c opt \ +bazel build -c opt --copt=/arch:AVX \ tensorflow:libtensorflow.so \ tensorflow/tools/lib_package:clicenses_generate \ tensorflow/java:libtensorflow_jni.so \ diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py index f678681dac2..6e90b286c99 100644 --- a/tensorflow/tools/compatibility/tf_upgrade.py +++ b/tensorflow/tools/compatibility/tf_upgrade.py @@ -46,8 +46,9 @@ class APIChangeSpec(object): """ -class _FileEditTuple(collections.namedtuple( - "_FileEditTuple", ["comment", "line", "start", "old", "new"])): +class _FileEditTuple( + collections.namedtuple("_FileEditTuple", + ["comment", "line", "start", "old", "new"])): """Each edit that is recorded by a _FileEditRecorder. Fields: @@ -179,8 +180,7 @@ class _ASTCallVisitor(ast.NodeVisitor): function_renames = self._api_change_spec.function_renames try: new_name = function_renames[full_name] - self._file_edit.add("Renamed function %r to %r" % (full_name, - new_name), + self._file_edit.add("Renamed function %r to %r" % (full_name, new_name), node.lineno, node.col_offset, full_name, new_name) except KeyError: pass @@ -227,7 +227,7 @@ class _ASTCallVisitor(ast.NodeVisitor): # loop over lines while 1: # Reverse the text to and regular expression search for whitespace - text = self._lines[line-1] + text = self._lines[line - 1] reversed_preceding_text = text[:col][::-1] # First find if a [ can be found with only whitespace between it and # col. @@ -248,8 +248,8 @@ class _ASTCallVisitor(ast.NodeVisitor): # node ranges to filter out spurious #'s that appear in string # literals. comment_start = prev_line.find("#") - if comment_start == -1: - col = len(prev_line) -1 + if comment_start == -1: + col = len(prev_line) - 1 elif find_string_chars.search(prev_line[comment_start:]) is None: col = comment_start else: @@ -260,7 +260,6 @@ class _ASTCallVisitor(ast.NodeVisitor): # it is not possible to use that in an argument. return node.lineno, node.col_offset - def visit_Call(self, node): # pylint: disable=invalid-name """Handle visiting a call node in the AST. @@ -268,7 +267,6 @@ class _ASTCallVisitor(ast.NodeVisitor): node: Current Node """ - # Find a simple attribute name path e.g. "tf.foo.bar" full_name = self._get_attribute_full_path(node.func) @@ -293,18 +291,21 @@ class _ASTCallVisitor(ast.NodeVisitor): lineno, col_offset = self._find_true_position(arg) if lineno is None or col_offset is None: self._file_edit.add( - "Failed to add keyword %r to reordered function %r" - % (reordered[idx], full_name), arg.lineno, arg.col_offset, - "", "", + "Failed to add keyword %r to reordered function %r" % + (reordered[idx], full_name), + arg.lineno, + arg.col_offset, + "", + "", error="A necessary keyword argument failed to be inserted.") else: keyword_arg = reordered[idx] if (full_name in function_keyword_renames and keyword_arg in function_keyword_renames[full_name]): keyword_arg = function_keyword_renames[full_name][keyword_arg] - self._file_edit.add("Added keyword %r to reordered function %r" - % (reordered[idx], full_name), lineno, - col_offset, "", keyword_arg + "=") + self._file_edit.add("Added keyword %r to reordered function %r" % + (reordered[idx], full_name), lineno, col_offset, + "", keyword_arg + "=") # Examine each keyword argument and convert it to the final renamed form renamed_keywords = ({} if full_name not in function_keyword_renames else @@ -322,11 +323,11 @@ class _ASTCallVisitor(ast.NodeVisitor): # value. 
key_start = argval_col_offset - len(argkey) - 1 key_end = key_start + len(argkey) + 1 - if (self._lines[argval_lineno - 1][key_start:key_end] == - argkey + "="): + if (self._lines[argval_lineno - 1][key_start:key_end] == argkey + + "="): self._file_edit.add("Renamed keyword argument from %r to %r" % - (argkey, renamed_keywords[argkey]), - argval_lineno, + (argkey, + renamed_keywords[argkey]), argval_lineno, argval_col_offset - len(argkey) - 1, argkey + "=", renamed_keywords[argkey] + "=") continue @@ -335,7 +336,8 @@ class _ASTCallVisitor(ast.NodeVisitor): (argkey, renamed_keywords[argkey]), argval.lineno, argval.col_offset - len(argkey) - 1, - "", "", + "", + "", error="Failed to find keyword lexographically. Fix manually.") ast.NodeVisitor.generic_visit(self, node) @@ -352,7 +354,7 @@ class _ASTCallVisitor(ast.NodeVisitor): if full_name in self._api_change_spec.change_to_function: if not hasattr(node, "is_function_for_call"): new_text = full_name + "()" - self._file_edit.add("Changed %r to %r"%(full_name, new_text), + self._file_edit.add("Changed %r to %r" % (full_name, new_text), node.lineno, node.col_offset, full_name, new_text) ast.NodeVisitor.generic_visit(self, node) @@ -380,8 +382,8 @@ class ASTCodeUpgrader(object): # Write to a temporary file, just in case we are doing an implace modify. with open(in_filename, "r") as in_file, \ tempfile.NamedTemporaryFile("w", delete=False) as temp_file: - ret = self.process_opened_file( - in_filename, in_file, out_filename, temp_file) + ret = self.process_opened_file(in_filename, in_file, out_filename, + temp_file) shutil.move(temp_file.name, out_filename) return ret @@ -424,6 +426,7 @@ class ASTCodeUpgrader(object): out_file.write(out_text) text += "\n" return 1, text, process_errors + # pylint: enable=broad-except def process_tree(self, root_directory, output_root_directory, @@ -444,16 +447,16 @@ class ASTCodeUpgrader(object): # make sure output directory doesn't exist if output_root_directory and os.path.exists(output_root_directory): - print("Output directory %r must not already exist." % ( - output_root_directory)) + print("Output directory %r must not already exist." 
% + (output_root_directory)) sys.exit(1) # make sure output directory does not overlap with root_directory norm_root = os.path.split(os.path.normpath(root_directory)) norm_output = os.path.split(os.path.normpath(output_root_directory)) if norm_root == norm_output: - print("Output directory %r same as input directory %r" % ( - root_directory, output_root_directory)) + print("Output directory %r same as input directory %r" % + (root_directory, output_root_directory)) sys.exit(1) # Collect list of files to process (we do this to correctly handle if the @@ -465,14 +468,16 @@ class ASTCodeUpgrader(object): copy_files = [f for f in file_list if not f.endswith(".py")] for filename in py_files: fullpath = os.path.join(dir_name, filename) - fullpath_output = os.path.join( - output_root_directory, os.path.relpath(fullpath, root_directory)) + fullpath_output = os.path.join(output_root_directory, + os.path.relpath(fullpath, + root_directory)) files_to_process.append((fullpath, fullpath_output)) if copy_other_files: for filename in copy_files: fullpath = os.path.join(dir_name, filename) - fullpath_output = os.path.join( - output_root_directory, os.path.relpath(fullpath, root_directory)) + fullpath_output = os.path.join(output_root_directory, + os.path.relpath( + fullpath, root_directory)) files_to_copy.append((fullpath, fullpath_output)) file_count = 0 @@ -641,18 +646,17 @@ class TFAPIChangeSpec(APIChangeSpec): "tf.concat": ["concat_dim", "values", "name"], "tf.svd": ["tensor", "compute_uv", "full_matrices", "name"], "tf.nn.softmax_cross_entropy_with_logits": [ - "logits", "labels", "dim", "name"], + "logits", "labels", "dim", "name" + ], "tf.nn.sparse_softmax_cross_entropy_with_logits": [ - "logits", "labels", "name"], - "tf.nn.sigmoid_cross_entropy_with_logits": [ - "logits", "labels", "name"], + "logits", "labels", "name" + ], + "tf.nn.sigmoid_cross_entropy_with_logits": ["logits", "labels", "name"], "tf.op_scope": ["values", "name", "default_name"], } # Specially handled functions. - self.function_handle = { - "tf.reverse": self._reverse_handler - } + self.function_handle = {"tf.reverse": self._reverse_handler} @staticmethod def _reverse_handler(file_edit_recorder, node): @@ -661,12 +665,13 @@ class TFAPIChangeSpec(APIChangeSpec): comment = ("ERROR: tf.reverse has had its argument semantics changed\n" "significantly the converter cannot detect this reliably, so you" "need to inspect this usage manually.\n") - file_edit_recorder.add(comment, - node.lineno, - node.col_offset, - "tf.reverse", - "tf.reverse", - error="tf.reverse requires manual check.") + file_edit_recorder.add( + comment, + node.lineno, + node.col_offset, + "tf.reverse", + "tf.reverse", + error="tf.reverse requires manual check.") if __name__ == "__main__": diff --git a/tensorflow/tools/dist_test/build_server.sh b/tensorflow/tools/dist_test/build_server.sh index 878fabd248f..225c0347416 100755 --- a/tensorflow/tools/dist_test/build_server.sh +++ b/tensorflow/tools/dist_test/build_server.sh @@ -16,14 +16,15 @@ # # Builds the test server for distributed (GRPC) TensorFlow # -# Usage: build_server.sh [--test] +# Usage: build_server.sh [--test] # # Arguments: # docker_image_name: Name of the docker image to build. # E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1 # -# whl_url: URL from which the TensorFlow whl file will be downloaded. +# whl_file_location: URL from which the TensorFlow whl file will be downloaded. 
# E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl +# E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl # # The optional flag --test lets the script to use the Dockerfile for the # testing GRPC server. Without the flag, the script will build the non-test @@ -41,11 +42,11 @@ die() { # Check arguments if [[ $# -lt 2 ]]; then - die "Usage: $0 [--test]" + die "Usage: $0 [--test]" fi DOCKER_IMG_NAME=$1 -WHL_URL=$2 +WHL_FILE_LOCATION=$2 shift 2 # Current script directory @@ -53,7 +54,7 @@ DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BUILD_DIR=$(mktemp -d) echo "" -echo "Using whl file URL: ${WHL_URL}" +echo "Using whl file URL: ${WHL_FILE_LOCATION}" echo "Building in temporary directory: ${BUILD_DIR}" cp -r ${DIR}/* "${BUILD_DIR}"/ || \ @@ -65,9 +66,15 @@ if [[ $1 == "--test" ]]; then fi echo "Using Docker file: ${DOCKER_FILE}" +if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then + # Download whl file into the build context directory. + wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \ + die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}" +else + cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}" +fi + # Download whl file into the build context directory. -wget -P "${BUILD_DIR}" ${WHL_URL} || \ - die "Failed to download tensorflow whl file from URL: ${WHL_URL}" if [[ ! -f "${DOCKER_FILE}" ]]; then die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}" diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py index e40ecb43f9a..a2d12442c44 100644 --- a/tensorflow/tools/dist_test/python/mnist_replica.py +++ b/tensorflow/tools/dist_test/python/mnist_replica.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Distributed MNIST training and validation, with model replicas. A simple softmax model with one hidden layer is defined. The parameters @@ -32,7 +31,6 @@ perform forward computation and gradient calculation in parallel, which should lead to increased training speed for the simple model. """ - from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -45,7 +43,6 @@ import time import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data - flags = tf.app.flags flags.DEFINE_string("data_dir", "/tmp/mnist-data", "Directory for storing mnist data") @@ -56,8 +53,7 @@ flags.DEFINE_integer("task_index", None, "Worker task index, should be >= 0. task_index=0 is " "the master worker task the performs the variable " "initialization ") -flags.DEFINE_integer("num_gpus", 1, - "Total number of gpus for each machine." +flags.DEFINE_integer("num_gpus", 1, "Total number of gpus for each machine." 
"If you don't use GPU, please set it to '0'") flags.DEFINE_integer("replicas_to_aggregate", None, "Number of replicas to aggregate before parameter update" @@ -69,24 +65,24 @@ flags.DEFINE_integer("train_steps", 200, "Number of (global) training steps to perform") flags.DEFINE_integer("batch_size", 100, "Training batch size") flags.DEFINE_float("learning_rate", 0.01, "Learning rate") -flags.DEFINE_boolean("sync_replicas", False, - "Use the sync_replicas (synchronized replicas) mode, " - "wherein the parameter updates from workers are aggregated " - "before applied to avoid stale gradients") +flags.DEFINE_boolean( + "sync_replicas", False, + "Use the sync_replicas (synchronized replicas) mode, " + "wherein the parameter updates from workers are aggregated " + "before applied to avoid stale gradients") flags.DEFINE_boolean( "existing_servers", False, "Whether servers already exists. If True, " "will use the worker hosts via their GRPC URLs (one client process " "per worker host). Otherwise, will create an in-process TensorFlow " "server.") -flags.DEFINE_string("ps_hosts","localhost:2222", +flags.DEFINE_string("ps_hosts", "localhost:2222", "Comma-separated list of hostname:port pairs") flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224", "Comma-separated list of hostname:port pairs") -flags.DEFINE_string("job_name", None,"job name: worker or ps") +flags.DEFINE_string("job_name", None, "job name: worker or ps") FLAGS = flags.FLAGS - IMAGE_PIXELS = 28 @@ -97,7 +93,7 @@ def main(unused_argv): if FLAGS.job_name is None or FLAGS.job_name == "": raise ValueError("Must specify an explicit `job_name`") - if FLAGS.task_index is None or FLAGS.task_index =="": + if FLAGS.task_index is None or FLAGS.task_index == "": raise ValueError("Must specify an explicit `task_index`") print("job name = %s" % FLAGS.job_name) @@ -110,9 +106,7 @@ def main(unused_argv): # Get the number of workers. num_workers = len(worker_spec) - cluster = tf.train.ClusterSpec({ - "ps": ps_spec, - "worker": worker_spec}) + cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec}) if not FLAGS.existing_servers: # Not using existing servers. Create an in-process server. @@ -217,7 +211,8 @@ def main(unused_argv): sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, - device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index]) + device_filters=["/job:ps", + "/job:worker/task:%d" % FLAGS.task_index]) # The chief worker (task_index==0) session will prepare the session, # while the remaining workers will wait for the preparation to complete. 
@@ -231,8 +226,7 @@ def main(unused_argv): server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index] print("Using existing server at: %s" % server_grpc_url) - sess = sv.prepare_or_wait_for_session(server_grpc_url, - config=sess_config) + sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config) else: sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py index 0acbf6fcee5..05dcefb099a 100644 --- a/tensorflow/tools/docker/jupyter_notebook_config.py +++ b/tensorflow/tools/docker/jupyter_notebook_config.py @@ -15,6 +15,7 @@ import os from IPython.lib import passwd +c = c # pylint:disable=undefined-variable c.NotebookApp.ip = '*' c.NotebookApp.port = int(os.getenv('PORT', 8888)) c.NotebookApp.open_browser = False diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py index b5df633800a..543b5fa6fef 100644 --- a/tensorflow/tools/docs/pretty_docs.py +++ b/tensorflow/tools/docs/pretty_docs.py @@ -162,7 +162,7 @@ def _build_class_page(page_info): parts.append(h3.format(**method_info.__dict__)) if method_info.signature is not None: - parts.append(_build_signature(method_info)) + parts.append(_build_signature(method_info, use_full_name=False)) parts.append(method_info.doc.docstring) parts.append(_build_function_details(method_info.doc.function_details)) @@ -259,14 +259,14 @@ def _build_module_page(page_info): return ''.join(parts) -def _build_signature(obj_info): +def _build_signature(obj_info, use_full_name=True): """Returns a md code block showing the function signature.""" # Special case tf.range, since it has an optional first argument if obj_info.full_name == 'tf.range': return ( '``` python\n' - "range(limit, delta=1, dtype=None, name='range')\n" - "range(start, limit, delta=1, dtype=None, name='range')\n" + "tf.range(limit, delta=1, dtype=None, name='range')\n" + "tf.range(start, limit, delta=1, dtype=None, name='range')\n" '```\n\n') parts = ['``` python'] @@ -281,7 +281,11 @@ def _build_signature(obj_info): sig = ',\n'.join(' %s' % sig_item for sig_item in obj_info.signature) sig = '\n'+sig+'\n' - parts.append(signature_template.format(name=obj_info.short_name, sig=sig)) + if use_full_name: + obj_name = obj_info.full_name + else: + obj_name = obj_info.short_name + parts.append(signature_template.format(name=obj_name, sig=sig)) parts.append('```\n\n') return '\n'.join(parts) diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc index 5ccd88cfa1a..a022f579267 100644 --- a/tensorflow/tools/graph_transforms/quantize_nodes.cc +++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc @@ -183,22 +183,6 @@ Status ExtractRangeFromParams(const TransformFuncContext& context, return Status::OK(); } -bool AreAttrsEqual(const NodeDef* current_node, const NodeDef* other_node) { - if (current_node->attr_size() != other_node->attr_size()) { - return false; - } - string current_serialized; - string other_serialized; - for (const auto& attr : other_node->attr()) { - auto iter = current_node->attr().find(attr.first); - if (iter == current_node->attr().end()) return false; - iter->second.SerializeToString(¤t_serialized); - attr.second.SerializeToString(&other_serialized); - if (current_serialized != other_serialized) return false; - } - return true; -} - } // namespace // Analyzes all the nodes in the graph to figure out which ones are duplicates diff --git 
a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc index 96324d0deab..214ec721e2c 100644 --- a/tensorflow/tools/graph_transforms/sparsify_gather.cc +++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc @@ -15,6 +15,7 @@ limitations under the License. #include #include +#include #include "tensorflow/c/checkpoint_reader.h" #include "tensorflow/core/framework/tensor.h" @@ -28,9 +29,10 @@ limitations under the License. #include "tensorflow/tools/graph_transforms/transform_utils.h" namespace tensorflow { -using strings::StrCat; using str_util::Join; using str_util::Split; +using str_util::StringReplace; +using strings::StrCat; namespace graph_transforms { @@ -84,48 +86,71 @@ void CreateConstNode(const Tensor& tensor, const string& name, SetNodeTensorAttr("value", tensor, node_def); } +string GetMonolithicTensorKey(const string& tensor_slice_name) { + std::vector names = Split(tensor_slice_name, "/"); + if (StringPiece(names[names.size() - 1]).starts_with("part_")) { + CHECK_GE(names.size(), 2); + names.pop_back(); + } + return Join(names, "/"); +} + Status ObtainTensorSlice(const GraphDef& input_graph_def, - const string& tensor_name, + const string& target_name, string* shape_slice_string) { string restore_node_name; for (const auto& node : input_graph_def.node()) { - std::vector node_name_parts = str_util::Split(node.name(), "/"); + std::vector node_name_parts = Split(node.name(), "/"); if (node_name_parts.size() == 2 && StringPiece(node_name_parts[0]).starts_with("save") && StringPiece(node_name_parts[1]).starts_with("Assign") && - node.input(0) == tensor_name) { + node.input(0) == target_name) { restore_node_name = node.input(1); break; } } + + std::vector restore_node_parts = Split(restore_node_name, ":"); + CHECK_LE(restore_node_parts.size(), 2); + string tensor_names_node; string shape_and_slices_node; for (const auto& node : input_graph_def.node()) { - if ((node.name() == restore_node_name) && (node.op() == "RestoreV2")) { + if ((node.name() == restore_node_parts[0]) && (node.op() == "RestoreV2")) { + tensor_names_node = node.input(1); shape_and_slices_node = node.input(2); break; } } + + int offset = -1; + for (const auto& node : input_graph_def.node()) { + if (node.name() == tensor_names_node) { + Tensor tensor_names_tensor; + TF_RETURN_IF_ERROR(GetNodeAttr(node, "value", &tensor_names_tensor)); + const auto& tensor_names_value = tensor_names_tensor.flat(); + for (int i = 0; i < tensor_names_value.size(); i++) { + if (tensor_names_value(i) == GetMonolithicTensorKey(target_name)) { + offset = i; + break; + } + } + } + } + if (offset == -1) { + return errors::Internal("Unable to find RestoreV2 entry for variable: ", + target_name); + } for (const auto& node : input_graph_def.node()) { if (node.name() == shape_and_slices_node) { Tensor shape_and_slices_tensor; TF_RETURN_IF_ERROR(GetNodeAttr(node, "value", &shape_and_slices_tensor)); const auto& shape_and_slices_value = shape_and_slices_tensor.flat(); - *shape_slice_string = shape_and_slices_value(0); + *shape_slice_string = shape_and_slices_value(offset); return Status::OK(); } } - return errors::Internal("Unable to find slice for variable: ", tensor_name); -} - -string GetMonolithicTensorKey(const string& tensor_slice_name) { - std::vector names = str_util::Split(tensor_slice_name, "/"); - CHECK_GE(names.size(), 2); - CHECK(StringPiece(names[names.size() - 1]).starts_with("part_")); - - // Remove the "part_x" suffix - names.pop_back(); - return str_util::Join(names, "/"); + 
return errors::Internal("Unable to find slice for variable: ", target_name); } Status ReadTensorFromCheckpoint( @@ -179,6 +204,14 @@ Status ObtainVariableInfo( return Status::OK(); } +Status RemoveInputAtIndex(NodeDef* n, int index) { + for (int i = index; i < n->input_size() - 1; i++) { + n->mutable_input()->SwapElements(i, i + 1); + } + n->mutable_input()->RemoveLast(); + return Status::OK(); +} + Status SparsifyGatherInternal( const GraphDef& input_graph_def, const std::unique_ptr >& @@ -193,6 +226,15 @@ Status SparsifyGatherInternal( GraphDef current_graph_def = input_graph_def; bool any_match_found = false; + // Populate references. + std::unordered_map refs; + for (const auto& node : current_graph_def.node()) { + for (const auto& input : node.input()) { + auto parsed_input = StringReplace(input, "^", "", true); + refs[parsed_input] += 1; + } + } + // The subgraphs may have overlapping components, therefore GraphMatcher // doesn't return all subgraphs in one round -- this has to be multi-round // update. @@ -200,15 +242,15 @@ Status SparsifyGatherInternal( any_match_found = false; GraphDef replaced_graph_def = current_graph_def; std::vector init_table_node_names; - std::vector removed_variable_names; + std::vector removed_node_names; TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes( current_graph_def, pattern, [&ckpt_reader, &any_match_found, &init_table_node_names, - &shapes_and_slices, &removed_variable_names]( - const NodeMatch& match, const std::set& input_nodes, - const std::set& output_nodes, - std::vector* new_nodes) { + &shapes_and_slices, &removed_node_names, + &refs](const NodeMatch& match, const std::set& input_nodes, + const std::set& output_nodes, + std::vector* new_nodes) { any_match_found = true; // The captured subgraph should be of the following pattern: @@ -290,9 +332,13 @@ Status SparsifyGatherInternal( TF_RETURN_IF_ERROR(ReadTensorFromCheckpoint( weights_node.name(), ckpt_reader, (*shapes_and_slices)[weights_node.name()], &weight)); - // Add both both weight and identity node names. - removed_variable_names.push_back(weights_node.name()); - removed_variable_names.push_back(match.inputs[0].node.name()); + } + // Add both both weight and identity node names. 
+ removed_node_names.push_back(weights_node.name()); + removed_node_names.push_back(match.inputs[0].node.name()); + for (auto input_node : match.inputs[0].node.input()) { + auto parsed_input = StringReplace(input_node, "^", "", true); + refs[parsed_input]--; } Tensor indices_tensor; Tensor values_tensor; @@ -362,15 +408,23 @@ Status SparsifyGatherInternal( // Connect nodes AddNodeInput(hashtable_node.name(), &init_table_node); + refs[hashtable_node.name()]++; AddNodeInput(indices_node.name(), &init_table_node); + refs[indices_node.name()]++; AddNodeInput(values_node.name(), &init_table_node); + refs[values_node.name()]++; AddNodeInput(hashtable_node.name(), &lookup_node); + refs[hashtable_node.name()]++; AddNodeInput(gather_node.input(1), &lookup_node); + refs[gather_node.input(1)]++; AddNodeInput(default_value_node.name(), &lookup_node); + refs[default_value_node.name()]++; AddNodeInput(lookup_node.name(), &expand_dims_node); + refs[lookup_node.name()]++; AddNodeInput(dim_idx_node.name(), &expand_dims_node); + refs[dim_idx_node.name()]++; // Copy 'ids' input of original 'Gather' new_nodes->push_back(match.inputs[1].node); @@ -404,48 +458,91 @@ Status SparsifyGatherInternal( for (const string& name : init_table_node_names) { // Add control dependence from init_table_node to group_deps_node AddNodeInput(StrCat("^", name), init_op); + refs[name]++; } - // Remove all dependencies associated with removed variables. - while (!removed_variable_names.empty()) { - auto name = removed_variable_names.back(); - removed_variable_names.pop_back(); + // Erase inputs and outputs as they are not considered for deletion. + for (const auto& output : context.output_names) { + refs.erase(output); + } + + for (const auto& input : context.input_names) { + refs.erase(input); + } + + // Add nodes with a reference count of 0 for deletion. + for (auto entry : refs) { + if (entry.second == 0) { + removed_node_names.push_back(entry.first); + } + } + + while (!removed_node_names.empty()) { + auto name = removed_node_names.back(); + removed_node_names.pop_back(); + int i = 0; while (i < replaced_graph_def.node_size()) { - if (!replaced_graph_def.node(i).input_size()) { - if (replaced_graph_def.node(i).name() == name) { - replaced_graph_def.mutable_node()->SwapElements( - i, replaced_graph_def.node_size() - 1); - replaced_graph_def.mutable_node()->RemoveLast(); - continue; - } - i++; - continue; - } - int j = 0; - while (j < replaced_graph_def.node(i).input_size()) { - if (replaced_graph_def.node(i).input(j) == name || - replaced_graph_def.node(i).input(j) == ("^" + name)) { - replaced_graph_def.mutable_node(i)->mutable_input()->SwapElements( - j, replaced_graph_def.node(i).input_size() - 1); - replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast(); - continue; - } - j++; - } - if ((replaced_graph_def.node(i).input_size() == 0) || - (replaced_graph_def.node(i).op() == "Assign" && - replaced_graph_def.node(i).input_size() == 1)) { - removed_variable_names.push_back(replaced_graph_def.node(i).name()); - if (replaced_graph_def.node(i).input_size() == 1) { - removed_variable_names.push_back( - replaced_graph_def.node(i).input(0)); + // Revisit this to see if we can safely remove RestoreV2 nodes. 
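The `sparsify_gather.cc` rewrite above replaces the old ad-hoc pruning with reference counting: `refs` records how many nodes mention each node name as an input, counts are decremented as nodes or inputs are removed, and anything that drops to zero (and is not one of the transform's declared inputs or outputs) is queued for deletion in the loop that continues below. A toy sketch of that sweep on a dictionary-based graph, with hypothetical node names and without the `RestoreV2`/`ConcatV2` special cases or the `^` control-input handling:

```python
# Toy version of the zero-reference sweep: `graph` maps node name -> list of
# input node names. All node names here are hypothetical.
graph = {
    "weights": [],
    "identity": ["weights"],
    "old_gather": ["identity", "ids"],
    "ids": [],
    "output": ["ids"],
}

# Count how many times each node is referenced as an input of another node.
refs = {name: 0 for name in graph}
for inputs in graph.values():
  for inp in inputs:
    refs[inp] += 1

keep = {"ids", "output"}    # graph inputs/outputs are never deleted
to_remove = ["old_gather"]  # the node the transform just replaced

while to_remove:
  name = to_remove.pop()
  for inp in graph.pop(name, []):  # drop the node and release its inputs
    refs[inp] -= 1
    if refs[inp] == 0 and inp not in keep:
      to_remove.append(inp)        # cascade to inputs that became unused

print(sorted(graph))  # ['ids', 'output']
```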
+ if ((replaced_graph_def.node(i).name() == name) && + (replaced_graph_def.node(i).op() != "RestoreV2")) { + for (const auto& input : replaced_graph_def.node(i).input()) { + auto parsed_input = StringReplace(input, "^", "", true); + refs[parsed_input] -= 1; + if (refs[parsed_input] == 0) { + removed_node_names.push_back(parsed_input); + } } replaced_graph_def.mutable_node()->SwapElements( i, replaced_graph_def.node_size() - 1); replaced_graph_def.mutable_node()->RemoveLast(); continue; } + int j = 0; + bool deleted_inputs = false; + while (j < replaced_graph_def.node(i).input_size()) { + if (replaced_graph_def.node(i).input(j) == name || + replaced_graph_def.node(i).input(j) == ("^" + name)) { + TF_RETURN_IF_ERROR( + RemoveInputAtIndex(replaced_graph_def.mutable_node(i), j)); + deleted_inputs = true; + continue; + } + j++; + } + if (deleted_inputs) { + if (replaced_graph_def.node(i).op() == "ConcatV2") { + if (replaced_graph_def.node(i).input_size() > 2) { + SetNodeAttr("N", replaced_graph_def.node(i).input_size() - 1, + replaced_graph_def.mutable_node(i)); + } else if (replaced_graph_def.node(i).input_size() == 2) { + if (refs[replaced_graph_def.node(i).input(1)] != 1) { + return errors::Internal( + "Expect axis tensor of ConcatV2 node to only be referenced " + "once."); + } + refs[replaced_graph_def.node(i).input(1)] -= 1; + removed_node_names.push_back(replaced_graph_def.node(i).input(1)); + replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast(); + replaced_graph_def.mutable_node(i)->mutable_attr()->erase("N"); + replaced_graph_def.mutable_node(i)->set_op("Identity"); + } else { + return errors::Internal( + "ConcatV2 should have at least two elements"); + } + } + if ((replaced_graph_def.node(i).op() == "Assign" || + replaced_graph_def.node(i).op() == "Reshape" || + replaced_graph_def.node(i).op() == "Equal" || + replaced_graph_def.node(i).op() == "Mean" || + replaced_graph_def.node(i).op() == "ScalarSummary") && + replaced_graph_def.node(i).input_size() == 1) { + removed_node_names.push_back(replaced_graph_def.node(i).name()); + } + if (!replaced_graph_def.node(i).input_size()) { + removed_node_names.push_back(replaced_graph_def.node(i).name()); + } + } i++; } } @@ -485,17 +582,22 @@ Status SparsifyGather(const GraphDef& input_graph_def, }; // clang-format on + GraphDef cleaned_input_graph_def; + RemoveAttributes(input_graph_def, {"_output_shapes"}, + &cleaned_input_graph_def); + GraphDef temp_output; std::unique_ptr ckpt_reader; TF_RETURN_IF_ERROR(InitializeCheckpointReader(context, &ckpt_reader)); std::unique_ptr > shapes_and_slices; - TF_RETURN_IF_ERROR(ObtainVariableInfo(input_graph_def, &shapes_and_slices)); + TF_RETURN_IF_ERROR( + ObtainVariableInfo(cleaned_input_graph_def, &shapes_and_slices)); - TF_RETURN_IF_ERROR(SparsifyGatherInternal(input_graph_def, shapes_and_slices, - context, gather_pattern, - ckpt_reader, &temp_output)); + TF_RETURN_IF_ERROR(SparsifyGatherInternal( + cleaned_input_graph_def, shapes_and_slices, context, gather_pattern, + ckpt_reader, &temp_output)); TF_RETURN_IF_ERROR(SparsifyGatherInternal(temp_output, shapes_and_slices, context, gather_v2_pattern, diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc index 000568a0cc9..d41321c9a6d 100644 --- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc +++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc @@ -71,7 +71,7 @@ class SparsifyGatherTest : public ::testing::Test { } void TestSinglePartition(bool 
gather_v2, bool include_shared_init, - bool test_variable, + bool test_variable, bool test_kept_concat, const string& shared_init_name = "group_deps") { GraphDef graph_def; @@ -80,6 +80,8 @@ class SparsifyGatherTest : public ::testing::Test { // Build the graph. NodeDef* input_node = CreateNode("ids", "Const", {}, &graph_def); NodeDef* w_node; + NodeDef* zeros_const; + NodeDef* zeros_shape; NodeDef* zeros_node; NodeDef* assign_node; @@ -92,19 +94,27 @@ class SparsifyGatherTest : public ::testing::Test { } else { w_node = CreateNode("w/part_1", "VariableV2", {}, &graph_def); - zeros_node = - CreateNode("w/part_1/Initializer/zeros", "Const", {}, &graph_def); + zeros_shape = CreateNode("w/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const = CreateNode("w/part_1/Initializer/zeros/Const", "Const", {}, + &graph_def); + zeros_node = CreateNode("w/part_1/Initializer/zeros", "Fill", + {zeros_shape, zeros_const}, &graph_def); assign_node = CreateNode("w/part_1/Assign", "Assign", {w_node, zeros_node}, &graph_def); NodeDef* save_const_node = CreateNode("save/Const", "Const", {}, &graph_def); + Tensor tensor_names_values(DT_STRING, TensorShape({1})); + test::FillValues(&tensor_names_values, {"w"}); NodeDef* tensor_names_node = CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def); + SetNodeTensorAttr("value", tensor_names_values, + tensor_names_node); + NodeDef* tensor_shapes_slices_node = CreateNode( "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def); - Tensor shapes_slices_val(DT_STRING, TensorShape({1})); shapes_slices_val.flat()(0) = "4 1 0,4:0,1"; SetNodeTensorAttr("value", shapes_slices_val, @@ -133,6 +143,26 @@ class SparsifyGatherTest : public ::testing::Test { } } + NodeDef* concat_axis_node = + CreateNode("linear/concat/axis", "Const", {}, &graph_def); + NodeDef* concat_input_node = + CreateNode("concat/input/node", "Const", {}, &graph_def); + NodeDef* concat_node = nullptr; + if (!test_kept_concat) { + concat_node = CreateNode( + "concat/node", "ConcatV2", + {identity_node, concat_input_node, concat_axis_node}, &graph_def); + SetNodeAttr("N", 2, concat_node); + } else { + NodeDef* concat_input_node_2 = + CreateNode("concat/input/node_2", "Const", {}, &graph_def); + concat_node = CreateNode("concat/node", "ConcatV2", + {identity_node, concat_input_node, + concat_input_node_2, concat_axis_node}, + &graph_def); + SetNodeAttr("N", 3, concat_node); + } + // Run the op. GraphDef result; TransformFuncContext context; @@ -151,12 +181,32 @@ class SparsifyGatherTest : public ::testing::Test { MapNamesToNodes(result, &node_lookup); // Check nodes. 
+ EXPECT_EQ(0, + node_lookup.count("w/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w/part_1/Assign")); EXPECT_EQ(1, node_lookup.count("ids")); EXPECT_EQ("Const", node_lookup.at("ids")->op()); + EXPECT_EQ(1, node_lookup.count("concat/node")); + + if (!test_kept_concat) { + EXPECT_EQ(0, node_lookup.count("linear/concat/axis")); + EXPECT_EQ("Identity", node_lookup.at("concat/node")->op()); + EXPECT_EQ(1, node_lookup.at("concat/node")->input_size()); + EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0)); + } else { + EXPECT_EQ(1, node_lookup.count("linear/concat/axis")); + EXPECT_EQ("ConcatV2", node_lookup.at("concat/node")->op()); + EXPECT_EQ(3, node_lookup.at("concat/node")->input_size()); + EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0)); + EXPECT_EQ("concat/input/node_2", node_lookup.at("concat/node")->input(1)); + EXPECT_EQ("linear/concat/axis", node_lookup.at("concat/node")->input(2)); + EXPECT_EQ(2, node_lookup.at("concat/node")->attr().at("N").i()); + } + EXPECT_EQ(1, node_lookup.count("w/part_1/indices")); EXPECT_EQ("Const", node_lookup.at("w/part_1/indices")->op()); Tensor expected_indices_tensor(DT_INT64, TensorShape({3})); @@ -247,7 +297,11 @@ class SparsifyGatherTest : public ::testing::Test { // Two partitions NodeDef* w_node1; NodeDef* w_node2; + NodeDef* zeros_const1; + NodeDef* zeros_shape1; NodeDef* zeros_node1; + NodeDef* zeros_const2; + NodeDef* zeros_shape2; NodeDef* zeros_node2; NodeDef* assign_node1; NodeDef* assign_node2; @@ -260,51 +314,53 @@ class SparsifyGatherTest : public ::testing::Test { SetNodeTensorAttr("value", weights, w_node1); SetNodeTensorAttr("value", weights, w_node2); } else { + NodeDef* save_const_node = + CreateNode("save/Const", "Const", {}, &graph_def); + + NodeDef* tensor_names_node = + CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def); + Tensor tensor_names_values(DT_STRING, TensorShape({2})); + test::FillValues(&tensor_names_values, {"w1", "w2"}); + SetNodeTensorAttr("value", tensor_names_values, + tensor_names_node); + + NodeDef* tensor_shapes_slices_node = CreateNode( + "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def); + Tensor shapes_slices_val(DT_STRING, TensorShape({2})); + shapes_slices_val.flat()(0) = "4 1 0,4:0,1"; + shapes_slices_val.flat()(1) = "4 1 0,4:0,1"; + SetNodeTensorAttr("value", shapes_slices_val, + tensor_shapes_slices_node); + + NodeDef* restore_node = CreateNode( + "save/RestoreV2", "RestoreV2", + {save_const_node, tensor_names_node, tensor_shapes_slices_node}, + &graph_def); + w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def); - zeros_node1 = - CreateNode("w1/part_1/Initializer/zeros", "Const", {}, &graph_def); + + zeros_shape1 = CreateNode("w1/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const1 = CreateNode("w1/part_1/Initializer/zeros/Const", "Const", + {}, &graph_def); + zeros_node1 = CreateNode("w1/part_1/Initializer/zeros", "Fill", + {zeros_shape1, zeros_const1}, &graph_def); assign_node1 = CreateNode("w1/part_1/Assign", "Assign", {w_node1, zeros_node1}, &graph_def); - NodeDef* save_const_node = - CreateNode("save/Const", "Const", {}, &graph_def); - NodeDef* tensor_names_node1 = - CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def); - NodeDef* tensor_shapes_slices_node1 = CreateNode( - "save/RestoreV2/shape_and_slices", 
"Const", {}, &graph_def); - - Tensor shapes_slices_val1(DT_STRING, TensorShape({1})); - shapes_slices_val1.flat()(0) = "4 1 0,4:0,1"; - SetNodeTensorAttr("value", shapes_slices_val1, - tensor_shapes_slices_node1); - - NodeDef* restore_node1 = CreateNode( - "save/RestoreV2", "RestoreV2", - {save_const_node, tensor_names_node1, tensor_shapes_slices_node1}, - &graph_def); - CreateNode("save/Assign", "Assign", {w_node1, restore_node1}, &graph_def); + CreateNode("save/Assign", "Assign", {w_node1, restore_node}, &graph_def); w_node2 = CreateNode("w2/part_1", "VariableV2", {}, &graph_def); - zeros_node2 = - CreateNode("w2/part_1/Initializer/zeros", "Const", {}, &graph_def); + zeros_shape2 = CreateNode("w2/part_1/Initializer/zeros/shape_as_tensor", + "Const", {}, &graph_def); + zeros_const2 = CreateNode("w2/part_1/Initializer/zeros/Const", "Const", + {}, &graph_def); + zeros_node2 = CreateNode("w2/part_1/Initializer/zeros", "Fill", + {zeros_shape2, zeros_const2}, &graph_def); assign_node2 = CreateNode("w2/part_1/Assign", "Assign", {w_node2, zeros_node2}, &graph_def); - NodeDef* tensor_names_node2 = - CreateNode("save/RestoreV2_1/tensor_names", "Const", {}, &graph_def); - NodeDef* tensor_shapes_slices_node2 = CreateNode( - "save/RestoreV2_1/shape_and_slices", "Const", {}, &graph_def); - - Tensor shapes_slices_val2(DT_STRING, TensorShape({1})); - shapes_slices_val2.flat()(0) = "4 1 0,4:0,1"; - SetNodeTensorAttr("value", shapes_slices_val2, - tensor_shapes_slices_node2); - - NodeDef* restore_node2 = CreateNode( - "save/RestoreV2_1", "RestoreV2", - {save_const_node, tensor_names_node2, tensor_shapes_slices_node2}, - &graph_def); - CreateNode("save/Assign_1", "Assign", {w_node2, restore_node2}, + CreateNode("save/Assign_1", "Assign", {w_node2, restore_node}, &graph_def); BundleWriter writer(Env::Default(), checkpoint_path); @@ -322,6 +378,13 @@ class SparsifyGatherTest : public ::testing::Test { MakeGather("gather1", gather_v2, identity_node1, input_node, &graph_def); MakeGather("gather2", gather_v2, identity_node2, input_node, &graph_def); + NodeDef* concat_axis_node = + CreateNode("linear/concat/axis", "Const", {}, &graph_def); + NodeDef* concat_node = CreateNode( + "concat/node", "ConcatV2", + {identity_node1, identity_node2, concat_axis_node}, &graph_def); + SetNodeAttr("N", 2, concat_node); + // Shared init node if (include_shared_init) { if (!test_variable) { @@ -350,8 +413,14 @@ class SparsifyGatherTest : public ::testing::Test { MapNamesToNodes(result, &node_lookup); // Check nodes. + EXPECT_EQ(0, + node_lookup.count("w1/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w1/part_1/Assign")); + EXPECT_EQ(0, + node_lookup.count("w2/part_1/Initializer/zeros/shape_as_tensor")); + EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros/Const")); EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros")); EXPECT_EQ(0, node_lookup.count("w2/part_1/Assign")); EXPECT_EQ(1, node_lookup.count("ids")); @@ -487,6 +556,9 @@ class SparsifyGatherTest : public ::testing::Test { node_lookup.at("gather2/LookupTableFind")->input(2)); EXPECT_EQ("gather2/LookupTableFind", node_lookup.at("gather2")->input(0)); + EXPECT_EQ(0, node_lookup.count("linear/concat/axis")); + EXPECT_EQ(0, node_lookup.count("concat/node")); + // Check control deps. 
EXPECT_EQ(2, node_lookup.at(shared_init_name)->input_size()); EXPECT_NE(std::find(node_lookup.at(shared_init_name)->input().begin(), @@ -522,18 +594,31 @@ class SparsifyGatherTest : public ::testing::Test { }; TEST_F(SparsifyGatherTest, TestSinglePartition) { - TestSinglePartition(false, false, false); - TestSinglePartition(false, true, false); - TestSinglePartition(true, false, false); - TestSinglePartition(true, true, false); - TestSinglePartition(false, false, true); - TestSinglePartition(false, true, true); - TestSinglePartition(true, false, true); - TestSinglePartition(true, true, true); - TestSinglePartition(false, true, false, "shared_inits"); - TestSinglePartition(true, true, false, "shared_inits"); - TestSinglePartition(false, true, true, "shared_inits"); - TestSinglePartition(true, true, true, "shared_inits"); + TestSinglePartition(false, false, false, false); + TestSinglePartition(false, true, false, false); + TestSinglePartition(true, false, false, false); + TestSinglePartition(true, true, false, false); + TestSinglePartition(false, false, true, false); + TestSinglePartition(false, true, true, false); + TestSinglePartition(true, false, true, false); + TestSinglePartition(true, true, true, false); + TestSinglePartition(false, true, false, false, "shared_inits"); + TestSinglePartition(true, true, false, false, "shared_inits"); + TestSinglePartition(false, true, true, false, "shared_inits"); + TestSinglePartition(true, true, true, false, "shared_inits"); + + TestSinglePartition(false, false, false, true); + TestSinglePartition(false, true, false, true); + TestSinglePartition(true, false, false, true); + TestSinglePartition(true, true, false, true); + TestSinglePartition(false, false, true, true); + TestSinglePartition(false, true, true, true); + TestSinglePartition(true, false, true, true); + TestSinglePartition(true, true, true, true); + TestSinglePartition(false, true, false, true, "shared_inits"); + TestSinglePartition(true, true, false, true, "shared_inits"); + TestSinglePartition(false, true, true, true, "shared_inits"); + TestSinglePartition(true, true, true, true, "shared_inits"); } TEST_F(SparsifyGatherTest, TestMultiPartition) { diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index dbc81599de8..7717d8d7de2 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -99,6 +99,7 @@ genrule( "//third_party/hadoop:LICENSE.txt", "//third_party/eigen3:LICENSE", "//third_party/fft2d:LICENSE", + "@aws//:LICENSE", "@boringssl//:LICENSE", "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", @@ -114,6 +115,7 @@ genrule( "@libxsmm_archive//:LICENSE", "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", + "@nasm//:LICENSE", "@nsync//:LICENSE", "@png_archive//:LICENSE", "@protobuf_archive//:LICENSE", @@ -134,6 +136,7 @@ genrule( "//third_party/hadoop:LICENSE.txt", "//third_party/eigen3:LICENSE", "//third_party/fft2d:LICENSE", + "@aws//:LICENSE", "@boringssl//:LICENSE", "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", @@ -149,6 +152,7 @@ genrule( "@libxsmm_archive//:LICENSE", "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", + "@nasm//:LICENSE", "@nsync//:LICENSE", "@png_archive//:LICENSE", "@protobuf_archive//:LICENSE", diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index c87a3048fa5..bd2e3adb841 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -89,13 +89,20 @@ filegroup( 
"//third_party/eigen3:LICENSE", "//third_party/fft2d:LICENSE", "//third_party/hadoop:LICENSE.txt", + "@absl_py//absl/flags:LICENSE", + "@arm_neon_2_x86_sse//:LICENSE", + "@astor_archive//:LICENSE", + "@aws//:LICENSE", "@boringssl//:LICENSE", + "@com_google_absl//:LICENSE", "@com_googlesource_code_re2//:LICENSE", "@cub_archive//:LICENSE.TXT", "@curl//:COPYING", "@eigen_archive//:COPYING.MPL2", "@farmhash_archive//:COPYING", "@fft2d//:fft/readme.txt", + "@flatbuffers//:LICENSE.txt", + "@gast_archive//:PKG-INFO", "@gemmlowp//:LICENSE", "@gif_archive//:COPYING", "@grpc//:LICENSE", @@ -106,11 +113,15 @@ filegroup( "@lmdb//:LICENSE", "@local_config_sycl//sycl:LICENSE.text", "@grpc//third_party/nanopb:LICENSE.txt", + "@nasm//:LICENSE", "@nsync//:LICENSE", + "@pcre//:LICENCE", "@png_archive//:LICENSE", "@protobuf_archive//:LICENSE", "@six_archive//:LICENSE", "@snappy//:COPYING", + "@swig//:LICENSE", + "@termcolor_archive//:COPYING.txt", "@zlib_archive//:zlib.h", "@org_python_pypi_backports_weakref//:LICENSE", ] + if_mkl([ @@ -152,9 +163,10 @@ sh_binary( "//tensorflow/contrib/ndlstm:ndlstm", "//tensorflow/contrib/nn:nn_py", "//tensorflow/contrib/predictor:predictor_pip", - "//tensorflow/contrib/py2tf:py2tf_internal", + "//tensorflow/contrib/py2tf:py2tf", "//tensorflow/contrib/py2tf/converters:converters", "//tensorflow/contrib/py2tf/converters:test_lib", + "//tensorflow/contrib/py2tf/impl:impl", "//tensorflow/contrib/py2tf/pyct:pyct", "//tensorflow/contrib/py2tf/pyct/static_analysis:static_analysis", "//tensorflow/contrib/receptive_field:receptive_field_pip", diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py index 38a90073878..73d759eb130 100644 --- a/tensorflow/tools/pip_package/pip_smoke_test.py +++ b/tensorflow/tools/pip_package/pip_smoke_test.py @@ -65,7 +65,6 @@ BLACKLIST = [ "//tensorflow/contrib/framework:checkpoint_ops_testdata", "//tensorflow/contrib/bayesflow:reinforce_simple_example", "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py", # pylint:disable=line-too-long - "//tensorflow/contrib/py2tf:py2tf_internal", "//tensorflow/contrib/timeseries/examples:predict", "//tensorflow/contrib/timeseries/examples:multivariate", "//tensorflow/contrib/timeseries/examples:known_anomaly", diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 62df6453fb5..d7fab2b93ac 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -29,16 +29,17 @@ from setuptools.dist import Distribution # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. -_VERSION = '1.5.0-rc1' +_VERSION = '1.5.0' REQUIRED_PACKAGES = [ 'absl-py >= 0.1.6', 'astor >= 0.6.0', 'gast >= 0.2.0', + 'grpcio >= 1.8.6', 'numpy >= 1.12.1', 'six >= 1.10.0', 'protobuf >= 3.4.0', - 'tensorflow-tensorboard >= 0.4.0', + 'tensorflow-tensorboard >= 1.5.0, < 1.6.0', 'termcolor >= 1.1.0', ] @@ -79,13 +80,13 @@ CONSOLE_SCRIPTS = [ # is now declared by the tensorboard pip package. If we remove the # TensorBoard command, pip will inappropriately remove it during install, # even though the command is not removed, just moved to a different wheel. 
- 'tensorboard = tensorboard.main:main', + 'tensorboard = tensorboard.main:run_main', ] # pylint: enable=line-too-long # remove the tensorboard console script if building tf_nightly if 'tf_nightly' in project_name: - CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:main') + CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:run_main') TEST_PACKAGES = [ 'scipy >= 0.15.1', diff --git a/tensorflow/tools/test/file_name_test.py b/tensorflow/tools/test/file_name_test.py new file mode 100644 index 00000000000..16fb8a822d0 --- /dev/null +++ b/tensorflow/tools/test/file_name_test.py @@ -0,0 +1,48 @@ +#!/usr/bin/python +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Test that checks if we have any issues with case insensitive filesystems. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) +ERROR_MESSAGE = """ +Files with same name but different case detected in directory: {} +""" + + +def main(): + # Make sure BASE_DIR ends with tensorflow. If it doesn't, we probably + # computed the wrong directory. 
+ if os.path.split(BASE_DIR)[-1] != 'tensorflow': + raise AssertionError( + "BASE_DIR = '%s' doesn't end with tensorflow" % BASE_DIR) + + for dirpath, dirnames, filenames in os.walk(BASE_DIR, followlinks=True): + lowercase_directories = [x.lower() for x in dirnames] + lowercase_files = [x.lower() for x in filenames] + + lowercase_dir_contents = lowercase_directories + lowercase_files + if len(lowercase_dir_contents) != len(set(lowercase_dir_contents)): + raise AssertionError(ERROR_MESSAGE.format(dirpath)) + + +if __name__ == '__main__': + main() diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py index a953ed1b53d..3b4921bb983 100644 --- a/tensorflow/tools/test/run_and_gather_logs_lib.py +++ b/tensorflow/tools/test/run_and_gather_logs_lib.py @@ -136,7 +136,7 @@ def run_and_gather_logs(name, test_name, test_args, gpu_config = gpu_info_lib.gather_gpu_devices() if gpu_config: gpu_name = gpu_config[0].model - gpu_short_name_match = re.search(r"Tesla (K40|K80|P100)", gpu_name) + gpu_short_name_match = re.search(r"Tesla (K40|K80|P100|V100)", gpu_name) if gpu_short_name_match: gpu_short_name = gpu_short_name_match.group(0) test_adjusted_name = name + "|" + gpu_short_name.replace(" ", "_") diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f7d9075032e..eca744a920c 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -114,16 +114,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""): ], sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478", strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f", + build_file = str(Label("//third_party:com_google_absl.BUILD")), ) tf_http_archive( name = "eigen_archive", urls = [ - "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/14e1418fcf12.tar.gz", - "https://bitbucket.org/eigen/eigen/get/14e1418fcf12.tar.gz", + "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz", + "https://bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz", ], - sha256 = "2b526c6888639025323fd4f2600533c0f982d304ea48e4f1663e8066bd9f6368", - strip_prefix = "eigen-eigen-14e1418fcf12", + sha256 = "0cadb31a35b514bf2dfd6b5d38205da94ef326ec6908fc3fd7c269948467214f", + strip_prefix = "eigen-eigen-2355b229ea4c", build_file = str(Label("//third_party:eigen.BUILD")), ) @@ -352,16 +353,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "protobuf_archive", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", - "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", + "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", ], - sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a", - strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9", - # TODO: remove patching when tensorflow stops linking same protos into - # multiple shared libraries loaded in runtime by python. 
- # This patch fixes a runtime crash when tensorflow is compiled - # with clang -O2 on Linux (see https://github.com/tensorflow/tensorflow/issues/8394) - patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")), + sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3", + strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a", ) # We need to import the protobuf library under the names com_google_protobuf @@ -370,21 +366,21 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "com_google_protobuf", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", - "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", + "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", ], - sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a", - strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9", + sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3", + strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a", ) tf_http_archive( name = "com_google_protobuf_cc", urls = [ - "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", - "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz", + "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", + "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz", ], - sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a", - strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9", + sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3", + strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a", ) tf_http_archive( @@ -477,11 +473,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): tf_http_archive( name = "llvm", urls = [ - "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz", - "https://github.com/llvm-mirror/llvm/archive/11a2ca6eea8a7fe240a14c0c35fd2017341279be.tar.gz", + "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/299f8c346e1ab483463da5f02536ffd00b7ad9c6.tar.gz", + "https://github.com/llvm-mirror/llvm/archive/299f8c346e1ab483463da5f02536ffd00b7ad9c6.tar.gz", ], - sha256 = "b5429ccf8d57273cb8489714f728c997cd720ec66fc2c0292422ab8f0e729ce0", - strip_prefix = "llvm-11a2ca6eea8a7fe240a14c0c35fd2017341279be", + sha256 = "0556bc6a85000c573d92fe00946b6418cbcd3844912696a81055e4768299dda4", + strip_prefix = "llvm-299f8c346e1ab483463da5f02536ffd00b7ad9c6", build_file = str(Label("//third_party/llvm:llvm.BUILD")), ) @@ -560,6 +556,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""): build_file = str(Label("//third_party:nccl.BUILD")), ) + tf_http_archive( + name = "kafka", + urls = [ + "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz", + "https://github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz", + ], + sha256 = "dd035d57c8f19b0b612dd6eefe6e5eebad76f506e302cccb7c2066f25a83585e", + strip_prefix = "librdkafka-0.11.1", + build_file = str(Label("//third_party:kafka/BUILD")), + patch_file = 
str(Label("//third_party/kafka:config.patch")), + ) + tf_http_archive( name = "aws", urls = [ diff --git a/third_party/com_google_absl.BUILD b/third_party/com_google_absl.BUILD new file mode 100644 index 00000000000..8fca145f751 --- /dev/null +++ b/third_party/com_google_absl.BUILD @@ -0,0 +1,5 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache + +exports_files(["LICENSE"]) diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD index f6b8e6ddb05..824c97be60e 100644 --- a/third_party/flatbuffers/flatbuffers.BUILD +++ b/third_party/flatbuffers/flatbuffers.BUILD @@ -4,6 +4,8 @@ package( licenses(["notice"]) # Apache 2.0 +exports_files(["LICENSE.txt"]) + config_setting( name = "freebsd", values = {"cpu": "freebsd"}, diff --git a/third_party/gast.BUILD b/third_party/gast.BUILD index 06db528ada2..4866982e1fd 100644 --- a/third_party/gast.BUILD +++ b/third_party/gast.BUILD @@ -3,7 +3,7 @@ licenses(["notice"]) # BSD 3-clause -exports_files(["LICENSE"]) +exports_files(["PKG-INFO"]) py_library( name = "gast", diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 47cc4a5e69b..e23a533716a 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -835,7 +835,7 @@ def symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name, if src_dir != None: src_dir = _norm_path(src_dir) dest_dir = _norm_path(dest_dir) - files = _read_dir(repository_ctx, src_dir) + files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines())) # Create a list with the src_dir stripped to use for outputs. dest_files = files.replace(src_dir, '').splitlines() src_files = files.splitlines() diff --git a/third_party/kafka/BUILD b/third_party/kafka/BUILD new file mode 100644 index 00000000000..a61a9e1f6c2 --- /dev/null +++ b/third_party/kafka/BUILD @@ -0,0 +1,147 @@ +# Description: +# Kafka C/C++ (librdkafka) client library + +licenses(["notice"]) # 2-clause BSD license + +exports_files(["LICENSE"]) + +cc_library( + name = "kafka", + srcs = [ + "config.h", + "src-cpp/ConfImpl.cpp", + "src-cpp/ConsumerImpl.cpp", + "src-cpp/HandleImpl.cpp", + "src-cpp/KafkaConsumerImpl.cpp", + "src-cpp/MessageImpl.cpp", + "src-cpp/MetadataImpl.cpp", + "src-cpp/QueueImpl.cpp", + "src-cpp/RdKafka.cpp", + "src-cpp/TopicImpl.cpp", + "src-cpp/TopicPartitionImpl.cpp", + "src/crc32c.c", + "src/crc32c.h", + "src/lz4.c", + "src/lz4.h", + "src/lz4frame.c", + "src/lz4frame.h", + "src/lz4frame_static.h", + "src/lz4hc.c", + "src/lz4hc.h", + "src/lz4opt.h", + "src/queue.h", + "src/rd.h", + "src/rdaddr.c", + "src/rdaddr.h", + "src/rdatomic.h", + "src/rdavg.h", + "src/rdavl.c", + "src/rdavl.h", + "src/rdbuf.c", + "src/rdbuf.h", + "src/rdcrc32.h", + "src/rddl.h", + "src/rdendian.h", + "src/rdgz.c", + "src/rdgz.h", + "src/rdinterval.h", + "src/rdkafka.c", + "src/rdkafka.h", + "src/rdkafka_assignor.c", + "src/rdkafka_assignor.h", + "src/rdkafka_broker.c", + "src/rdkafka_broker.h", + "src/rdkafka_buf.c", + "src/rdkafka_buf.h", + "src/rdkafka_cgrp.c", + "src/rdkafka_cgrp.h", + "src/rdkafka_conf.c", + "src/rdkafka_conf.h", + "src/rdkafka_event.h", + "src/rdkafka_feature.c", + "src/rdkafka_feature.h", + "src/rdkafka_int.h", + "src/rdkafka_interceptor.c", + "src/rdkafka_interceptor.h", + "src/rdkafka_lz4.c", + "src/rdkafka_lz4.h", + "src/rdkafka_metadata.c", + "src/rdkafka_metadata.h", + "src/rdkafka_metadata_cache.c", + "src/rdkafka_msg.c", + "src/rdkafka_msg.h", + "src/rdkafka_msgset.h", + 
"src/rdkafka_msgset_reader.c", + "src/rdkafka_msgset_writer.c", + "src/rdkafka_offset.c", + "src/rdkafka_offset.h", + "src/rdkafka_op.c", + "src/rdkafka_op.h", + "src/rdkafka_partition.c", + "src/rdkafka_partition.h", + "src/rdkafka_pattern.c", + "src/rdkafka_pattern.h", + "src/rdkafka_proto.h", + "src/rdkafka_queue.c", + "src/rdkafka_queue.h", + "src/rdkafka_range_assignor.c", + "src/rdkafka_request.c", + "src/rdkafka_request.h", + "src/rdkafka_roundrobin_assignor.c", + "src/rdkafka_sasl.c", + "src/rdkafka_sasl.h", + "src/rdkafka_sasl_int.h", + "src/rdkafka_sasl_plain.c", + "src/rdkafka_subscription.c", + "src/rdkafka_subscription.h", + "src/rdkafka_timer.c", + "src/rdkafka_timer.h", + "src/rdkafka_topic.c", + "src/rdkafka_topic.h", + "src/rdkafka_transport.c", + "src/rdkafka_transport.h", + "src/rdkafka_transport_int.h", + "src/rdlist.c", + "src/rdlist.h", + "src/rdlog.c", + "src/rdlog.h", + "src/rdports.c", + "src/rdports.h", + "src/rdposix.h", + "src/rdrand.c", + "src/rdrand.h", + "src/rdregex.c", + "src/rdregex.h", + "src/rdstring.c", + "src/rdstring.h", + "src/rdsysqueue.h", + "src/rdtime.h", + "src/rdtypes.h", + "src/rdunittest.c", + "src/rdunittest.h", + "src/rdvarint.c", + "src/rdvarint.h", + "src/snappy.c", + "src/snappy.h", + "src/tinycthread.c", + "src/tinycthread.h", + "src/xxhash.c", + "src/xxhash.h", + ], + hdrs = [ + "config.h", + ], + defines = [ + ], + includes = [ + "src", + "src-cpp", + ], + linkopts = [ + "-lpthread", + ], + visibility = ["//visibility:public"], + deps = [ + "@boringssl//:ssl", + ], +) diff --git a/third_party/kafka/config.patch b/third_party/kafka/config.patch new file mode 100644 index 00000000000..fa5c2d35b40 --- /dev/null +++ b/third_party/kafka/config.patch @@ -0,0 +1,44 @@ +diff -Naur a/config.h b/config.h +--- a/config.h 1970-01-01 00:00:00.000000000 +0000 ++++ b/config.h 2017-10-28 00:57:03.316957390 +0000 +@@ -0,0 +1,40 @@ ++#pragma once ++#define WITHOUT_OPTIMIZATION 0 ++#define ENABLE_DEVEL 0 ++#define ENABLE_REFCNT_DEBUG 0 ++#define ENABLE_SHAREDPTR_DEBUG 0 ++ ++#define HAVE_ATOMICS_32 1 ++#define HAVE_ATOMICS_32_SYNC 1 ++ ++#if (HAVE_ATOMICS_32) ++# if (HAVE_ATOMICS_32_SYNC) ++# define ATOMIC_OP32(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL) ++# else ++# define ATOMIC_OP32(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST) ++# endif ++#endif ++ ++#define HAVE_ATOMICS_64 1 ++#define HAVE_ATOMICS_64_SYNC 1 ++ ++#if (HAVE_ATOMICS_64) ++# if (HAVE_ATOMICS_64_SYNC) ++# define ATOMIC_OP64(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL) ++# else ++# define ATOMIC_OP64(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST) ++# endif ++#endif ++ ++ ++#define WITH_ZLIB 1 ++#define WITH_LIBDL 1 ++#define WITH_PLUGINS 0 ++#define WITH_SNAPPY 1 ++#define WITH_SOCKEM 1 ++#define WITH_SSL 1 ++#define WITH_SASL 0 ++#define WITH_SASL_SCRAM 0 ++#define WITH_SASL_CYRUS 0 ++#define HAVE_REGEX 1 ++#define HAVE_STRNDUP 1 diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD index 5344525ba8b..a9e1341a03c 100644 --- a/third_party/llvm/llvm.BUILD +++ b/third_party/llvm/llvm.BUILD @@ -670,6 +670,28 @@ cc_library( ], ) +cc_library( + name = "aggressive_inst_combine", + srcs = glob([ + "lib/Transforms/AggressiveInstCombine/*.c", + "lib/Transforms/AggressiveInstCombine/*.cpp", + "lib/Transforms/AggressiveInstCombine/*.inc", + "lib/Transforms/AggressiveInstCombine/*.h", + ]), + hdrs = glob([ + "include/llvm/Transforms/AggressiveInstCombine/*.h", + 
"include/llvm/Transforms/AggressiveInstCombine/*.def", + "include/llvm/Transforms/AggressiveInstCombine/*.inc", + ]), + deps = [ + ":analysis", + ":config", + ":core", + ":support", + ":transform_utils", + ], +) + cc_library( name = "analysis", srcs = glob([ @@ -1405,6 +1427,7 @@ cc_library( "include/llvm/Transforms/IPO/*.inc", ]), deps = [ + ":aggressive_inst_combine", ":analysis", ":bit_reader", ":bit_writer", @@ -1931,6 +1954,7 @@ cc_library( "include/llvm/Transforms/IPO/SCCP.h", ]), deps = [ + ":aggressive_inst_combine", ":analysis", ":config", ":core", diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD index e2cdec40295..3a8e7a10b43 100644 --- a/third_party/pcre.BUILD +++ b/third_party/pcre.BUILD @@ -1,6 +1,6 @@ licenses(["notice"]) # BSD -exports_files(["COPYING"]) +exports_files(["LICENCE"]) cc_library( name = "pcre", diff --git a/third_party/protobuf/add_noinlines.patch b/third_party/protobuf/add_noinlines.patch deleted file mode 100644 index af74798f067..00000000000 --- a/third_party/protobuf/add_noinlines.patch +++ /dev/null @@ -1,30 +0,0 @@ -diff -u -r a/src/google/protobuf/compiler/cpp/cpp_file.cc b/src/google/protobuf/compiler/cpp/cpp_file.cc ---- a/src/google/protobuf/compiler/cpp/cpp_file.cc 2017-02-10 23:55:34.000000000 +0100 -+++ b/src/google/protobuf/compiler/cpp/cpp_file.cc 2017-03-21 13:41:46.931979154 +0100 -@@ -557,7 +557,7 @@ - " $metadata$, $enum_descriptors$, $service_descriptors$);\n" - "}\n" - "\n" -- "void protobuf_AssignDescriptorsOnce() {\n" -+ "GOOGLE_ATTRIBUTE_NOINLINE void protobuf_AssignDescriptorsOnce() {\n" - " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" - " ::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);\n" - "}\n" -@@ -656,7 +656,7 @@ - printer->Print( - "}\n" - "\n" -- "void InitDefaults() {\n" -+ "GOOGLE_ATTRIBUTE_NOINLINE void InitDefaults() {\n" - " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" - " ::google::protobuf::GoogleOnceInit(&once, &TableStruct::InitDefaultsImpl);\n" - "}\n"); -@@ -737,7 +737,7 @@ - printer->Print( - "}\n" - "\n" -- "void AddDescriptors() {\n" -+ "GOOGLE_ATTRIBUTE_NOINLINE void AddDescriptors() {\n" - " static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n" - " ::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);\n" - "}\n"); diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index c16eb3a12a8..954f21f5f8f 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -118,7 +118,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name, if src_dir != None: src_dir = _norm_path(src_dir) dest_dir = _norm_path(dest_dir) - files = _read_dir(repository_ctx, src_dir) + files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines())) # Create a list with the src_dir stripped to use for outputs. 
dest_files = files.replace(src_dir, '').splitlines() src_files = files.splitlines() diff --git a/third_party/repo.bzl b/third_party/repo.bzl index 11e9c842d2f..aa178fa8cab 100644 --- a/third_party/repo.bzl +++ b/third_party/repo.bzl @@ -27,7 +27,7 @@ def _wrap_bash_cmd(ctx, cmd): bazel_sh = _get_env_var(ctx, "BAZEL_SH") if not bazel_sh: fail("BAZEL_SH environment variable is not set") - cmd = [bazel_sh, "-c", " ".join(cmd)] + cmd = [bazel_sh, "-l", "-c", " ".join(cmd)] return cmd def _get_env_var(ctx, name): diff --git a/third_party/termcolor.BUILD b/third_party/termcolor.BUILD index 6000e3289de..655d7cb85e5 100644 --- a/third_party/termcolor.BUILD +++ b/third_party/termcolor.BUILD @@ -3,7 +3,7 @@ licenses(["notice"]) # MIT -exports_files(["LICENSE"]) +exports_files(["COPYING.txt"]) py_library( name = "termcolor",
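
The largest functional change in this patch is in `tensorflow/tools/graph_transforms/sparsify_gather.cc`: instead of repeatedly rescanning the graph for orphaned variables, `SparsifyGatherInternal` now keeps a per-node reference count and sweeps any node whose count drops to zero (skipping declared graph inputs/outputs and `RestoreV2` nodes). The sketch below restates that idea in plain Python on a toy graph; it is only an illustration of the technique, not the patch's C++, and every name in it is hypothetical.

```python
def sweep_dead_nodes(graph, removed, keep):
    """Delete the nodes in `removed`, plus anything whose reference count
    drops to zero as a result, never touching names listed in `keep`."""
    # Count how many times each node is referenced as an input
    # ("^name" marks a control dependency, as in a GraphDef).
    refs = {}
    for inputs in graph.values():
        for inp in inputs:
            refs[inp.lstrip("^")] = refs.get(inp.lstrip("^"), 0) + 1

    worklist = list(removed)
    while worklist:
        name = worklist.pop()
        if name in keep or name not in graph:
            continue
        # Removing this node releases one reference on each of its inputs;
        # inputs that reach zero references are swept as well.
        for inp in graph.pop(name):
            inp = inp.lstrip("^")
            refs[inp] = refs.get(inp, 0) - 1
            if refs[inp] == 0 and inp not in keep:
                worklist.append(inp)
    return graph


# Toy example: deleting the Assign also sweeps the now-unreferenced initializer.
g = {"w": [], "zeros": [], "assign": ["w", "zeros"], "out": ["w"]}
print(sweep_dead_nodes(g, removed=["assign"], keep={"out"}))
# -> {'w': [], 'out': ['w']}
```

The C++ in the patch additionally patches up nodes that lose inputs (for example rewriting a two-input `ConcatV2` to `Identity`), which the sketch above deliberately omits.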
| Version | CPU/GPU | Python Version | Compiler | Build Tools | cuDNN | CUDA |
|---------|---------|----------------|----------|-------------|-------|------|
| tensorflow-1.5.0-rc1 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.5.0-rc1 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9 |
| tensorflow-1.5.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.5.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 7 | 9 |
| tensorflow-1.4.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
| tensorflow_gpu-1.4.0 | GPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | 6 | 8 |
| tensorflow-1.3.0 | CPU | 3.5-3.6 | MSVC 2015 update 3 | Cmake v3.6.3 | N/A | N/A |
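
As a quick check against the table above, an installed package can report which configuration it corresponds to; this is only a minimal sketch using standard TensorFlow 1.x APIs:

```python
import tensorflow as tf

print(tf.VERSION)                    # e.g. '1.5.0'
print(tf.test.is_built_with_cuda())  # True only for the tensorflow_gpu builds above
```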