diff --git a/configure.py b/configure.py index 186fdc9ddce..fcf359d061d 100644 --- a/configure.py +++ b/configure.py @@ -685,10 +685,12 @@ def set_tf_cunn_version(environ_cp): ldconfig_bin = which('ldconfig') or '/sbin/ldconfig' cudnn_path_from_ldconfig = run_shell([ldconfig_bin, '-p']) cudnn_path_from_ldconfig = re.search('.*libcudnn.so .* => (.*)', - cudnn_path_from_ldconfig).group(1) - if os.path.exists('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)): - cudnn_install_path = os.path.dirname(cudnn_path_from_ldconfig) - break + cudnn_path_from_ldconfig) + if cudnn_path_from_ldconfig: + cudnn_path_from_ldconfig = cudnn_path_from_ldconfig.group(1) + if os.path.exists('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)): + cudnn_install_path = os.path.dirname(cudnn_path_from_ldconfig) + break # Reset and Retry print( diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 20e5c573c6b..5b6a18b6a69 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -296,6 +296,7 @@ filegroup( "//tensorflow/contrib/ffmpeg/default:all_files", "//tensorflow/contrib/framework:all_files", "//tensorflow/contrib/fused_conv:all_files", + "//tensorflow/contrib/gan:all_files", "//tensorflow/contrib/graph_editor:all_files", "//tensorflow/contrib/grid_rnn:all_files", "//tensorflow/contrib/hooks:all_files", @@ -323,6 +324,7 @@ filegroup( "//tensorflow/contrib/nn:all_files", "//tensorflow/contrib/opt:all_files", "//tensorflow/contrib/predictor:all_files", + "//tensorflow/contrib/receptive_field:all_files", "//tensorflow/contrib/reduce_slice_ops:all_files", "//tensorflow/contrib/remote_fused_graph/pylib:all_files", "//tensorflow/contrib/resampler:all_files", @@ -342,6 +344,7 @@ filegroup( "//tensorflow/contrib/staging:all_files", "//tensorflow/contrib/stat_summarizer:all_files", "//tensorflow/contrib/stateless:all_files", + "//tensorflow/contrib/summary:all_files", "//tensorflow/contrib/tensor_forest:all_files", "//tensorflow/contrib/tensor_forest/hybrid:all_files", "//tensorflow/contrib/tensor_forest/kernels/v4:all_files", diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 604dfab148b..1822e235eba 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -45,8 +45,13 @@ tf_cuda_library( tf_cuda_library( name = "c_api", - srcs = ["c_api.cc"], - hdrs = ["c_api.h"], + srcs = [ + "c_api.cc", + "c_api_function.cc", + ], + hdrs = [ + "c_api.h", + ], copts = tf_copts(), visibility = ["//visibility:public"], deps = select({ @@ -157,6 +162,21 @@ tf_cc_test( ], ) +tf_cc_test( + name = "c_api_function_test", + size = "small", + srcs = ["c_api_function_test.cc"], + deps = [ + ":c_api", + ":c_test_util", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "while_loop_test", size = "small", diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 07c8277a6f2..c454c94249b 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -165,22 +165,6 @@ void deallocate_buffer(void* data, size_t len, void* arg) { tensorflow::cpu_allocator()->DeallocateRaw(data); } -Status MessageToBuffer(const tensorflow::protobuf::Message& in, - TF_Buffer* out) { - if (out->data != nullptr) { - return InvalidArgument("Passing non-empty TF_Buffer is invalid."); - } - const auto proto_size = in.ByteSizeLong(); - void* buf = tensorflow::port::Malloc(proto_size); - in.SerializeToArray(buf, proto_size); - out->data = buf; - out->length = proto_size; - out->data_deallocator = [](void* data, 
size_t length) { - tensorflow::port::Free(data); - }; - return Status::OK(); -} - } // namespace TF_Tensor::~TF_Tensor() { buffer->Unref(); } @@ -559,6 +543,27 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src, dimvec.size(), base, size, DeleteArray, base); } +Status MessageToBuffer(const tensorflow::protobuf::Message& in, + TF_Buffer* out) { + if (out->data != nullptr) { + return InvalidArgument("Passing non-empty TF_Buffer is invalid."); + } + const size_t proto_size = in.ByteSizeLong(); + void* buf = tensorflow::port::Malloc(proto_size); + if (buf == nullptr) { + return tensorflow::errors::ResourceExhausted( + "Failed to allocate memory to serialize message of type '", + in.GetTypeName(), "' and size ", proto_size); + } + in.SerializeToArray(buf, proto_size); + out->data = buf; + out->length = proto_size; + out->data_deallocator = [](void* data, size_t length) { + tensorflow::port::Free(data); + }; + return Status::OK(); +} + // Helpers for loading a TensorFlow plugin (a .so file). Status LoadLibrary(const char* library_filename, void** result, const void** buf, size_t* len); diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 43b50780137..ee110d88cea 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -357,6 +357,14 @@ typedef struct TF_Output { int index; // The index of the output within oper. } TF_Output; +// TF_Function is a grouping of operations with defined inputs and outputs. +// Once created and added to graphs, functions can be invoked by creating an +// operation whose operation type matches the function name. +typedef struct TF_Function TF_Function; + +// Function definition options. TODO(iga): Define and implement +typedef struct TF_FunctionOptions TF_FunctionOptions; + // Sets the shape of the Tensor referenced by `output` in `graph` to // the shape described by `dims` and `num_dims`. // @@ -914,6 +922,15 @@ TF_CAPI_EXPORT extern void TF_GraphImportGraphDef( TF_Graph* graph, const TF_Buffer* graph_def, const TF_ImportGraphDefOptions* options, TF_Status* status); +// Add `function` to graph `g`. Once `function` is added to `g`, +// it can be called by creating an operation using the function's name. +// +// If successful, status is set to OK and function is added to g +// Otherwise, status is set to the encountered error and g is unmodified +TF_CAPI_EXPORT extern void TF_GraphAddFunction(TF_Graph* g, + const TF_Function* function, + TF_Status* status); + // Note: The following function may fail on very large protos in the future. TF_CAPI_EXPORT extern void TF_OperationToNodeDef(TF_Operation* oper, @@ -1001,6 +1018,105 @@ TF_CAPI_EXPORT void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx, TF_Output* dx, TF_Status* status, TF_Output* dy); +// Create a TF_Function from a TF_Graph +// +// Params: +// fn_body - the graph whose operations (or subset of whose operations) will be +// converted to TF_Function. +// fn_name - the name of the new TF_Function. Should match the operation +// name (OpDef.name) regexp [A-Z][A-Za-z0-9_.\\-/]* and be distinct +// from other operation names (at least those registered in graphs +// where this function will be used). +// TODO(iga): Allow null in here and have C API come up with +// a unique name with high probability (similarly to +// _create_hash_str in function.py) +// num_opers - `num_opers` contains the number of elements in the `opers` array +// or a special value of -1 meaning that no array is given. 
+// The distinction between an empty array of operations and no +// array of operations is necessary to distinguish the case of +// creating a function with no body (e.g. identity or permutation) +// and the case of creating a function whose body contains all +// the nodes in the graph (except for the automatic skipping, see +// below). +// opers - Array of operations to become the body of the function or null. +// - If no array is given (`num_opers` = -1), all the +// operations in `fn_body` will become part of the function +// except operations referenced in `inputs`. These operations +// must have a single output (these operations are typically +// placeholders created for the sole purpose of representing +// an input. We can relax this constraint if there are +// compelling use cases). +// - If an array is given (`num_opers` >= 0), all operations +// in it will become part of the function. In particular, no +// automatic skipping of dummy input operations is performed. +// ninputs - number of elements in `inputs` array +// inputs - array of TF_Outputs that specify the inputs to the function. +// If `ninputs` is zero (the function takes no inputs), `inputs` +// can be null. The names used for function inputs are normalized +// names of the operations (usually placeholders) pointed to by +// `inputs`. These operation names should start with a letter. +// Normalization will convert all letters to lowercase and +// non-alphanumeric characters to '_' to make resulting names match +// the "[a-z][a-z0-9_]*" pattern for operation argument names. +// `inputs` cannot contain the same tensor twice. +// noutputs - number of elements in `outputs` array +// outputs - array of TF_Outputs that specify the outputs of the function. +// If `noutputs` is zero (the function returns no outputs), `outputs` +// can be null. `outputs` can contain the same tensor more than once. +// output_names - The names of the function's outputs. `output_names` array +// must either have the same length as `outputs` +// (i.e. `noutputs`) or be null. In the former case, +// the names should match the regular expression for ArgDef +// names - "[a-z][a-z0-9_]*". In the latter case, +// names for outputs will be generated automatically. +// opts - various options for the function, e.g. XLA's inlining control. +// status - Set to OK on success and an appropriate error on failure. +// +// Note that when the same TF_Output is listed as both an input and an output, +// the corresponding function's output will equal to this input, +// instead of the original node's output. +// +// Callers must also satisfy the following constraints: +// - `inputs` cannot refer to TF_Outputs within a control flow context. For +// example, one cannot use the output of "switch" node as input. +// - No TF_Output of a function (inside any of `inputs`, `outputs`, `fn_body`) +// is allowed to have a reference type. Reference types are not exposed +// through C API and are being deprecated. +// - Every node in the function's body must have all of its inputs (including +// control inputs). In other words, for every node in the body, each input +// must be either listed in `inputs` or must come from another node in +// the body. In particular, it is an error to have a control edge going from +// a node outside of the body into a node in the body. This applies to control +// edges going from nodes referenced in `inputs` to nodes in the body when +// the former nodes are not in the body (automatically skipped or not +// included in explicitly specified body). 
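+//
+// To illustrate how these parameters fit together, a rough sketch (not part
+// of the API contract; `g`, `host_graph`, `ph_op`, `neg_op` and `status` are
+// assumed to already exist, and error checking is elided) might look like:
+//
+//   TF_Output inputs[] = {{ph_op, 0}};
+//   TF_Output outputs[] = {{neg_op, 0}};
+//   TF_Function* fn =
+//       TF_GraphToFunction(g, "NegFn", -1, nullptr, 1, inputs, 1, outputs,
+//                          nullptr, /*opts=*/nullptr, status);
+//   TF_GraphAddFunction(host_graph, fn, status);
+//   TF_DeleteFunction(fn);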
+// +// Returns: +// On successful, a newly created TF_Function instance. It must be deleted by +// calling TF_DeleteFunction. +// +// On failure, null. +// +// TODO(iga): Add input_names argument and get output_names working (they are +// currently ignored) +TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunction( + const TF_Graph* fn_body, const char* fn_name, int num_opers, + const TF_Operation* const* opers, int ninputs, const TF_Output* inputs, + int noutputs, const TF_Output* outputs, const char* const* output_names, + const TF_FunctionOptions* opts, TF_Status* status); + +// Write out a serialized representation of `func` (as a FunctionDef protocol +// message) to `output_func_def` (allocated by TF_NewBuffer()). +// `output_func_def`'s underlying buffer will be freed when TF_DeleteBuffer() +// is called. +// +// May fail on very large graphs in the future. +TF_CAPI_EXPORT extern void TF_FunctionToFunctionDef(TF_Function* func, + TF_Buffer* output_func_def, + TF_Status* status); + +TF_CAPI_EXPORT extern void TF_DeleteFunction(TF_Function*); + // TODO(josh11b): Register OpDef, available to all operations added // to this graph. diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc new file mode 100644 index 00000000000..b4c6397d0b4 --- /dev/null +++ b/tensorflow/c/c_api_function.cc @@ -0,0 +1,496 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/c_api_internal.h" + +#include +#include +#include + +#include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace tensorflow { +namespace { + +// Class that maintains a one-to-one original node name -> new node name +// mapping. We normalize the names used as input and output arguments to match +// regexp "[a-z][a-z0-9_]*" specified in definition of ArgDef.name. +// Once we rename them, we risk creating a name collision with the other +// node names, so if necessary we add a suffix to make +// names unique. If we have an input named "A" and a node in the function +// body named "a", they will be renamed to "a" and "a_0". +class NodeNameMapping { + public: + NodeNameMapping() = default; + + // Normalize the input/output name and make it unique. + string GetIOName(const string& name); + + // Make the node name unique. + string Uniquify(const string& name); + + // Look up how a node name was previously normalized/uniquified. + // Returns empty if name was never seen. 
+ string Lookup(const string& name) const; + + private: + string UniquifyHelper(const string& name) const; + static string Normalize(string name); + + // The normalized/uniquified names already used as + // input names (in signature), output names (in signature), and node names + // (in node_def). + // This is a superset of values in name_mapping_. + std::unordered_set used_names_; + // Mapping from original node name from the graph to the normalized + // and uniqified version of it. + std::unordered_map name_mapping_; +}; + +string NodeNameMapping::Normalize(string name) { + // Convert letters to lowercase and non-alphanumeric characters to '_'. + if (name.empty()) return "unknown"; + const int n = name.size(); + for (int i = 0; i < n; ++i) { + char c = name[i]; + if (isalnum(c)) { + if (isupper(c)) { + name[i] = tolower(c); + } + } else { + name[i] = '_'; + } + } + + // Find the first letter and start with it. + int i = 0; + for (; i < n; ++i) { + if (isalpha(name[i])) break; + } + + // Return "unknown" if none of the name's chars were letters. + return i == n ? "unknown" : name.substr(i); +} + +string NodeNameMapping::UniquifyHelper(const string& name) const { + // If the name hasn't been used yet, use it as-is. + if (used_names_.find(name) == used_names_.end()) return name; + // Add a suffix to name to make it unique. + for (int i = 0;; ++i) { + const string candidate = strings::StrCat(name, "_", i); + if (used_names_.find(candidate) == used_names_.end()) return candidate; + } +} + +string NodeNameMapping::GetIOName(const string& name) { + const string& input_name = UniquifyHelper(Normalize(name)); + // Record that we used this name, but don't add it to name_mapping_ + // since this name is not for a node. + used_names_.insert(input_name); + return input_name; +} + +string NodeNameMapping::Uniquify(const string& name) { + const string uniqued = UniquifyHelper(name); + name_mapping_[name] = uniqued; + used_names_.insert(uniqued); + return uniqued; +} + +string NodeNameMapping::Lookup(const string& name) const { + const auto iter = name_mapping_.find(name); + if (iter == name_mapping_.end()) return string(); + return iter->second; +} + +Status ValidateNoRefOutputs(const Node* node) { + for (int i = 0; i < node->num_outputs(); ++i) { + const DataType& dt = node->output_type(i); + if (IsRefType(dt)) { + return errors::InvalidArgument("Output ", i, " of node '", node->name(), + "' has a reference " + "type ", + DataTypeString(dt)); + } + } + return Status::OK(); +} + +Status FillFunctionBody( + const string& fn_name, const NodeNameMapping& node_names, + const std::vector& body_nodes, + const std::unordered_map& tensor_renaming, + FunctionDef* fdef) { + std::vector in_edges; + std::vector control_edges; + for (const Node* node : body_nodes) { + NodeDef* node_def = fdef->add_node_def(); + // First, copy the node_def as is. We will patch it next. + *node_def = node->def(); + if (!node->assigned_device_name().empty()) { + node_def->set_device(node->assigned_device_name()); + } + node_def->set_name(node_names.Lookup(node->name())); + + // Input names must be set based on nested names in tensor_renaming. + // Clear the flat input names we got from the original node_def + // from the graph. + node_def->clear_input(); + + // Collect regular and control inputs. Regular inputs are indexed + // by the index at which they come into the `node`. Control inputs + // don't follow any order. 
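+ // For example (illustrative): a node consuming a:0 and b:1, with a control
+ // edge from c, ends up with in_edges = [edge(a:0), edge(b:1)] and
+ // control_edges = [edge(c)].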
+ in_edges.clear(); + in_edges.resize(node->num_inputs(), nullptr); + control_edges.clear(); + for (const Edge* edge : node->in_edges()) { + if (edge->src()->IsSource()) continue; + if (edge->IsControlEdge()) { + control_edges.push_back(edge); + } else { + in_edges[edge->dst_input()] = edge; + } + } + + // Add regular inputs. + for (size_t i = 0; i < in_edges.size(); ++i) { + const Edge* edge = in_edges[i]; + string original_input_name; + if (edge == nullptr) { + // A backedge might not appear as a regular Edge, but be only present + // in the node_def. Such edges are referred to as requested_inputs(). + if (i >= node->requested_inputs().size()) { + return errors::InvalidArgument( + "Graph to be converted to function appears to be malformed. ", + "Node ", node->name(), " is missing input edge ", i); + } + original_input_name = + ParseTensorName(node->requested_inputs()[i]).ToString(); + } else { + original_input_name = + strings::StrCat(edge->src()->name(), ":", edge->src_output()); + } + + const auto iter = tensor_renaming.find(original_input_name); + if (iter == tensor_renaming.end()) { + return errors::InvalidArgument( + "Input ", i, ", '", original_input_name, "', of node '", + node->name(), "' in function '", fn_name, + "' is not available. You might need to include it in inputs " + "or include its source node in the body"); + } + node_def->add_input(iter->second); + } + + // Add control inputs. + for (const Edge* edge : control_edges) { + // Add this control input only if the src node is in the body. + const string normalized = node_names.Lookup(edge->src()->name()); + // If we did not find a name for the source of control edge, this + // source must be outside of the body. Raise an error. + if (normalized.empty()) { + return errors::InvalidArgument( + "The source of control edge ", edge->DebugString(), + " is not in the body. Encountered while creating function '", + fn_name, "'"); + } + node_def->add_input(strings::StrCat("^", normalized)); + } + } + return Status::OK(); +} + +// Graph to FunctionDef conversion. This code is closely modeled on the Python +// code in third_party/tensorflow/python/framework/function.py. +Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name, + const std::vector& body_nodes, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& output_names, + FunctionDef* fdef) { + fdef->mutable_signature()->set_name(fn_name); + + // Keep track of names we used and how we normalized them. + NodeNameMapping node_names; + + // Mapping from original names of tensors (i.e. ":") to the + // name we used in the function: + // - For input tensors: + // {flat_tensor_name -> normalized_name_of_src_node} + // e.g. {In:3 -> in} + // - For tensors produced by nodes in function's body: + // {flat_tensor_name -> nested_tensor_name} + // e.g. {Add:3 -> add_0:z:1} + std::unordered_map tensor_renaming; + + // Fill inputs in function's signature. + for (size_t i = 0; i < inputs.size(); ++i) { + const Node* node = inputs[i].node; + int idx = inputs[i].index; + OpDef::ArgDef* argdef = fdef->mutable_signature()->add_input_arg(); + argdef->set_type(node->output_type(idx)); + const string& input_name = node_names.GetIOName(node->name()); + argdef->set_name(input_name); + tensor_renaming[strings::StrCat(node->name(), ":", idx)] = input_name; + } + + // Fill outputs in function's signature. 
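+ // Output names are normalized and uniquified the same way as input names,
+ // so (illustratively) listing the same output of a node named "add" twice
+ // produces output args named "add" and "add_0".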
+ for (size_t i = 0; i < outputs.size(); ++i) { + const Node* node = outputs[i].node; + int idx = outputs[i].index; + OpDef::ArgDef* argdef = fdef->mutable_signature()->add_output_arg(); + argdef->set_type(node->output_type(idx)); + argdef->set_name(node_names.GetIOName(node->name())); + } + + // Populate tensor_renaming and node_names. + // Generate the new output names for every node in the function. + // The NodeDefs in FunctionDefs use a different naming scheme for + // their inputs than the NodeDefs in a graph (see the comment for + // FunctionDef.node_def in function.proto). We do the + // graph tensor name -> function tensor name conversion for every + // possible input (i.e. every node's outputs) and store the result + // in tensor_renaming. + for (const Node* node : body_nodes) { + // Make sure node_name does not collide with an input or output name. + const string& node_name = node_names.Uniquify(node->name()); + // For each output_arg in the op_def, the output_ranges + // map will have [start, end] range of indices that this arg produces + // among all the output tensors of this op. + NameRangeMap output_ranges; + TF_RETURN_IF_ERROR( + NameRangesForNode(*node, node->op_def(), nullptr, &output_ranges)); + for (const auto& output : output_ranges) { + const string& output_name = output.first; + int index_start = output.second.first; + int index_end = output.second.second; + for (int i = index_start; i < index_end; ++i) { + const string& original_name = strings::StrCat(node->name(), ":", i); + const string& new_name = + strings::StrCat(node_name, ":", output_name, ":", i - index_start); + // Record the mapping if this tensor is not already mapped. + // Tensor can be already mapped if it is used as an input. + if (tensor_renaming.find(original_name) == tensor_renaming.end()) { + tensor_renaming[original_name] = new_name; + } + } + } + } + + TF_RETURN_IF_ERROR( + FillFunctionBody(fn_name, node_names, body_nodes, tensor_renaming, fdef)); + + // Remap return values. + for (int r = 0; r < fdef->signature().output_arg_size(); ++r) { + const string& ret_name = fdef->signature().output_arg(r).name(); + + // We convert this flat tensor name to the nested value + // (e.g. `add:z:1`) that we stored in tensor_renaming. + const string& return_value = + strings::StrCat(outputs[r].node->name(), ":", outputs[r].index); + const auto iter = tensor_renaming.find(return_value); + if (iter == tensor_renaming.end()) { + return errors::InvalidArgument( + "TF_Output ", return_value, " is neither in the function body ", + "nor among function inputs. Encountered while creating function '", + fn_name, "'"); + } + (*fdef->mutable_ret())[ret_name] = iter->second; + } + + return Status::OK(); +} + +// Converts `ninputs` and `inputs` into `inputs_tensors` and `input_nodes` and +// does various checks while doing so. `input_nodes` will contain the same +// information as input_tensors just in a different structure to make +// following processing easier. TODO(iga): Simplify this nested structure. 
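+// For example (illustrative): inputs [{n, 0}, {m, 1}, {n, 2}] yield
+// input_tensors = [(n, 0), (m, 1), (n, 2)] and
+// input_nodes = {n: [0, 2], m: [1]}.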
+Status ProcessInputs( + const TF_Graph* fn_body, const char* fn_name, int ninputs, + const TF_Output* inputs, std::vector* input_tensors, + std::unordered_map>* input_nodes) + EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + input_tensors->reserve(ninputs); + for (int i = 0; i < ninputs; ++i) { + const Node& node = inputs[i].oper->node; + int idx = inputs[i].index; + + TF_RETURN_WITH_CONTEXT_IF_ERROR( + fn_body->graph.IsValidOutputTensor(&node, idx), + "Encountered while processing input ", i, " into function '", fn_name, + "'"); + TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNoRefOutputs(&node), + "Encountered while processing input ", i, + " into function '", fn_name, "'"); + + input_tensors->emplace_back(&node, idx); + + const auto& iter = input_nodes->find(&node); + if (iter == input_nodes->end()) { + input_nodes->insert({&node, {idx}}); + } else { + auto& indices = iter->second; + if (std::find(indices.begin(), indices.end(), idx) != indices.end()) { + return errors::InvalidArgument( + "TF_Output ", node.name(), ":", idx, + " appears more than once in the input list"); + } + indices.push_back(idx); + } + } + return Status::OK(); +} + +// Converts `noutputs` and `outputs` into `outputs_tensors` and does various +// checks while doing so. +Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name, + int noutputs, const TF_Output* outputs, + std::vector* output_tensors) + EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + output_tensors->reserve(noutputs); + for (int i = 0; i < noutputs; ++i) { + const Node& node = outputs[i].oper->node; + int idx = outputs[i].index; + TF_RETURN_WITH_CONTEXT_IF_ERROR( + fn_body->graph.IsValidOutputTensor(&node, idx), + "Encountered while processing output ", i, " from function '", fn_name, + "'"); + output_tensors->emplace_back(&node, idx); + } + return Status::OK(); +} + +// Populates `body_nodes` with the nodes that will become function's body. +// Performs various checks. +Status ComputeBodyNodes( + const TF_Graph* fn_body, const char* fn_name, int num_opers, + const TF_Operation* const* opers, + const std::unordered_map>& input_nodes, + std::vector* body_nodes) + EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + if (num_opers == -1) { + for (const Node* node : fn_body->graph.op_nodes()) { + const auto& iter = input_nodes.find(node); + if (iter == input_nodes.end()) { + // This node is not referenced in inputs. Add it to the body. + TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNoRefOutputs(node), + "Encountered while creating function '", + fn_name, "'"); + body_nodes->push_back(node); + } else { + // This node is referenced in inputs. Currently, we place an + // artificial restriction and require that when num_opers=-1, such + // nodes must have a single output. + if (node->num_outputs() != 1) { + return errors::InvalidArgument( + "When `num_opers` is set to -1, nodes referenced in `inputs` " + "must have a single output. Node ", + node->name(), " has ", node->num_outputs(), + " outputs. 
Encountered while creating function '", fn_name, "'"); + } + } + } + } else { + body_nodes->reserve(num_opers); + for (int i = 0; i < num_opers; ++i) { + const Node* node = &opers[i]->node; + TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNoRefOutputs(node), + "Encountered while creating function '", + fn_name, "'"); + body_nodes->push_back(node); + } + } + return Status::OK(); +} + +} // anonymous namespace +} // namespace tensorflow + +using tensorflow::Node; +using tensorflow::string; + +TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name, + int num_opers, const TF_Operation* const* opers, + int ninputs, const TF_Output* inputs, + int noutputs, const TF_Output* outputs, + const char* const* output_names, + const TF_FunctionOptions* opts, + TF_Status* status) { + tensorflow::mutex_lock l(*const_cast(&fn_body->mu)); + + // Process inputs. + std::vector input_tensors; + std::unordered_map> input_nodes; + status->status = tensorflow::ProcessInputs(fn_body, fn_name, ninputs, inputs, + &input_tensors, &input_nodes); + if (!status->status.ok()) return nullptr; + + // Process outputs. + std::vector output_tensors; + status->status = tensorflow::ProcessOutputs(fn_body, fn_name, noutputs, + outputs, &output_tensors); + if (!status->status.ok()) return nullptr; + + // Process output names. + std::vector output_names_vec; + if (output_names) { + output_names_vec.reserve(noutputs); + for (int i = 0; i < noutputs; ++i) { + output_names_vec.push_back(string(output_names[i])); + } + } + + // Compute body nodes. + std::vector body_nodes; + status->status = tensorflow::ComputeBodyNodes( + fn_body, fn_name, num_opers, opers, input_nodes, &body_nodes); + if (!status->status.ok()) return nullptr; + + // Do the actual function creation. + TF_Function* tf_function = new TF_Function(); + status->status = tensorflow::GraphToFunctionDef( + fn_body->graph, fn_name, body_nodes, input_tensors, output_tensors, + output_names_vec, tf_function->fdef_lib.add_function()); + if (!status->status.ok()) { + TF_DeleteFunction(tf_function); + return nullptr; + } + return tf_function; +} + +void TF_GraphAddFunction(TF_Graph* g, const TF_Function* function, + TF_Status* status) { + tensorflow::mutex_lock l(g->mu); + + // At the moment, we have only one function and no gradients in fdef_lib. + // This makes the following operation atomic. + // TODO(iga): Add an atomic version of AddFunctionLibrary when we support + // gradients + status->status = g->graph.AddFunctionLibrary(function->fdef_lib); +} + +void TF_FunctionToFunctionDef(TF_Function* func, TF_Buffer* output_func_def, + TF_Status* status) { + DCHECK_EQ(1, func->fdef_lib.function_size()); + status->status = MessageToBuffer(func->fdef_lib.function(0), output_func_def); +} + +void TF_DeleteFunction(TF_Function* function) { delete function; } diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc new file mode 100644 index 00000000000..c9dd38ea15f --- /dev/null +++ b/tensorflow/c/c_api_function_test.cc @@ -0,0 +1,1039 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/c_api.h" + +#include "tensorflow/c/c_test_util.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +// Specification for expected input/output and its type. +// DataType value of DT_INVALID signifies that we don't want to +// check the data type. +typedef std::pair IOSpec; + +std::vector M(const std::initializer_list& names) { + std::vector v; + for (const string& name : names) { + v.push_back(IOSpec(name, DT_INVALID)); + } + return v; +} + +// Specification for an expected edge. +// src is either: +// - input name (as it appears in FunctionDef) +// - name of output tensor (in nested "add:z:0" format) +// dst is either: +// - output name (as it appears in FunctionDef) +// - : (this looks the same as +// output tensor naming, but it the index is actually an input index) +struct EdgeSpec : public std::pair { + typedef std::pair Base; + + // Inherit the set of constructors + using Base::pair; + + string ToString() const { return strings::StrCat(first, "->", second); } +}; + +class CApiFunctionTest : public ::testing::Test { + protected: + CApiFunctionTest() + : s_(TF_NewStatus()), + func_graph_(TF_NewGraph()), + host_graph_(TF_NewGraph()), + func_(nullptr) {} + + void SetUp() override {} + + ~CApiFunctionTest() override { + TF_DeleteFunction(func_); + TF_DeleteGraph(host_graph_); + TF_DeleteGraph(func_graph_); + TF_DeleteStatus(s_); + } + + void Run(const std::vector>& inputs, + TF_Operation* output, int32_t expected_result) { + Run(inputs, {{output, 0}}, {expected_result}); + } + + // Run the host graph, which now contains a function and check that + // outputs are as expected. + // 'T' stands for 'tensor' since the outputs are tensors, not scalars. + void RunT(const std::vector>& inputs, + std::initializer_list outputs, + const std::vector>& expected_results) { + // Create a session for this graph + CSession csession(host_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + // Run + csession.SetInputs(inputs); + csession.SetOutputs(outputs); + csession.Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + // Check results + for (int i = 0; i < expected_results.size(); ++i) { + TF_Tensor* out = csession.output_tensor(i); + ASSERT_TRUE(out != nullptr); + EXPECT_EQ(TF_INT32, TF_TensorType(out)); + EXPECT_EQ(1, TF_NumDims(out)); + CompareInt32Tensor(expected_results[i], out); + } + } + + // Run the host graph, which now contains a function and check that + // outputs are as expected. + void Run(const std::vector>& inputs, + std::initializer_list outputs, + const std::vector& expected_results) { + // Create a session for this graph. 
+ CSession csession(host_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + csession.SetInputs(inputs); + csession.SetOutputs(outputs); + csession.Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + for (int i = 0; i < expected_results.size(); ++i) { + TF_Tensor* out = csession.output_tensor(i); + ASSERT_TRUE(out != nullptr); + EXPECT_EQ(TF_INT32, TF_TensorType(out)); + EXPECT_EQ(0, TF_NumDims(out)); // scalar + ASSERT_EQ(sizeof(int32_t), TF_TensorByteSize(out)); + int32_t* output_contents = static_cast(TF_TensorData(out)); + EXPECT_EQ(expected_results[i], *output_contents); + } + } + + void CompareInt32Tensor(const std::vector& expected, TF_Tensor* t) { + int32_t* data = static_cast(TF_TensorData(t)); + size_t size = TF_TensorByteSize(t); + ASSERT_EQ(expected.size() * sizeof(int32_t), size); + for (int i = 0; i < expected.size(); ++i) { + ASSERT_EQ(expected[i], data[i]) << "Different data at index " << i; + } + } + + std::vector ToOutput(const std::vector ops) { + std::vector out; + for (auto op : ops) { + out.push_back({op, 0}); + } + return out; + } + + void Define(int num_opers, const std::vector& opers, + const std::vector& inputs, + const std::vector& outputs, + const char** output_names, bool expect_failure = false) { + DefineT(num_opers, opers, ToOutput(inputs), ToOutput(outputs), output_names, + expect_failure); + } + + // An explicit `num_opers` is needed so that we can distinguish between the + // case of no operations specified (-1) and the case of an empty set of + // operations specified (0). + void DefineT(int num_opers, const std::vector& opers, + const std::vector& inputs, + const std::vector& outputs, const char** output_names, + bool expect_failure = false) { + ASSERT_EQ(func_, nullptr); + func_ = TF_GraphToFunction(func_graph_, func_name_, num_opers, + num_opers == -1 ? nullptr : opers.data(), + inputs.size(), inputs.data(), outputs.size(), + outputs.data(), output_names, + /*opts=*/nullptr, s_); + if (expect_failure) { + ASSERT_EQ(func_, nullptr); + return; + } + + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + ASSERT_NE(func_, nullptr); + TF_GraphAddFunction(host_graph_, func_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + } + + TF_Operation* Use(const std::vector& inputs) { + return UseT(ToOutput(inputs)); + } + + TF_Operation* UseT(const std::vector& inputs) { + TF_Operation* op; + UseHelper(inputs, &op); + return op; + } + + // All the *Helper methods are used as a workaround for the restrictions that + // one cannot call ASSERT_* methods in non-void-returning functions (when + // exceptions are disabled during compilation) + void UseHelper(const std::vector& inputs, TF_Operation** op) { + TF_OperationDescription* desc = + TF_NewOperation(host_graph_, func_name_, func_node_name_); + for (auto input : inputs) { + TF_AddInput(desc, input); + } + // Set device to CPU because some ops inside the function might not be + // available on GPU. 
+ TF_SetDevice(desc, "/cpu:0"); + *op = TF_FinishOperation(desc, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + ASSERT_NE(*op, nullptr); + } + + FunctionDef fdef() { + tensorflow::FunctionDef fdef; + EXPECT_TRUE(GetFunctionDef(func_, &fdef)); + return fdef; + } + + // logging utility + template + string ToString(const Container& v) { + std::stringstream ss; + ss << "{"; + size_t i = 0; + for (const auto& e : v) { + if (i != 0) { + ss << ", "; + } + ss << e.ToString(); + ++i; + } + ss << "}"; + return ss.str(); + } + + void VerifyFDefNodes(const tensorflow::FunctionDef& fdef, + const std::unordered_set& nodes) { + ASSERT_EQ(nodes.size(), fdef.node_def_size()) + << "Got unexpected number of nodes. Expected: [" + << str_util::Join(nodes, ", ") + << "] Actual nodes in fdef: " << fdef.DebugString(); + for (const NodeDef& node_def : fdef.node_def()) { + ASSERT_TRUE(nodes.find(node_def.name()) != nodes.end()) + << "Got unexpected node: " << node_def.name() + << " in fdef: " << fdef.DebugString(); + } + } + + void VerifyFDefInputs(const tensorflow::FunctionDef& fdef, + const std::vector& inputs) { + const OpDef& signature = fdef.signature(); + ASSERT_EQ(inputs.size(), signature.input_arg_size()); + for (int i = 0; i < inputs.size(); ++i) { + const OpDef::ArgDef& arg = signature.input_arg(i); + const IOSpec& in = inputs[i]; + if (in.second != DT_INVALID) { + ASSERT_EQ(arg.type(), in.second) + << "Got unexpected type for input " << i + << ". fdef: " << fdef.DebugString(); + } + ASSERT_EQ(arg.name(), in.first) << "Got unexpected name for input " << i + << ". fdef: " << fdef.DebugString(); + } + } + + void VerifyFDefOutputs(const tensorflow::FunctionDef& fdef, + const std::vector& outputs) { + const OpDef& signature = fdef.signature(); + ASSERT_EQ(outputs.size(), signature.output_arg_size()); + for (int i = 0; i < outputs.size(); ++i) { + const OpDef::ArgDef& arg = signature.output_arg(i); + const IOSpec& out = outputs[i]; + if (out.second != DT_INVALID) { + ASSERT_EQ(arg.type(), out.second) + << "Got unexpected type for output " << i + << ". fdef: " << fdef.DebugString(); + } + ASSERT_EQ(arg.name(), out.first) << "Got unexpected name for output " << i + << ". fdef: " << fdef.DebugString(); + } + } + + void VerifyFDefEdges( + const tensorflow::FunctionDef& fdef, + const std::vector& e_edges, // expected edges + const std::vector& c_edges, // expected ctrl edges + bool is_exact_edges = true) { + // Build a set of edges from fdef + std::set a_edges; // actual edges + // Get edges from inputs to body nodes and between body nodes + for (const NodeDef& node_def : fdef.node_def()) { + for (int i = 0; i < node_def.input_size(); ++i) { + const string& in = node_def.input(i); + const auto& v = + a_edges.insert({in, strings::StrCat(node_def.name(), ":", i)}); + ASSERT_TRUE(v.second) << "Duplicate edge " << in << " -> " + << strings::StrCat(node_def.name(), ":", i) + << ". fdef: " << fdef.DebugString(); + } + } + // Get edges from body nodes to outputs and from inputs to outputs + for (const OpDef::ArgDef& arg : fdef.signature().output_arg()) { + const auto& iter = fdef.ret().find(arg.name()); + if (iter != fdef.ret().end()) { + const auto& v = a_edges.insert({iter->second, arg.name()}); + ASSERT_TRUE(v.second) << "Duplicate edge " << iter->second << " -> " + << arg.name() << ". fdef: " << fdef.DebugString(); + } else { + const auto& v = a_edges.insert({arg.name(), arg.name()}); + ASSERT_TRUE(v.second) << "Duplicate edge " << arg.name() << " -> " + << arg.name() << ". 
fdef: " << fdef.DebugString(); + } + } + + // Verify edges + for (const EdgeSpec& e : e_edges) { + ASSERT_TRUE(a_edges.find(e) != a_edges.end()) + << "Failed to find expected edge " << e.ToString() + << " in fdef: " << fdef.DebugString(); + } + + // If caller specified all edges, check that we have seen all + if (is_exact_edges) { + ASSERT_EQ(e_edges.size() + c_edges.size(), a_edges.size()) + << "Expected edges: " << ToString(e_edges) + << " Expected Control edges: " << ToString(c_edges) + << " Actual edges: " << ToString(a_edges) + << " in fdef: " << fdef.DebugString(); + } + } + + void VerifyFDef(const std::unordered_set& nodes, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& e_edges, // expected edges + const std::vector& c_edges, // expected ctrl edges + bool is_exact_edges = true) { + tensorflow::FunctionDef fdef; + ASSERT_TRUE(GetFunctionDef(func_, &fdef)); + VerifyFDefNodes(fdef, nodes); + VerifyFDefInputs(fdef, inputs); + VerifyFDefOutputs(fdef, outputs); + VerifyFDefEdges(fdef, e_edges, c_edges, is_exact_edges); + } + + const char* func_name_ = "MyFunc"; + const char* func_node_name_ = "MyFunc_0"; + TF_Status* s_; + TF_Graph* func_graph_; + TF_Graph* host_graph_; + TF_Function* func_; + + // Workaround for not being able to initialize empty map using {} + std::unordered_set empty_; +}; + +TEST_F(CApiFunctionTest, OneOp_ZeroInputs_OneOutput) { + /* + * constant + * | + * v + */ + // Define + TF_Operation* c = ScalarConst(10, func_graph_, s_, "scalar10"); + Define(-1, {}, {}, {c}, nullptr); + + // Use, run, and verify + TF_Operation* func_op = Use({}); + Run({}, func_op, 10); + VerifyFDef({"scalar10_0"}, {}, {{"scalar10", DT_INT32}}, + {{"scalar10_0:output:0", "scalar10"}}, {}); +} + +TEST_F(CApiFunctionTest, OneOp_OneInput_OneOutput) { + /* + * | + * v + * negate + * | + * v + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + TF_Operation* neg = Neg(feed, func_graph_, s_); + Define(-1, {}, {feed}, {neg}, nullptr); + + // Use, run, and verify + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, -3); + VerifyFDef({"neg_0"}, {{"feed", DT_INT32}}, {{"neg", DT_INT32}}, + {{"feed", "neg_0:0"}, {"neg_0:y:0", "neg"}}, {}); +} + +TEST_F(CApiFunctionTest, ZeroOps_Identity) { + /* + * | + * | + * | + * v + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + Define(-1, {}, {feed}, {feed}, nullptr); + + // Use, run, and verify + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 3); + VerifyFDef(empty_, {{"feed", DT_INT32}}, {{"feed_0", DT_INT32}}, + {{"feed", "feed_0"}}, {}); +} + +TEST_F(CApiFunctionTest, ZeroOps_Permutation) { + /* + * | | + * \ / + * \/ + * x + * /\ + * / \ + * | | + * v v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + Define(-1, {}, {feed1, feed2}, {feed2, feed1}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, {{func_op, 0}, {func_op, 1}}, {3, 2}); + VerifyFDef(empty_, M({{"feed1"}, {"feed2"}}), M({{"feed2_0"}, {"feed1_0"}}), + {{"feed1", "feed1_0"}, {"feed2", "feed2_0"}}, {}); +} + +TEST_F(CApiFunctionTest, OneOp_TwoInputs_OneOutput) 
{ + /* + * | | + * v v + * add + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + Define(-1, {}, {feed1, feed2}, {add}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3); + VerifyFDef( + {"add_0"}, M({{"feed1"}, {"feed2"}}), M({{"add"}}), + {{"feed1", "add_0:0"}, {"feed2", "add_0:1"}, {"add_0:sum:0", "add"}}, {}); +} + +TEST_F(CApiFunctionTest, OneOp_TwoInputs_ZeroOutputs) { + /* + * | | + * v v + * add + * + * (output ignored) + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + Add(feed1, feed2, func_graph_, s_); + Define(-1, {}, {feed1, feed2}, {}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + Use({two, func_feed}); + VerifyFDef({"add"}, M({{"feed1"}, {"feed2"}}), {}, + {{"feed1", "add:0"}, {"feed2", "add:1"}}, {}); +} + +TEST_F(CApiFunctionTest, TwoOps_ThreeInputs_OneOutput) { + /* + * | | | + * v v / + * add1 / + * | | + * v v + * add2 + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* feed3 = Placeholder(func_graph_, s_, "feed3"); + TF_Operation* add1 = Add(feed1, feed2, func_graph_, s_, "add1"); + TF_Operation* add2 = Add(add1, feed3, func_graph_, s_, "add2"); + Define(-1, {}, {feed1, feed2, feed3}, {add2}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_, "two"); + TF_Operation* ten = ScalarConst(10, host_graph_, s_, "ten"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, ten, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 10 + 3); + VerifyFDef({"add1", "add2_0"}, M({{"feed1"}, {"feed2"}, {"feed3"}}), + M({{"add2"}}), + {{"feed1", "add1:0"}, + {"feed2", "add1:1"}, + {"add1:sum:0", "add2_0:0"}, + {"feed3", "add2_0:1"}, + {"add2_0:sum:0", "add2"}}, + {}); +} + +TEST_F(CApiFunctionTest, OneOp_TwoInputs_TwoDuplicateOutputs) { + /* + * | | + * v v + * add + * | + * +-+-+ + * | | + * v v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + Define(-1, {}, {feed1, feed2}, {add, add}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, {{func_op, 0}, {func_op, 1}}, {5, 5}); + VerifyFDef({"add_1"}, M({{"feed1"}, {"feed2"}}), M({{"add"}, {"add_0"}}), + {{"feed1", "add_1:0"}, + {"feed2", "add_1:1"}, + {"add_1:sum:0", "add"}, + {"add_1:sum:0", "add_0"}}, + {}); +} + +TEST_F(CApiFunctionTest, TwoOps_ThreeInputs_TwoOutputs) { + /* + * | | | + * v v / + * add / + * | | + * +-+ | + * | | | + * | v v + * | add + * | | + * v v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* feed3 = 
Placeholder(func_graph_, s_, "feed3"); + TF_Operation* add1 = Add(feed1, feed2, func_graph_, s_, "add1"); + TF_Operation* add2 = Add(add1, feed3, func_graph_, s_, "add2"); + Define(-1, {}, {feed1, feed2, feed3}, {add1, add2}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_, "two"); + TF_Operation* ten = ScalarConst(10, host_graph_, s_, "ten"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, ten, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, {{func_op, 0}, {func_op, 1}}, {12, 15}); + VerifyFDef({"add1_0", "add2_0"}, M({{"feed1"}, {"feed2"}, {"feed3"}}), + M({{"add1"}, {"add2"}}), + {{"feed1", "add1_0:0"}, + {"feed2", "add1_0:1"}, + {"add1_0:sum:0", "add2_0:0"}, + {"feed3", "add2_0:1"}, + {"add1_0:sum:0", "add1"}, + {"add2_0:sum:0", "add2"}}, + {}); +} + +TEST_F(CApiFunctionTest, FromSubsetOfOps) { + /* + * | | | + * v v / + * add / + * | | + * +---+--+---+ + * Ops used | | | | + * for func | v v | + * | | add | + * +-------> | | | + * | v | + * | | + * +----------+ + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* feed3 = Placeholder(func_graph_, s_, "feed3"); + TF_Operation* add1 = Add(feed1, feed2, func_graph_, s_, "add1"); + TF_Operation* add2 = Add(add1, feed3, func_graph_, s_, "add2"); + Define(1, {add2}, {add1, feed3}, {add2}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_, "two"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3); + VerifyFDef( + {"add2_0"}, M({{"add1"}, {"feed3"}}), M({{"add2"}}), + {{"add1", "add2_0:0"}, {"feed3", "add2_0:1"}, {"add2_0:sum:0", "add2"}}, + {}); +} + +TEST_F(CApiFunctionTest, UsingOneOutputOfSplit) { + /* + * feed + * | + * +---------+---+ + * | const0 | | + * | | | | + * | v / | + * | split | + * | | | | | + * | v | v | + * | | | + * +------+------+ + * | + * v + * + * Only the second output from split is used as function output + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + TF_Operation* split = Split3(feed, func_graph_, s_); + DefineT(-1, {}, {{feed, 0}}, {{split, 1}}, nullptr); + + // Use, run, and verify + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed}); + RunT({{func_feed, Int32Tensor({1, 2, 3, 4, 5, 6})}}, {{func_op, 0}}, + {{3, 4}}); + VerifyFDef({"split3_const0", "split3_0"}, M({{"feed"}}), M({{"split3"}}), + {{"split3_const0:output:0", "split3_0:0"}, + {"feed", "split3_0:1"}, + {"split3_0:output:1", "split3"}}, + {}); +} + +TEST_F(CApiFunctionTest, UsingTwoOutputsOfSplit) { + /* + * feed + * | + * +---------+---+ + * | const0 | | + * | | | | + * | v / | + * | split | + * | | | | | + * | | v | | + * | | | | + * +---+-----+---+ + * | | + * v v + * + * Second output from split is not used as function output + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + TF_Operation* split = Split3(feed, func_graph_, s_); + DefineT(-1, {}, {{feed, 0}}, {{split, 0}, {split, 2}}, nullptr); + + // Use, run, and verify + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed}); + RunT({{func_feed, Int32Tensor({1, 2, 3, 4, 5, 6})}}, + {{func_op, 0}, {func_op, 1}}, {{1, 2}, {5, 6}}); + VerifyFDef({"split3_const0", "split3_1"}, M({{"feed"}}), + M({{"split3"}, {"split3_0"}}), + 
{{"split3_const0:output:0", "split3_1:0"}, + {"feed", "split3_1:1"}, + {"split3_1:output:0", "split3"}, + {"split3_1:output:2", "split3_0"}}, + {}); +} + +TEST_F(CApiFunctionTest, UsingTwoOutputsOfSplitAsInputs) { + /* + * | + * v + * split + * | | | + * | v | + * | | + * +---+-----+---+ + * | | | | + * | v v | + * | add | + * | | | + * | | | + * +------+------+ + * | + * v + */ + // Define + TF_Operation* feed = Placeholder(func_graph_, s_); + TF_Operation* split = Split3(feed, func_graph_, s_); + TF_Operation* add = Add({split, 0}, {split, 2}, func_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + DefineT(1, {add}, {{split, 0}, {split, 2}}, {{add, 0}}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_, "two"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3); + VerifyFDef( + {"add_0"}, M({{"split3"}, {"split3_0"}}), M({{"add"}}), + {{"split3", "add_0:0"}, {"split3_0", "add_0:1"}, {"add_0:sum:0", "add"}}, + {}); +} + +TEST_F(CApiFunctionTest, NodesUsedInInputsMustHaveSingleOutput) { + /* + * | + * v + * split + * | | | + * | v | + * | | + * input --->| |<--- input + * | | + * v v + * add + * | + * | + * v + */ + // Define + TF_Tensor* tensor_123 = Int32Tensor({1, 2, 3}); + TF_Operation* c = Const(tensor_123, func_graph_, s_, "const_array"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* split = Split3(c, func_graph_, s_); + TF_Operation* add = Add({split, 0}, {split, 2}, func_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + DefineT(-1, {}, {{split, 0}, {split, 2}}, {{add, 0}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("When `num_opers` is set to -1, nodes referenced in " + "`inputs` must have a single output. Node split3 has " + "3 outputs. 
Encountered while creating function 'MyFunc'"), + string(TF_Message(s_))); + + TF_DeleteTensor(tensor_123); +} + +TEST_F(CApiFunctionTest, FunctionWithWhileLoop) { + // Inputs to the while loop and the function as a whole + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + + // Outputs of the while loop corresponding to the two inputs above + // The first one will the function's output + std::vector outputs; + + // Add while loop to func_graph_ + { + // The inputs to the while loop + std::vector inputs = {{feed1, 0}, {feed2, 0}}; + std::unique_ptr params(new TF_WhileParams( + TF_NewWhile(func_graph_, &inputs[0], inputs.size(), s_))); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params->name = "test_loop"; + + // Initialize outputs so we can easily detect errors/bugs + outputs.resize(2, {nullptr, -1}); + + // Create loop: while (input1 < input2) input1 += input2 + 1 + TF_Operation* less_than = LessThan( + params->cond_inputs[0], params->cond_inputs[1], params->cond_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params->cond_output = {less_than, 0}; + + TF_Operation* add1 = Add(params->body_inputs[0], params->body_inputs[1], + params->body_graph, s_, "add1"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* one = ScalarConst(1, params->body_graph, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Operation* add2 = Add(add1, one, params->body_graph, s_, "add2"); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + params->body_outputs[0] = {add2, 0}; + params->body_outputs[1] = params->body_inputs[1]; + + // Finalize while loop + TF_FinishWhile(params.get(), s_, &outputs[0]); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + } + + // Define function, use it in graph, and run + DefineT(-1, {}, {{feed1, 0}, {feed2, 0}}, {outputs[0]}, nullptr); + TF_Operation* five = ScalarConst(5, host_graph_, s_, "five"); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({func_feed, five}); + Run({{func_feed, Int32Tensor(2)}}, func_op, 2 /*+=*/ + 5 + 1); + + // Verify input, output, and subset of edges in fdef. + // The subset of edges we verify is a chain between feed1 and output to + // make sure that the correct output is picked. + tensorflow::FunctionDef fdef; + ASSERT_TRUE(GetFunctionDef(func_, &fdef)); + VerifyFDefInputs(fdef, M({{"feed1"}, {"feed2"}})); + VerifyFDefOutputs(fdef, M({{"test_loop_exit"}})); + VerifyFDefEdges(fdef, + {{"feed1", "test_loop/Enter:0"}, + {"test_loop/Enter:output:0", "test_loop/Merge:0"}, + {"test_loop/Merge:output:0", "test_loop/Switch:0"}, + {"test_loop/Switch:output_false:0", "test_loop/Exit:0"}, + {"test_loop/Exit:output:0", "test_loop_exit"}}, + {}, false); +} + +TEST_F(CApiFunctionTest, ControlDependency) { + /* + * | | scalar + * | | . + * v v . 
<---- control dependency + * add < - + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* five = ScalarConst(5, func_graph_, s_); + TF_Operation* add = + AddWithCtrlDependency(feed1, feed2, func_graph_, five, s_); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + Define(-1, {}, {feed1, feed2}, {add}, nullptr); + + // Use, run, and verify + TF_Operation* two = ScalarConst(2, host_graph_, s_); + TF_Operation* func_feed = Placeholder(host_graph_, s_); + TF_Operation* func_op = Use({two, func_feed}); + Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3); + VerifyFDef( + {"add_0", "scalar"}, M({{"feed1"}, {"feed2"}}), M({{"add"}}), + {{"feed1", "add_0:0"}, {"feed2", "add_0:1"}, {"add_0:sum:0", "add"}}, + {{"scalar", "add_0"}}); +} + +TEST_F(CApiFunctionTest, ControlDependencyOutsideOfBody) { + /* + * | | scalar + * | | . + * v v . <---- control dependency + * add < - + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* five = ScalarConst(5, func_graph_, s_); + TF_Operation* add = + AddWithCtrlDependency(feed1, feed2, func_graph_, five, s_); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + Define(1, {add}, {feed1, feed2}, {add}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("The source of control edge [id=3 scalar:-1 -> add:-1] " + "is not in the body. Encountered while creating " + "function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, ControlDependencyOutsideOfBody_FromInputNode) { + /* + * | |. + * | | . + * | | . + * v v . <---- control dependency + * add < - + * | + * v + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = + AddWithCtrlDependency(feed1, feed2, func_graph_, feed1, s_); + EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + Define(-1, {}, {feed1, feed2}, {add}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("The source of control edge [id=3 feed1:-1 -> add:-1] " + "is not in the body. 
Encountered while creating " + "function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, DuplicateInputsAreNotAllowed) { + /* + * feed + * | + * +++ + * | | + * +---+-+---+ + * | | | | + * | v v | + * | add | + * | | | + * | | | + * +----+----+ + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* add = Add(feed1, feed1, func_graph_, s_); + Define(-1, {}, {feed1, feed1}, {add}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ( + string("TF_Output feed1:0 appears more than once in the input list"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, InvalidInputTensor_HighIndex) { + /* + * | | + * v v + * add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + DefineT(-1, {}, {{feed1, 0}, {feed2, 2}}, {{add, 0}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Node 'feed2' (type: 'Placeholder', num of outputs: 1) does " + "not have output 2\n\tEncountered while processing " + "input 1 into function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, InvalidInputTensor_BadNodePtr) { + /* + * | | + * v v + * add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + DefineT(-1, {}, {{feed1, 0}, {nullptr, 0}}, {{add, 0}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Node is null\n\tEncountered while processing input 1 " + "into function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, InvalidOutputTensor_HighIndex) { + /* + * | | + * v v + * add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + DefineT(-1, {}, {{feed1, 0}, {feed2, 0}}, {{add, 3}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Node 'add' (type: 'AddN', num of outputs: 1) does " + "not have output 3\n\tEncountered while processing " + "output 0 from function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, InvalidOutputTensor_BadNodePtr) { + /* + * | | + * v v + * add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + Add(feed1, feed2, func_graph_, s_); + DefineT(-1, {}, {{feed1, 0}, {feed2, 0}}, {{nullptr, 3}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Node is null\n\tEncountered while processing output 0 " + "from function 'MyFunc'"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, NodeMissingInput) { + /* + * input---> | | <----missing input + * v v + * body----> add + * | + * v + */ + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + DefineT(1, {add}, {{feed1, 0}}, {{add, 0}}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("Input 1, 'feed2:0', of node 'add' in function 'MyFunc' " + "is not available. 
You might need to include it in inputs " + "or include its source node in the body"), + string(TF_Message(s_))); +} + +TEST_F(CApiFunctionTest, OutputOpNotInBody) { + /* + * | | + * v v + * add scalar (scalar not included in body) + * | | + * v v (function has two outputs) + */ + // Define + TF_Operation* feed1 = Placeholder(func_graph_, s_, "feed1"); + TF_Operation* feed2 = Placeholder(func_graph_, s_, "feed2"); + TF_Operation* scalar = ScalarConst(2, func_graph_, s_); + TF_Operation* add = Add(feed1, feed2, func_graph_, s_); + Define(1, {add}, {feed1, feed2}, {add, scalar}, nullptr, true); + EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)); + EXPECT_EQ(string("TF_Output scalar:0 is neither in the function body nor " + "among function inputs. Encountered while creating " + "function 'MyFunc'"), + string(TF_Message(s_))); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index f7d25dce8f5..68c324f2b99 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -130,6 +130,11 @@ struct TF_DeviceList { std::vector response; }; +struct TF_Function { + // Currently contains a single function and no gradients + tensorflow::FunctionDefLibrary fdef_lib; +}; + namespace tensorflow { class TensorCApi { @@ -141,7 +146,12 @@ class TensorCApi { } }; +Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst); + TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status); + +Status MessageToBuffer(const tensorflow::protobuf::Message& in, TF_Buffer* out); + } // end namespace tensorflow #endif // TENSORFLOW_C_C_API_INTERNAL_H_ diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 0aa60fb45dd..c4420290099 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -829,7 +829,7 @@ TEST(CAPI, ShapeInferenceError) { TF_Operation* vec3 = Const(vec3_tensor.get(), graph, status, "vec3"); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_Operation* add = Add(vec2, vec3, graph, status); + TF_Operation* add = AddNoCheck(vec2, vec3, graph, status); ASSERT_NE(TF_OK, TF_GetCode(status)); ASSERT_TRUE(add == nullptr); diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc index 21603c1a07c..9cd978c97ea 100644 --- a/tensorflow/c/c_test_util.cc +++ b/tensorflow/c/c_test_util.cc @@ -15,7 +15,9 @@ limitations under the License. 
#include "tensorflow/c/c_test_util.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" using tensorflow::GraphDef; @@ -36,6 +38,23 @@ TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) { return t; } +TF_Tensor* Int32Tensor(const int64_t* dims, int num_dims, + const int32_t* values) { + int64_t num_values = 1; + for (int i = 0; i < num_dims; ++i) { + num_values *= dims[i]; + } + TF_Tensor* t = + TF_AllocateTensor(TF_INT32, dims, num_dims, sizeof(int32_t) * num_values); + memcpy(TF_TensorData(t), values, sizeof(int32_t) * num_values); + return t; +} + +TF_Tensor* Int32Tensor(const std::vector& values) { + int64_t dims = values.size(); + return Int32Tensor(&dims, 1, values.data()); +} + TF_Tensor* Int32Tensor(int32_t v) { const int num_bytes = sizeof(int32_t); int32_t* values = new int32_t[1]; @@ -44,19 +63,40 @@ TF_Tensor* Int32Tensor(int32_t v) { &Int32Deallocator, nullptr); } -TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name) { +// All the *Helper methods are used as a workaround for the restrictions that +// one cannot call ASSERT_* methods in non-void-returning functions (when +// exceptions are disabled during compilation) +void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name, + TF_Operation** op) { TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name); TF_SetAttrType(desc, "dtype", TF_INT32); - return TF_FinishOperation(desc, s); + *op = TF_FinishOperation(desc, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + ASSERT_NE(*op, nullptr); +} + +TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name) { + TF_Operation* op; + PlaceholderHelper(graph, s, name, &op); + return op; +} + +void ConstHelper(TF_Tensor* t, TF_Graph* graph, TF_Status* s, const char* name, + TF_Operation** op) { + TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name); + TF_SetAttrTensor(desc, "value", t, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_SetAttrType(desc, "dtype", TF_TensorType(t)); + *op = TF_FinishOperation(desc, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + ASSERT_NE(*op, nullptr); } TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s, const char* name) { - TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name); - TF_SetAttrTensor(desc, "value", t, s); - if (TF_GetCode(s) != TF_OK) return nullptr; - TF_SetAttrType(desc, "dtype", TF_TensorType(t)); - return TF_FinishOperation(desc, s); + TF_Operation* op; + ConstHelper(t, graph, s, name, &op); + return op; } TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, @@ -65,11 +105,39 @@ TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, return Const(tensor.get(), graph, s, name); } -TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph, - TF_Status* s, const char* name) { +void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph, TF_Status* s, + const char* name, TF_Operation** op, bool check) { TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name); TF_Output add_inputs[2] = {{l, 0}, {r, 0}}; TF_AddInputList(desc, add_inputs, 2); + *op = TF_FinishOperation(desc, s); + if (check) { + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + ASSERT_NE(*op, nullptr); + } +} + +TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* 
name) { + TF_Operation* op; + AddHelper(l, r, graph, s, name, &op, true); + return op; +} + +TF_Operation* AddNoCheck(TF_Operation* l, TF_Operation* r, TF_Graph* graph, + TF_Status* s, const char* name) { + TF_Operation* op; + AddHelper(l, r, graph, s, name, &op, false); + return op; +} + +TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r, + TF_Graph* graph, TF_Operation* ctrl_op, + TF_Status* s, const char* name) { + TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name); + TF_Output add_inputs[2] = {{l, 0}, {r, 0}}; + TF_AddInputList(desc, add_inputs, 2); + TF_AddControlInput(desc, ctrl_op); return TF_FinishOperation(desc, s); } @@ -81,11 +149,20 @@ TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s, return TF_FinishOperation(desc, s); } -TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s) { +void NegHelper(TF_Operation* n, TF_Graph* graph, TF_Status* s, + TF_Operation** op) { TF_OperationDescription* desc = TF_NewOperation(graph, "Neg", "neg"); TF_Output neg_input = {n, 0}; TF_AddInput(desc, neg_input); - return TF_FinishOperation(desc, s); + *op = TF_FinishOperation(desc, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + ASSERT_NE(*op, nullptr); +} + +TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s) { + TF_Operation* op; + NegHelper(n, graph, s, &op); + return op; } TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, @@ -96,6 +173,32 @@ TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, return TF_FinishOperation(desc, s); } +void Split3Helper(TF_Operation* input, TF_Graph* graph, TF_Status* s, + const char* name, TF_Operation** op) { + TF_Operation* zero = ScalarConst( + 0, graph, s, ::tensorflow::strings::StrCat(name, "_const0").c_str()); + TF_OperationDescription* desc = TF_NewOperation(graph, "Split", name); + TF_AddInput(desc, {zero, 0}); + TF_AddInput(desc, {input, 0}); + TF_SetAttrInt(desc, "num_split", 3); + TF_SetAttrType(desc, "T", TF_INT32); + // Set device to CPU since there is no version of split for int32 on GPU + // TODO(iga): Convert all these helpers and tests to use floats because + // they are usually available on GPUs. 
After doing this, remove TF_SetDevice
+ // call in c_api_function_test.cc
+ TF_SetDevice(desc, "/cpu:0");
+ *op = TF_FinishOperation(desc, s);
+ ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+ ASSERT_NE(*op, nullptr);
+}
+
+TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s,
+ const char* name) {
+ TF_Operation* op;
+ Split3Helper(input, graph, s, name, &op);
+ return op;
+}
+
bool IsPlaceholder(const tensorflow::NodeDef& node_def) {
if (node_def.op() != "Placeholder" || node_def.name() != "feed") {
return false;
@@ -196,6 +299,18 @@ bool GetNodeDef(TF_Operation* oper, tensorflow::NodeDef* node_def) {
return ret;
}
+bool GetFunctionDef(TF_Function* func, tensorflow::FunctionDef* func_def) {
+ TF_Status* s = TF_NewStatus();
+ TF_Buffer* buffer = TF_NewBuffer();
+ TF_FunctionToFunctionDef(func, buffer, s);
+ bool ret = TF_GetCode(s) == TF_OK;
+ EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+ if (ret) ret = func_def->ParseFromArray(buffer->data, buffer->length);
+ TF_DeleteBuffer(buffer);
+ TF_DeleteStatus(s);
+ return ret;
+}
+
bool GetAttrValue(TF_Operation* oper, const char* attr_name,
tensorflow::AttrValue* attr_value, TF_Status* s) {
TF_Buffer* buffer = TF_NewBuffer();
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index 0c0ba667bd0..a927739d462 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -33,6 +33,13 @@ typedef std::unique_ptr
// Create a tensor with values of type TF_INT8 provided by `values`.
TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values);
+// Create a tensor with values of type TF_INT32 provided by `values`.
+TF_Tensor* Int32Tensor(const int64_t* dims, int num_dims,
+ const int32_t* values);
+
+// Create a 1-dimensional tensor with values from `values`
+TF_Tensor* Int32Tensor(const std::vector<int32_t>& values);
+
TF_Tensor* Int32Tensor(int32_t v);
TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s,
@@ -47,6 +54,13 @@ TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s,
TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name = "add");
+TF_Operation* AddNoCheck(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+ TF_Status* s, const char* name = "add");
+
+TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r,
+ TF_Graph* graph, TF_Operation* ctrl_op,
+ TF_Status* s, const char* name = "add");
+
TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
const char* name = "add");
@@ -54,6 +68,10 @@ TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s);
TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph,
TF_Status* s);
+// Split `input` along the first dimension into 3 tensors
+TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s,
+ const char* name = "split3");
+
bool IsPlaceholder(const tensorflow::NodeDef& node_def);
bool IsScalarConst(const tensorflow::NodeDef& node_def, int v);
@@ -66,6 +84,8 @@ bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def);
bool GetNodeDef(TF_Operation* oper, tensorflow::NodeDef* node_def);
+bool GetFunctionDef(TF_Function* func, tensorflow::FunctionDef* func_def);
+
bool GetAttrValue(TF_Operation* oper, const char* attr_name,
tensorflow::AttrValue* attr_value, TF_Status* s);
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 01e251a1ac2..e70539ceefa 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -151,10 +151,11 @@ TF_DeviceList* 
TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
return TF_SessionListDevices(ctx->session, status);
}
-TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t) {
- return new TFE_TensorHandle(
- tensorflow::TensorCApi::MakeTensor(t->dtype, t->shape, t->buffer),
- nullptr);
+TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
+ tensorflow::Tensor tensor;
+ status->status = tensorflow::TF_TensorToTensor(t, &tensor);
+ if (!status->status.ok()) return nullptr;
+ return new TFE_TensorHandle(tensor, nullptr);
}
void TFE_DeleteTensorHandle(TFE_TensorHandle* h) { delete h; }
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 476c9288f89..a54d206a307 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -20,6 +20,25 @@ limitations under the License.
#include "tensorflow/c/c_api.h"
+// Macro to control visibility of exported symbols in the shared library (.so,
+// .dylib, .dll).
+// This duplicates the TF_EXPORT macro definition in
+// tensorflow/core/platform/macros.h in order to keep this .h file independent
+// of any other includes.
+#ifdef SWIG
+#define TF_CAPI_EXPORT
+#else
+#if defined(COMPILER_MSVC)
+#ifdef TF_COMPILE_LIBRARY
+#define TF_CAPI_EXPORT __declspec(dllexport)
+#else
+#define TF_CAPI_EXPORT __declspec(dllimport)
+#endif // TF_COMPILE_LIBRARY
+#else
+#define TF_CAPI_EXPORT __attribute__((visibility("default")))
+#endif // COMPILER_MSVC
+#endif // SWIG
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -30,11 +49,11 @@ extern "C" {
// TODO(ashankar): Merge with TF_Session?
typedef struct TFE_Context TFE_Context;
-extern TFE_Context* TFE_NewContext(const TF_SessionOptions* opts,
- TF_Status* status);
-extern void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status);
-extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
- TF_Status* status);
+TF_CAPI_EXPORT extern TFE_Context* TFE_NewContext(const TF_SessionOptions* opts,
+ TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status);
+TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
+ TF_Status* status);
// A handle to a tensor on a device.
//
//
@@ -43,14 +62,15 @@ extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
// placed in memory of different devices or remote address spaces. 
typedef struct TFE_TensorHandle TFE_TensorHandle; -extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t); -extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); -extern TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h); -extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h); -extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index); -extern const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h); -extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, - TF_Status* status); +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, + TF_Status* status); +TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index); +TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h); +TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, + TF_Status* status); // Create a new TFE_TensorHandle with the same contents as 'h' but placed // in the memory of the device name 'device_name'. @@ -58,10 +78,10 @@ extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, // that shares the underlying buffer. Otherwise, it currently requires at least // one of the source or destination devices to be CPU (i.e., for the source or // destination tensor to be placed in host memory). -extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, - TFE_Context* ctx, - const char* device_name, - TF_Status* status); +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, + TFE_Context* ctx, + const char* device_name, + TF_Status* status); // Description of the TensorFlow op to execute. // @@ -76,49 +96,49 @@ extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, // the additional sanity checks there seem unnecessary; typedef struct TFE_Op TFE_Op; -extern TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, - TF_Status* status); -extern void TFE_DeleteOp(TFE_Op* op); +TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, + TF_Status* status); +TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op); // TODO(ashankar): TFE_OpSetDevice and TFE_Execute should not have a TFE_Context // parameter. Instead, the TFE_Context should be captured when creating the // TFE_Op. 
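For orientation, a minimal, hypothetical caller-side sketch (not taken from this patch) of how the exported eager API above composes once TFE_NewTensorHandle reports conversion failures through a TF_Status; the "MatMul" op name and the explicit "T" attribute are assumptions based on the standard TensorFlow op set:

#include <stdio.h>
#include <string.h>
#include "tensorflow/c/eager/c_api.h"

int main(void) {
  TF_Status* status = TF_NewStatus();
  TF_SessionOptions* opts = TF_NewSessionOptions();
  TFE_Context* ctx = TFE_NewContext(opts, status);
  TF_DeleteSessionOptions(opts);
  if (TF_GetCode(status) != TF_OK) { fprintf(stderr, "%s\n", TF_Message(status)); return 1; }

  /* Wrap a host 2x2 float tensor in a handle; the new signature lets this fail cleanly. */
  float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  int64_t dims[2] = {2, 2};
  TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, dims, 2, sizeof(data));
  memcpy(TF_TensorData(t), data, sizeof(data));
  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
  TF_DeleteTensor(t);
  if (TF_GetCode(status) != TF_OK) { fprintf(stderr, "%s\n", TF_Message(status)); return 1; }

  /* result = MatMul(h, h), executed eagerly. */
  TFE_Op* matmul = TFE_NewOp(ctx, "MatMul", status);
  TFE_OpAddInput(matmul, h, status);
  TFE_OpAddInput(matmul, h, status);
  TFE_OpSetAttrType(matmul, "T", TF_FLOAT);
  TFE_TensorHandle* retvals[1];
  int num_retvals = 1;
  TFE_Execute(matmul, retvals, &num_retvals, status);
  TFE_DeleteOp(matmul);
  if (TF_GetCode(status) != TF_OK) { fprintf(stderr, "%s\n", TF_Message(status)); return 1; }

  /* Copy the result back to host memory and read one element. */
  TF_Tensor* out = TFE_TensorHandleResolve(retvals[0], status);
  if (TF_GetCode(status) == TF_OK) {
    printf("result[0][0] = %f\n", ((float*)TF_TensorData(out))[0]);
    TF_DeleteTensor(out);
  }

  TFE_DeleteTensorHandle(retvals[0]);
  TFE_DeleteTensorHandle(h);
  TFE_DeleteContext(ctx, status);
  TF_DeleteStatus(status);
  return 0;
}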
-extern void TFE_OpSetDevice(TFE_Op* op, TFE_Context* ctx, - const char* device_name, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, TFE_Context* ctx, + const char* device_name, TF_Status* status); -extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status); -extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, - unsigned char* is_list, TF_Status* status); +TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, + unsigned char* is_list, TF_Status* status); -extern void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, - const char* value); -extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value); -extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value); -extern void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, - unsigned char value); -extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, - TF_DataType value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, + const char* value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, + unsigned char value); +TF_CAPI_EXPORT extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, + TF_DataType value); // If the number of dimensions is unknown, `num_dims` must be set to // -1 and `dims` can be null. If a dimension is unknown, the // corresponding entry in the `dims` array must be -1. -extern void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, - const int64_t* dims, const int num_dims, - TF_Status* out_status); +TF_CAPI_EXPORT extern void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, + const int64_t* dims, const int num_dims, + TF_Status* out_status); -extern void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, - const char** value, int num_values); -extern void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, - const int64_t* values, int num_values); -extern void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name, - const float* values, int num_values); -extern void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, - const unsigned char* values, int num_values); -extern void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, - const TF_DataType* values, int num_values); -extern void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, - const int64_t** dims, const int* num_dims, - int num_values, TF_Status* out_status); +TF_CAPI_EXPORT extern void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, + const char** value, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, + const int64_t* values, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name, + const float* values, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, + const unsigned char* values, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, + const TF_DataType* values, int num_values); +TF_CAPI_EXPORT extern void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, + const int64_t** dims, const int* 
num_dims, + int num_values, TF_Status* out_status); // Execute the operation defined by 'op' and return handles to computed // tensors in 'retvals'. @@ -128,14 +148,14 @@ extern void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, // // On return, 'num_retvals' will be set to the actual number of outputs // returned by the operation. -extern void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, - int* num_retvals, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, + int* num_retvals, TF_Status* status); // Add a function (serialized FunctionDef protocol buffer) to ctx so // that it can be invoked using TFE_Execute. -extern void TFE_ContextAddFunctionDef(TFE_Context* ctx, - const char* serialized_function_def, - size_t size, TF_Status* status); +TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef(TFE_Context* ctx, + const char* serialized_function_def, + size_t size, TF_Status* status); #ifdef __cplusplus } /* end extern "C" */ diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 6f5c21c9472..d19583a3abe 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -34,7 +34,9 @@ TFE_TensorHandle* TestMatrixTensorHandle() { TF_Tensor* t = TF_AllocateTensor( TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); - TFE_TensorHandle* th = TFE_NewTensorHandle(t); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TF_DeleteTensor(t); return th; } @@ -383,7 +385,8 @@ TFE_TensorHandle* CreateVariable(TFE_Context* ctx, float value, memcpy(TF_TensorData(t.get()), &value, TF_TensorByteSize(t.get())); std::unique_ptr - value_handle(TFE_NewTensorHandle(t.get()), TFE_DeleteTensorHandle); + value_handle(TFE_NewTensorHandle(t.get(), status), TFE_DeleteTensorHandle); + if (TF_GetCode(status) != TF_OK) return nullptr; TFE_OpAddInput(op, value_handle.get(), status); if (TF_GetCode(status) != TF_OK) return nullptr; diff --git a/tensorflow/c/version_script.lds b/tensorflow/c/version_script.lds index 9bdc6dcc2e0..c352a1440d1 100644 --- a/tensorflow/c/version_script.lds +++ b/tensorflow/c/version_script.lds @@ -2,6 +2,7 @@ VERS_1.0 { # Export symbols in c_api.h. global: *TF_*; + *TFE_*; # Hide everything else. local: diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc index 66a943410e2..1868207148d 100644 --- a/tensorflow/cc/framework/gradients.cc +++ b/tensorflow/cc/framework/gradients.cc @@ -77,6 +77,10 @@ class SymbolicGradientBuilder { Status CallGradFunction(const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs); + + // Returns a list mapping whether each node in the graph is reachable + // from outputs_. Keyed by node id. 
+ std::vector GetReachableNodes(); const Scope& scope_; const ops::GradOpRegistry* registry_; @@ -143,11 +147,36 @@ Status SymbolicGradientBuilder::BackpropAlongEdge(const Output& dst_grad, return Status::OK(); } +std::vector SymbolicGradientBuilder::GetReachableNodes() { + std::vector reachable_nodes(scope_.graph()->num_node_ids(), false); + std::deque queue; + for (const Output& out : outputs_) { + if (!reachable_nodes[out.node()->id()]) { + queue.push_back(out.node()); + reachable_nodes[out.node()->id()] = true; + } + } + + while (!queue.empty()) { + Node* n = queue.front(); + queue.pop_front(); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) continue; + queue.push_back(e->src()); + reachable_nodes[e->src()->id()] = true; + } + } + return reachable_nodes; +} + Status SymbolicGradientBuilder::Initialize() { if (outputs_.size() != grad_inputs_.size()) { return errors::InvalidArgument( "Must specify a gradient input for each output."); } + std::vector reachable_nodes = GetReachableNodes(); + // TODO(theflofly) Check that inputs_ are reachable from + // outputs_ using reachable_nodes grad_outputs_->clear(); grad_outputs_->resize(inputs_.size()); // Populate `output_nodes_` from node ids in `outputs_`. @@ -188,12 +217,15 @@ Status SymbolicGradientBuilder::Initialize() { if (output_nodes_.find(n->id()) == output_nodes_.end()) { // Internal node: continue BFS along connected outputs. for (const Edge* e : n->out_edges()) { - if (e->IsControlEdge()) continue; - ++num_expected_backprops; + // If a node is not reachable from outputs_, + // we don't expect it to receive a backpropagated gradient. + // It will not be counted in num_expected_backprops. + if (e->IsControlEdge() || !reachable_nodes[e->dst()->id()]) continue; if (visited.find(e->dst()) == visited.end()) { queue.push_back(e->dst()); visited.insert(e->dst()); } + ++num_expected_backprops; } } else { // Output node: stop BFS and update `num_expected_backprops` for diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc index 24af7d567b2..032ab936235 100644 --- a/tensorflow/cc/framework/gradients_test.cc +++ b/tensorflow/cc/framework/gradients_test.cc @@ -364,6 +364,73 @@ TEST_F(GradientsTest, MultipleNodeOutputGrads) { test::AsTensor({60, 61, 62, 63, 66, 66, 66, 67}, {4, 2})); } +TEST_F(GradientsTest, UnreachableEdgeGradOneOutput) { + auto x = Variable(scope_test_, {2, 3}, DT_DOUBLE); + auto x_const = Const(scope_test_, {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}); + auto x_assign = Assign(scope_test_, x, x_const); + + auto y = Variable(scope_test_, {3, 1}, DT_DOUBLE); + auto y_const = Const(scope_test_, {{1.0}, {2.0}, {3.0}}); + auto y_assign = Assign(scope_test_, y, y_const); + + auto m1 = MatMul(scope_test_, x, y); + + auto z = Variable(scope_test_, {1, 3}, DT_DOUBLE); + auto z_const = Const(scope_test_, {{9.0, 10.0, 11.0}}); + auto z_assign = Assign(scope_test_, z, z_const); + + auto m2 = MatMul(scope_test_, y, z); + + auto dm1 = Const(scope_test_, {{0.5}, {0.5}}); + + std::vector grad_outputs; + TF_ASSERT_OK( + AddSymbolicGradients(scope_test_, {m1}, {y}, {dm1}, &grad_outputs)); + + std::vector outputs; + test::GetTensors(scope_test_, {x_assign, y_assign, z_assign}, + {grad_outputs[0]}, &outputs); + // dz/dy = xT * dm1 + test::ExpectTensorNear( + outputs[0], test::AsTensor({2.5, 3.5, 4.5}, {3, 1}), 1e-5); +} + +TEST_F(GradientsTest, UnreachableEdgeGradTwoOutputs) { + auto x = Variable(scope_test_, {2, 3}, DT_DOUBLE); + auto x_const = Const(scope_test_, {{1.0, 2.0, 3.0}, {4.0, 5.0, 
6.0}}); + auto x_assign = Assign(scope_test_, x, x_const); + + auto y = Variable(scope_test_, {3, 1}, DT_DOUBLE); + auto y_const = Const(scope_test_, {{1.0}, {2.0}, {3.0}}); + auto y_assign = Assign(scope_test_, y, y_const); + + auto m1 = MatMul(scope_test_, x, y); + + auto z = Variable(scope_test_, {1, 3}, DT_DOUBLE); + auto z_const = Const(scope_test_, {{9.0, 10.0, 11.0}}); + auto z_assign = Assign(scope_test_, z, z_const); + + auto m2 = MatMul(scope_test_, y, z); + + auto dm1 = Const(scope_test_, {{0.5}, {0.5}}); + auto dm2 = + Const(scope_test_, {{0.5, 0.5, 0.5}, {0.6, 0.7, 0.8}, {0.6, 0.7, 0.9}}); + + std::vector grad_outputs; + TF_ASSERT_OK(AddSymbolicGradients(scope_test_, {m1, m2}, {y}, {dm1, dm2}, + &grad_outputs)); + + std::vector outputs; + test::GetTensors(scope_test_, {x_assign, y_assign, z_assign}, + {grad_outputs[0]}, &outputs); + + // the gradients from m1 and m2 will be summed to compute the gradient + // w.r.t y + // dz/dy = xT * dm1 + dm2 * zT + test::ExpectTensorNear( + outputs[0], test::AsTensor({17.5, 24.7, 26.8}, {3, 1}), 1e-5); +} + // StopGradientSingleOutputMultiEdgeTest tests combinations of valid and // 'NoGradient' (induced by StopGradient op) returned along multiple edges from // a single nodes output. diff --git a/tensorflow/cc/framework/testutil.cc b/tensorflow/cc/framework/testutil.cc index ca78f31db51..25ee08f6762 100644 --- a/tensorflow/cc/framework/testutil.cc +++ b/tensorflow/cc/framework/testutil.cc @@ -36,5 +36,19 @@ void GetTensor(const Scope& scope, Output tensor, Tensor* out) { *out = outputs[0]; } +void GetTensors(const Scope& scope, const std::vector& assign_vars, + OutputList tensors, std::vector* out) { + ClientSession session(scope); + TF_CHECK_OK(session.Run(assign_vars, nullptr)); + TF_CHECK_OK(session.Run(tensors, out)); +} + +void GetTensor(const Scope& scope, const std::vector& assign_vars, + Output tensor, Tensor* out) { + std::vector outputs; + GetTensors(scope, assign_vars, {std::move(tensor)}, &outputs); + *out = outputs[0]; +} + } // end namespace test } // end namespace tensorflow diff --git a/tensorflow/cc/framework/testutil.h b/tensorflow/cc/framework/testutil.h index d027ad3744d..ca57c0f0a40 100644 --- a/tensorflow/cc/framework/testutil.h +++ b/tensorflow/cc/framework/testutil.h @@ -26,9 +26,21 @@ namespace test { void GetTensors(const Scope& scope, OutputList tensors, std::vector* out); +// Computes the outputs listed in 'tensors', returns the tensors in 'out'. +// assign_vars are extra outputs that should be run +// e.g. to assign values to variables. +void GetTensors(const Scope& scope, const std::vector& assign_vars, + OutputList tensors, std::vector* out); + /// Computes the output 'tensor', returning the resulting tensor in 'out'. void GetTensor(const Scope& scope, Output tensor, Tensor* out); +// Computes the output 'tensor', returning the resulting tensor in 'out'. +// assign_vars are extra outputs that should be run +// e.g. to assign values to variables. 
+void GetTensor(const Scope& scope, const std::vector& assign_vars, + Output tensor, Tensor* out); + } // namespace test } // namespace tensorflow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 09a15fbe5f1..d90654f2e9a 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -687,6 +687,72 @@ Status MeanGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Mean", MeanGrad); +Status MinOrMaxGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // The partial derivative for any input along a "reduced" dimension + // is 1 when it is the min (or max) and 0 everywhere else. So the + // gradient calculation is identical for both operators. + // + // There's a special case for propagating gradients when there are + // multiple minima (or maxima) - we choose to divide the gradient + // equally among all matching inputs. + // + // Please note this comment + // https://github.com/tensorflow/tensorflow/issues/4886#issuecomment-256836063 + // for details. + + // Running example: + // input: [[5, 5, 5], + // [1, 2, -3]] + // reduction_indices: [1] + auto input = op.input(0); + auto reduction_indices = op.input(1); + + // [2, 3] + auto input_shape = Shape(scope, input); + + // [2, 1] + auto output_shape_kept_dims = + ReducedShapeHelper(scope, input_shape, reduction_indices); + + // for op=min (say) + // output = [5, -3] + // y = [[5], + // [-3]] + auto y = Reshape(scope, op.output(0), output_shape_kept_dims); + + // reshape([g1, g2], [2, 1]) = [[g1], + // [g2]] + auto grad = Reshape(scope, grad_inputs[0], output_shape_kept_dims); + + // indicators = equal(y, input) + // = equal([[5], [[5, 5, 5], + // [-3]], [1, 2, -3]]) + // = [[1, 1, 1], + // [0, 0, 1]] + auto indicators = Cast(scope, Equal(scope, y, input), grad_inputs[0].type()); + + // [[3], + // [1]] + auto num_selected = Reshape(scope, Sum(scope, indicators, reduction_indices), + output_shape_kept_dims); + + // [[1/3, 1/3, 1/3], + // [0, 0, 1]] + auto scale = Div(scope, indicators, num_selected); + + // [[g1/3, g1/3, g1/3], + // [0, 0, g2]] + grad_outputs->push_back(Mul(scope, scale, grad)); + + // Stop propagation along reduction_indices + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("Min", MinOrMaxGrad); +REGISTER_GRADIENT_OP("Max", MinOrMaxGrad); + // MatMulGrad helper function used to compute two MatMul operations // based on input matrix transposition combinations. 
Status MatMulGradHelper(const Scope& scope, const bool is_batch, diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index 62b59b25c7a..5b1558dd820 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -955,6 +955,55 @@ TEST_F(NaryGradTest, Mean) { RunTest({x}, {x_shape}, {y}, {y_shape}); } +TEST_F(NaryGradTest, Min) { + TensorShape x_shape({2, 3}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto y = Min(scope_, x, {-1}); + // y's shape is the result of reducing x along axes -1 (= 1) + TensorShape y_shape({2}); + Tensor x_init_value = + test::AsTensor({0.5f, 0.7f, 0.2f, 1.0f, 1.5f, -2.8f}, x_shape); + RunTest(x, x_init_value, y, y_shape); +} + +TEST_F(NaryGradTest, Max) { + TensorShape x_shape({2, 3}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto y = Max(scope_, x, {-1}); + // y's shape is the result of reducing x along axes -1 (= 1) + TensorShape y_shape({2}); + Tensor x_init_value = + test::AsTensor({0.5f, 0.7f, 0.2f, 1.0f, 1.5f, -2.8f}, x_shape); + RunTest(x, x_init_value, y, y_shape); +} + +TEST_F(NaryGradTest, MinMulti) { + // Test gradient when there are multiple minima. + // Note that we cannot directly use a test Tensor with multiple + // minima, as the numeric estimator will calculate incorrect + // gradients when perturbing each entry in the Tensor (which then + // changes how many minima exist.) + // Instead, we use a single input that broadcast-multiplies a larger + // tensor with equal values, and apply reduce_min to the multiplied + // result. + TensorShape x_shape({1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto all_same = Mul(scope_, Const(scope_, {1.f, 1.f, 1.f}), x); + auto y = Min(scope_, all_same, {0}); + // y is a [3] shaped tensor reduced along dimension 0, so it is [1] shaped + TensorShape y_shape({1}); + RunTest({x}, {x_shape}, {y}, {y_shape}); +} + +TEST_F(NaryGradTest, MaxMulti) { + TensorShape x_shape({1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto all_same = Mul(scope_, Const(scope_, {1.f, 1.f, 1.f}), x); + auto y = Max(scope_, all_same, {0}); + TensorShape y_shape({1}); + RunTest({x}, {x_shape}, {y}, {y_shape}); +} + TEST_F(NaryGradTest, AddN) { TensorShape shape({3, 2, 5}); std::vector xs; diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index e349aefd4cb..e6862f0d9dd 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -52,6 +52,12 @@ class BinaryOpsTest(XLATestCase): def testFloatOps(self): for dtype in self.float_types: + self._testBinary( + lambda x, y: math_ops.approximate_equal(x, y, tolerance=0.0001), + np.array([[[[-1, 2.00009999], [-3, 4.01]]]], dtype=dtype), + np.array([[[[-1.001, 2], [-3.00009, 4]]]], dtype=dtype), + expected=np.array([[[[False, True], [True, False]]]], dtype=dtype)) + self._testBinary( gen_math_ops._real_div, np.array([3, 3, -1.5, -8, 44], dtype=dtype), @@ -82,6 +88,12 @@ class BinaryOpsTest(XLATestCase): dtype(4), expected=np.array([[16], [81]], dtype=dtype)) + self._testBinary( + gen_math_ops._reciprocal_grad, + np.array([4, -3, -2, 1], dtype=dtype), + np.array([5, -6, 7, -8], dtype=dtype), + expected=np.array([-80, 54, -28, 8], dtype=dtype)) + self._testBinary( gen_math_ops._sigmoid_grad, np.array([4, 3, 2, 1], dtype=dtype), @@ -107,6 +119,13 @@ class BinaryOpsTest(XLATestCase): 
expected=np.array( [3.97322869, 2.99258232, 1.99817801, 0.99966466], dtype=dtype)) + self._testBinary( + gen_nn_ops._softsign_grad, + np.array([4, 3, 2, 1], dtype=dtype), + np.array([5, 6, 7, 8], dtype=dtype), + expected=np.array( + [0.11111111, 0.06122449, 0.03125, 0.01234568], dtype=dtype)) + self._testBinary( gen_math_ops._tanh_grad, np.array([4, 3, 2, 1], dtype=dtype), diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index a342e37e0ee..49c1699b6ed 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -888,6 +888,16 @@ TEST_F(OpTest, Any) { }); } +TEST_F(OpTest, ApproximateEqual) { + Repeatedly([this]() { + auto dims = RandomDims(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ApproximateEqual") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); + }); +} + TEST_F(OpTest, Asinh) { Repeatedly([this]() { return ExpectTfAndXlaOutputsAreClose( @@ -1662,11 +1672,9 @@ TEST_F(OpTest, GreaterEqual) { TEST_F(OpTest, L2Loss) { Repeatedly([this]() { - DataType type = Choose({DT_INT32, DT_FLOAT}); - // TODO(b/31644876): scalars currently crash. - return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("L2Loss") - .RandomInput(type, RandomDims(1)) - .Attr("T", type)); + DataType type = DT_FLOAT; + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("L2Loss").RandomInput(type).Attr("T", type)); }); } @@ -2165,6 +2173,15 @@ TEST_F(OpTest, Reciprocal) { }); } +TEST_F(OpTest, ReciprocalGrad) { + Repeatedly([this]() { + std::vector dims = RandomDims(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("ReciprocalGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); + }); +} TEST_F(OpTest, Relu) { Repeatedly([this]() { return ExpectTfAndXlaOutputsAreClose( @@ -2250,6 +2267,13 @@ TEST_F(OpTest, ReverseV2) { }); } +TEST_F(OpTest, Rint) { + Repeatedly([this]() { + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Rint").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); + }); +} + TEST_F(OpTest, Round) { Repeatedly([this]() { return ExpectTfAndXlaOutputsAreClose( @@ -2402,6 +2426,23 @@ TEST_F(OpTest, SoftplusGrad) { }); } +TEST_F(OpTest, Softsign) { + Repeatedly([this]() { + return ExpectTfAndXlaOutputsAreClose( + OpTestBuilder("Softsign").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT)); + }); +} + +TEST_F(OpTest, SoftsignGrad) { + Repeatedly([this]() { + std::vector dims = RandomDims(); + return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SoftsignGrad") + .RandomInput(DT_FLOAT, dims) + .RandomInput(DT_FLOAT, dims) + .Attr("T", DT_FLOAT)); + }); +} + TEST_F(OpTest, SpaceToBatch) { Repeatedly([this]() { std::vector block_dims = RandomDims(4, 4, 0, 5); diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index ca2a438005f..b21f1998a5d 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import unittest + import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin @@ -161,12 +163,17 @@ class UnaryOpsTest(XLATestCase): np.array([[-1.7, 1.2]], dtype=dtype), expected=np.array([[-2, 1]], dtype=dtype)) + self._assertOpOutputMatchesExpected( + math_ops.is_finite, + np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], + dtype=dtype), + expected=np.array([[0, 1, 1, 1, 
1, 1, 1, 0, 0]], dtype=np.bool)) + # Tests for tf.nn ops. self._assertOpOutputMatchesExpected( nn_ops.l2_loss, np.array([[[]]], dtype=dtype), expected=dtype(0)) - # TODO(b/31644876): enable this test case when fixed. - # self._assertOpOutputMatchesExpected(tf.nn.l2_loss, dtype(4), dtype(10)) + self._assertOpOutputMatchesExpected(nn_ops.l2_loss, dtype(4), dtype(8)) self._assertOpOutputMatchesExpected( nn_ops.l2_loss, np.array([[-2, 4]], dtype=dtype), expected=dtype(10)) @@ -198,6 +205,12 @@ class UnaryOpsTest(XLATestCase): np.array([[1e-14, 1e-15, 0.6]], dtype=dtype), expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype))) + self._assertOpOutputMatchesExpected( + math_ops.rint, + np.array([[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5], + [0.5, 1.5, 2.5, 3.5]], dtype=dtype), + expected=np.array([[-2, 1, 4, 0], [-4, -2, -2, 0], [0, 2, 2, 4]], + dtype=dtype)) self._assertOpOutputMatchesExpected( math_ops.round, np.array([[-1.7, 1.2, 4.0, 0.0], [-3.5, -2.5, -1.5, -0.5], @@ -301,6 +314,12 @@ class UnaryOpsTest(XLATestCase): np.array([[-2, 0, 8]], dtype=dtype), expected=np.array([[0.126928, 0.6931472, 8.0003354]], dtype=dtype)) + self._assertOpOutputMatchesExpected( + nn_ops.softsign, + np.array([[-2, -1, 0, 1, 2]], dtype=dtype), + expected=np.array([[-0.66666669, -0.5, 0, 0.5, 0.66666669]], + dtype=dtype)) + self._assertOpOutputMatchesExpected( math_ops.is_finite, np.array( @@ -335,6 +354,23 @@ class UnaryOpsTest(XLATestCase): np.array([[4, 3], [2, 1]], dtype=dtype), expected=np.array([[1, 1], [1, 1]], dtype=dtype)) + # TODO(phawkins): these tests fail unless fastmath optimizations + # are disabled. Use more robust IsInf/IsNaN detection and enable these + # tests. + @unittest.skip("test case fails in fast-math mode") + def testIsInfAndIsNan(self): + for dtype in self.float_types: + self._assertOpOutputMatchesExpected( + math_ops.is_inf, + np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], + dtype=dtype), + expected=np.array([[1, 0, 0, 0, 0, 0, 0, 1, 0]], dtype=np.bool)) + self._assertOpOutputMatchesExpected( + math_ops.is_nan, + np.array([[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], + dtype=dtype), + expected=np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=np.bool)) + def testLogicalOps(self): self._assertOpOutputMatchesExpected( math_ops.logical_not, diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index d09e721c936..6e6c5dc17f5 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -31,7 +31,6 @@ tf_kernel_library( "function_ops.cc", "gather_op.cc", "identity_op.cc", - "is_finite_op.cc", "l2loss_op.cc", "lrn_ops.cc", "matmul_op.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index f9bb1e2fb1d..58538b45137 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -102,6 +102,7 @@ XLA_MAKE_BINARY(Mod, b->Rem(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(Maximum, b->Max(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(Minimum, b->Min(lhs, rhs, extend_dimensions)); XLA_MAKE_BINARY(RealDiv, b->Div(lhs, rhs, extend_dimensions)); +XLA_MAKE_BINARY(ReciprocalGrad, b->Neg(b->Mul(rhs, b->Mul(lhs, lhs)))); XLA_MAKE_BINARY( RsqrtGrad, b->Mul(b->Pow(lhs, XlaHelpers::IntegerLiteral(b, input_type(0), 3)), @@ -140,6 +141,11 @@ XLA_MAKE_BINARY(SoftplusGrad, b->Div(lhs, b->Add(b->Exp(b->Neg(rhs)), XlaHelpers::One(b, input_type(1))))); +// softsigngrad(gradients, 
features) = gradients / (1 + abs(features)) ** 2
+XLA_MAKE_BINARY(SoftsignGrad,
+ b->Div(lhs, Square(b, b->Add(XlaHelpers::One(b, input_type(0)),
+ b->Abs(rhs)))));
+
XLA_MAKE_BINARY(TanhGrad, b->Mul(rhs, b->Sub(XlaHelpers::One(b, input_type(0)),
b->Mul(lhs, lhs))));
@@ -147,5 +153,24 @@ XLA_MAKE_BINARY(Pow, b->Pow(lhs, rhs, extend_dimensions));
#undef XLA_MAKE_BINARY
+class ApproximateEqualOp : public XlaOpKernel {
+ public:
+ explicit ApproximateEqualOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("tolerance", &tolerance_));
+ }
+
+ // Returns true element-wise where |x - y| < tolerance.
+ void Compile(XlaOpKernelContext* ctx) override {
+ xla::ComputationBuilder* b = ctx->builder();
+ auto result = b->Lt(b->Abs(b->Sub(ctx->Input(0), ctx->Input(1))),
+ XlaHelpers::FloatLiteral(b, input_type(0), tolerance_));
+ ctx->SetOutput(0, result);
+ }
+
+ private:
+ float tolerance_;
+};
+REGISTER_XLA_OP(Name("ApproximateEqual"), ApproximateEqualOp);
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/is_finite_op.cc b/tensorflow/compiler/tf2xla/kernels/is_finite_op.cc
deleted file mode 100644
index 788dcee5443..00000000000
--- a/tensorflow/compiler/tf2xla/kernels/is_finite_op.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/bcast.h"
-
-namespace tensorflow {
-namespace {
-
-class IsFiniteOp : public XlaOpKernel {
- public:
- explicit IsFiniteOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-
- void Compile(XlaOpKernelContext* ctx) override {
- xla::ComputationDataHandle input = ctx->Input(0);
- ctx->SetOutput(0, ctx->builder()->IsFinite(input));
- }
-
- private:
- TF_DISALLOW_COPY_AND_ASSIGN(IsFiniteOp);
-};
-
-REGISTER_XLA_OP(Name("IsFinite"), IsFiniteOp);
-
-} // anonymous namespace
-} // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index 7b39f0533b4..6b8f5ec7b33 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -73,8 +73,12 @@ XLAJIT_MAKE_UNARY(Exp, b->Exp(x));
XLAJIT_MAKE_UNARY(Expm1, b->Sub(b->Exp(x), XlaHelpers::One(b, input_type(0))));
XLAJIT_MAKE_UNARY(Floor, b->Floor(x));
-// Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0. 
-XLAJIT_MAKE_UNARY(Sign, b->Sign(x)); +XLAJIT_MAKE_UNARY(IsFinite, b->IsFinite(x)); +XLAJIT_MAKE_UNARY(IsInf, b->Eq(b->Abs(x), + XlaHelpers::FloatLiteral( + b, input_type(0), + std::numeric_limits::infinity()))); +XLAJIT_MAKE_UNARY(IsNan, b->Ne(x, x)); // Return 1/x XLAJIT_MAKE_UNARY(Inv, b->Div(XlaHelpers::One(b, input_type(0)), x)); XLAJIT_MAKE_UNARY(Reciprocal, b->Div(XlaHelpers::One(b, input_type(0)), x)); @@ -105,6 +109,12 @@ static xla::ComputationDataHandle Round(xla::ComputationBuilder* b, b->Add(round_val, one), round_val); } +XLAJIT_MAKE_UNARY(Rint, Round(b, input_type(0), x)); +XLAJIT_MAKE_UNARY(Round, Round(b, input_type(0), x)); + +XLAJIT_MAKE_UNARY(Rsqrt, + b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), -0.5))); + // Expresses sigmoid as a rescaled tanh: sigmoid(x) == (tanh(x/2) + 1) / 2. static xla::ComputationDataHandle Sigmoid(xla::ComputationBuilder* b, DataType dtype, @@ -112,16 +122,19 @@ static xla::ComputationDataHandle Sigmoid(xla::ComputationBuilder* b, auto half = XlaHelpers::FloatLiteral(b, dtype, 0.5); return b->Add(half, b->Mul(half, b->Tanh(b->Mul(half, x)))); } - -XLAJIT_MAKE_UNARY(Round, Round(b, input_type(0), x)); -XLAJIT_MAKE_UNARY(Rsqrt, - b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), -0.5))); XLAJIT_MAKE_UNARY(Sigmoid, Sigmoid(b, input_type(0), x)); + +// Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0. +XLAJIT_MAKE_UNARY(Sign, b->Sign(x)); XLAJIT_MAKE_UNARY(Sinh, b->Mul(b->Sub(b->Exp(x), b->Exp(b->Neg(x))), XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); XLAJIT_MAKE_UNARY(Softplus, b->Log(b->Add(b->Exp(x), XlaHelpers::One(b, input_type(0))))); +// softsign(x) = x / (abs(x) + 1) +XLAJIT_MAKE_UNARY(Softsign, + b->Div(x, + b->Add(b->Abs(x), XlaHelpers::One(b, input_type(0))))); XLAJIT_MAKE_UNARY(Sqrt, b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); XLAJIT_MAKE_UNARY(Square, b->Mul(x, x)); diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 7e59b70abe0..98cc3401c14 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -847,6 +847,7 @@ cc_test( srcs = ["hlo_ordering_test.cc"], deps = [ ":hlo", + ":hlo_dataflow_analysis", ":hlo_ordering", ":hlo_scheduling", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc index bef4ecd480d..40fa3a67bde 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc @@ -241,7 +241,7 @@ Status Executor::Run() { completion_queue_.pop_front(); break; } - } while (1); + } while (true); TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, assignment_->GetUniqueTopLevelSlice(instruction)); void* result_buffer = diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc index 669ebb55bec..6efd0bcee58 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc @@ -24,16 +24,14 @@ limitations under the License. 
namespace xla { -Status DfsHloVisitor::HandleElementwiseUnary(HloInstruction* hlo, - HloOpcode opcode) { +Status DfsHloVisitor::HandleElementwiseUnary(HloInstruction* hlo) { return Unimplemented("DfsHloVisitor::HandleElementwiseUnary: %s", - HloOpcodeString(opcode).c_str()); + HloOpcodeString(hlo->opcode()).c_str()); } -Status DfsHloVisitor::HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) { +Status DfsHloVisitor::HandleElementwiseBinary(HloInstruction* hlo) { return Unimplemented("DfsHloVisitor::HandleElementwiseBinary: %s", - HloOpcodeString(opcode).c_str()); + HloOpcodeString(hlo->opcode()).c_str()); } DfsHloVisitor::VisitState DfsHloVisitor::GetVisitState( diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index a1a3a882c7a..2f21043a1d3 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -63,37 +63,37 @@ class DfsHloVisitor { // These routines are self-descriptive, see class comment for usage // information. - virtual Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode); - virtual Status HandleElementwiseBinary(HloInstruction* hlo, HloOpcode opcode); + virtual Status HandleElementwiseUnary(HloInstruction* hlo); + virtual Status HandleElementwiseBinary(HloInstruction* hlo); virtual Status HandleClamp(HloInstruction* clamp, HloInstruction* min, HloInstruction* arg, HloInstruction* max) = 0; virtual Status HandleSelect(HloInstruction* select, HloInstruction* pred, HloInstruction* on_true, HloInstruction* on_false) = 0; virtual Status HandleMaximum(HloInstruction* maximum) { - return HandleElementwiseBinary(maximum, HloOpcode::kMaximum); + return HandleElementwiseBinary(maximum); } virtual Status HandleMinimum(HloInstruction* minimum) { - return HandleElementwiseBinary(minimum, HloOpcode::kMinimum); + return HandleElementwiseBinary(minimum); } virtual Status HandleConcatenate( HloInstruction* concatenate, tensorflow::gtl::ArraySlice operands) = 0; virtual Status HandleConvert(HloInstruction* convert) { - return HandleElementwiseUnary(convert, HloOpcode::kConvert); + return HandleElementwiseUnary(convert); } virtual Status HandleCopy(HloInstruction* copy) { - return HandleElementwiseUnary(copy, HloOpcode::kCopy); + return HandleElementwiseUnary(copy); } virtual Status HandleMultiply(HloInstruction* multiply, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(multiply, HloOpcode::kMultiply); + return HandleElementwiseBinary(multiply); } virtual Status HandleDot(HloInstruction* dot, HloInstruction* lhs, HloInstruction* rhs) = 0; virtual Status HandlePower(HloInstruction* power, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(power, HloOpcode::kPower); + return HandleElementwiseBinary(power); } virtual Status HandleConvolution(HloInstruction* convolution, HloInstruction* lhs, HloInstruction* rhs, @@ -101,73 +101,72 @@ class DfsHloVisitor { virtual Status HandleCrossReplicaSum(HloInstruction* crs) = 0; virtual Status HandleCompare(HloInstruction* compare, HloOpcode opcode, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(compare, opcode); + return HandleElementwiseBinary(compare); } virtual Status HandleAdd(HloInstruction* add, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(add, HloOpcode::kAdd); + return HandleElementwiseBinary(add); } virtual Status HandleDivide(HloInstruction* divide, HloInstruction* lhs, 
HloInstruction* rhs) { - return HandleElementwiseBinary(divide, HloOpcode::kDivide); + return HandleElementwiseBinary(divide); } virtual Status HandleRemainder(HloInstruction* remainder, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(remainder, HloOpcode::kRemainder); + return HandleElementwiseBinary(remainder); } virtual Status HandleSubtract(HloInstruction* subtract, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(subtract, HloOpcode::kSubtract); + return HandleElementwiseBinary(subtract); } virtual Status HandleAbs(HloInstruction* abs, HloInstruction* operand) { - return HandleElementwiseUnary(abs, HloOpcode::kAbs); + return HandleElementwiseUnary(abs); } virtual Status HandleSign(HloInstruction* sign, HloInstruction* operand) { - return HandleElementwiseUnary(sign, HloOpcode::kSign); + return HandleElementwiseUnary(sign); } virtual Status HandleNegate(HloInstruction* negate, HloInstruction* operand) { - return HandleElementwiseUnary(negate, HloOpcode::kNegate); + return HandleElementwiseUnary(negate); } virtual Status HandleExp(HloInstruction* exp, HloInstruction* operand) { - return HandleElementwiseUnary(exp, HloOpcode::kExp); + return HandleElementwiseUnary(exp); } virtual Status HandleFloor(HloInstruction* floor, HloInstruction* operand) { - return HandleElementwiseUnary(floor, HloOpcode::kFloor); + return HandleElementwiseUnary(floor); } virtual Status HandleCeil(HloInstruction* ceil, HloInstruction* operand) { - return HandleElementwiseUnary(ceil, HloOpcode::kCeil); + return HandleElementwiseUnary(ceil); } virtual Status HandleLog(HloInstruction* log, HloInstruction* operand) { - return HandleElementwiseUnary(log, HloOpcode::kLog); + return HandleElementwiseUnary(log); } virtual Status HandleCos(HloInstruction* cos, HloInstruction* operand) { - return HandleElementwiseUnary(cos, HloOpcode::kCos); + return HandleElementwiseUnary(cos); } virtual Status HandleSin(HloInstruction* sin, HloInstruction* operand) { - return HandleElementwiseUnary(sin, HloOpcode::kSin); + return HandleElementwiseUnary(sin); } virtual Status HandleTanh(HloInstruction* tanh, HloInstruction* operand) { - return HandleElementwiseUnary(tanh, HloOpcode::kTanh); + return HandleElementwiseUnary(tanh); } virtual Status HandleIsFinite(HloInstruction* is_finite, HloInstruction* operand) { - return HandleElementwiseUnary(is_finite, HloOpcode::kIsFinite); + return HandleElementwiseUnary(is_finite); } virtual Status HandleLogicalAnd(HloInstruction* logical_and, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(logical_and, HloOpcode::kLogicalAnd); + return HandleElementwiseBinary(logical_and); } virtual Status HandleLogicalNot(HloInstruction* logical_not, HloInstruction* operand) { - return HandleElementwiseUnary(logical_not, HloOpcode::kLogicalNot); + return HandleElementwiseUnary(logical_not); } virtual Status HandleLogicalOr(HloInstruction* logical_or, HloInstruction* lhs, HloInstruction* rhs) { - return HandleElementwiseBinary(logical_or, HloOpcode::kLogicalOr); + return HandleElementwiseBinary(logical_or); } virtual Status HandleReducePrecision(HloInstruction* reduce_precision) { - return HandleElementwiseUnary(reduce_precision, - HloOpcode::kReducePrecision); + return HandleElementwiseUnary(reduce_precision); } virtual Status HandleInfeed(HloInstruction* infeed) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index 
10f8ae9b044..a5fe1205984 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -41,12 +41,10 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor { // Default action performed on HloInstruction. virtual Status DefaultAction(HloInstruction* hlo_instruction) = 0; - Status HandleElementwiseUnary(HloInstruction* hlo, - HloOpcode opcode) override { + Status HandleElementwiseUnary(HloInstruction* hlo) override { return DefaultAction(hlo); } - Status HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) override { + Status HandleElementwiseBinary(HloInstruction* hlo) override { return DefaultAction(hlo); } diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index b02138325ed..350dbc321fb 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -709,7 +709,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( } else { auto r = ir_builder_->CreateSub(q, p); auto leading_zeros = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::ctlz, {r, ir_builder_->getInt1(1)}, + llvm::Intrinsic::ctlz, {r, ir_builder_->getInt1(true)}, {param_ir_type}, ir_builder_); auto in_block = ir_builder_->GetInsertBlock(); diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index d044462f9a7..5edaaba3ebe 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -334,7 +334,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder_); IrArray::Index input_index(index.size()); - llvm::Value* in_bounds = ir_builder_->getInt1(1); + llvm::Value* in_bounds = ir_builder_->getInt1(true); for (size_t i = 0; i < index.size(); ++i) { llvm::Value* stridden_index = ir_builder_->CreateNSWMul( index[i], ir_builder_->getInt64(window.dimensions(i).stride())); diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 2a999f52f01..2e7765c4c61 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -389,7 +389,7 @@ StatusOr CompileModuleToPtx(llvm::Module* module, // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA // again after the standard optimization passes [http://b/13329423]. - // TODO(jingyue): SROA may further expose more optimization opportunities, such + // TODO(jingyue): SROA may further expose more optimization opportunities such // as more precise alias analysis and more function inlining (SROA may change // the inlining cost of a function). For now, running SROA already emits good // enough code for the evaluated benchmarks. 
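A note on the visitor-interface change above: HandleElementwiseUnary/HandleElementwiseBinary no longer take an HloOpcode argument because the opcode is always recoverable from the instruction itself via hlo->opcode(). Below is a minimal stand-alone sketch of what a derived visitor looks like after this change; the types are simplified stand-ins, not the real XLA classes.

#include <cstdio>

// Simplified stand-ins for the XLA types (illustrative only).
enum class HloOpcode { kAdd, kMultiply, kNegate };
struct HloInstruction {
  HloOpcode op;
  HloOpcode opcode() const { return op; }
};
using Status = int;  // 0 means OK in this sketch.

class DfsHloVisitor {
 public:
  virtual ~DfsHloVisitor() = default;
  // Opcode parameter dropped: handlers read hlo->opcode() directly.
  virtual Status HandleElementwiseBinary(HloInstruction* hlo) {
    std::printf("unimplemented handler for opcode %d\n",
                static_cast<int>(hlo->opcode()));
    return 1;
  }
  virtual Status HandleAdd(HloInstruction* add) {
    return HandleElementwiseBinary(add);  // no HloOpcode::kAdd forwarded
  }
};

class CountingVisitor : public DfsHloVisitor {
 public:
  Status HandleElementwiseBinary(HloInstruction* hlo) override {
    ++binary_ops;
    return 0;
  }
  int binary_ops = 0;
};

int main() {
  HloInstruction add{HloOpcode::kAdd};
  CountingVisitor visitor;
  visitor.HandleAdd(&add);
  std::printf("binary ops visited: %d\n", visitor.binary_ops);
  return 0;
}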
We may want to run more diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc index 0beea423798..3dd8ac6dc5f 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc @@ -37,6 +37,230 @@ namespace xla { using ::tensorflow::strings::StrAppend; using ::tensorflow::strings::StrCat; +// Data structure used to construct the alias analysis. Thrown away after alias +// analysis is complete. This data structure keeps track of which sets of +// HloValues must be in the same HloBuffer. This is maintained as a map from a +// buffer identifier (BufferNumber) to set of HLoValues. +// +// Initially each value is its own buffer. In MergeAliasedBuffers, sets of +// values which must share the same buffer are merged together. The end result +// is a partitioning of all HloValues into sets where each set needs its own +// HloBuffer. By performing this analysis without constructing HloBuffers on the +// fly, we can after-the-fact construct a vector of contiguously numbered +// HloBuffers after the buffer requirement has been determined. +class BufferValueMap { + public: + // A unique identifier for a set of colocated values which must share the same + // buffer. This is not necessarily the same as the HloBuffer::Id which will + // ultimately contain the values. The reason is that HloBuffer::Id's are + // contiguous, while BufferNumbers may not be. BufferNumbers may not be + // dense because buffers may be created and destroyed during the analysis + // construction process. + using BufferNumber = int64; + + explicit BufferValueMap(const HloDataflowAnalysis& dataflow) + : dataflow_(dataflow) { + buffers_.reserve(dataflow_.values().size()); + value_to_buffer_number_.reserve(dataflow_.values().size()); + for (const HloValue* value : dataflow_.values()) { + BufferNumber buffer_number = next_buffer_number_++; + buffers_[buffer_number].insert(value); + value_to_buffer_number_[value] = buffer_number; + } + } + + // Merge together sets of HloValues which must be in the same HloBuffer + // because of aliasing rules (eg, in-place kWhile instruction). + void MergeAliasedBuffers() { + for (const HloValue* value : dataflow_.values()) { + VLOG(3) << "Merging colocated values, value: " << value->ToShortString(); + + // Gather the set of buffers with aliasing rules (eg, kWhile) which this + // value must be contained in. + std::vector aliased_buffers = ComputeAliasedBuffers(*value); + + BufferNumber current_buffer = value_to_buffer_number_.at(value); + if (aliased_buffers.empty()) { + // The buffer containing 'value' aliases no other buffers. If the buffer + // containing 'value' already only contains 'value', then no change is + // necessary. If the buffer containing 'value' does contain other + // values, then remove 'value' from the buffer and create a new buffer + // containing only 'value' + if (buffers_.at(current_buffer).size() == 1) { + CHECK_EQ(*buffers_.at(current_buffer).begin(), value); + } else { + MoveValueToNewBuffer(*value); + } + } else { + // If multiple buffers are aliased merge these buffers together into a + // single buffer (arbitrarily chosen as the first buffer in the vector). 
+ if (aliased_buffers.size() > 1) { + for (int64 i = 1; i < aliased_buffers.size(); ++i) { + MergeBuffers(/*from=*/aliased_buffers[i], + /*to=*/aliased_buffers[0]); + } + } + BufferNumber new_buffer = aliased_buffers[0]; + if (current_buffer != new_buffer) { + MoveValueToBuffer(*value, new_buffer); + } + } + } + } + + // Compute and return a sorted vector of all BufferNumbers. Can be used to + // iterate through all buffers stabily. + std::vector ComputeSortedBufferNumbers() const { + std::vector buffer_numbers; + for (const auto& pair : buffers_) { + buffer_numbers.push_back(pair.first); + } + std::sort(buffer_numbers.begin(), buffer_numbers.end()); + return buffer_numbers; + } + + // Return a set of all the values in the given buffer. + const tensorflow::gtl::FlatSet& GetValuesInBuffer( + BufferNumber buffer_number) const { + return buffers_.at(buffer_number); + } + + private: + // Create a new buffer. + void NewBuffer(const HloValue& value) { + BufferNumber buffer_number = next_buffer_number_++; + buffers_[buffer_number].insert(&value); + value_to_buffer_number_[&value] = buffer_number; + } + + // Move the given value into a new buffer containing only the value. + void MoveValueToNewBuffer(const HloValue& value) { + BufferNumber new_buffer_number = next_buffer_number_++; + buffers_[new_buffer_number]; + MoveValueToBuffer(value, new_buffer_number); + } + + // Move the given value into the given buffer. + void MoveValueToBuffer(const HloValue& value, BufferNumber buffer_number) { + BufferNumber old_buffer_number = value_to_buffer_number_.at(&value); + buffers_.at(old_buffer_number).erase(&value); + if (buffers_.at(old_buffer_number).empty()) { + buffers_.erase(old_buffer_number); + } + + buffers_.at(buffer_number).insert(&value); + value_to_buffer_number_.at(&value) = buffer_number; + } + + // Merge the buffer 'from' into the buffer 'to'. + void MergeBuffers(BufferNumber from, BufferNumber to) { + auto& from_value_set = buffers_.at(from); + buffers_.at(to).insert(from_value_set.begin(), from_value_set.end()); + // NOTE: using a union-find algorithm to hold the colocated values might be + // faster. + for (const HloValue* value : from_value_set) { + value_to_buffer_number_.at(value) = to; + } + buffers_.erase(from); + } + + BufferNumber GetBufferForValue(const HloValue& value) { + return value_to_buffer_number_.at(&value); + } + + // Compute and return a vector of buffers that the given value must be + // contained in due to HLO aliasing rules. + std::vector ComputeAliasedBuffers(const HloValue& value) { + // Value is init of a while (use is while). + std::vector aliased_buffers; + for (const HloUse& use : value.uses()) { + VLOG(1) << "use of value " << value.ToShortString() << ": " << use; + if (use.instruction->opcode() == HloOpcode::kWhile) { + // Determine the while value that this shares a buffer with. + const HloValue& while_value = + dataflow_.GetUniqueValueAt(use.instruction, use.operand_index); + aliased_buffers.push_back(GetBufferForValue(while_value)); + VLOG(3) << " value is init value to a while; must share buffer with " + "while value " + << while_value.ToShortString(); + } + } + + // Value is a parameter of a while body/condition. 
+ if (value.defining_instruction()->opcode() == HloOpcode::kParameter) { + const HloComputation* computation = + value.defining_instruction()->parent(); + const CallGraphNode& call_graph_node = + dataflow_.call_graph().GetNode(computation); + for (const CallSite& callsite : call_graph_node.caller_callsites()) { + if (callsite.instruction()->opcode() == HloOpcode::kWhile) { + // Call graph must have been flattened. + CHECK_EQ(call_graph_node.caller_callsites().size(), 1); + + const HloValue& while_value = dataflow_.GetUniqueValueAt( + callsite.instruction(), value.defining_index()); + VLOG(3) << " value is parameter value of the body or condition of a " + "while; must share buffer with while value " + << while_value.ToShortString(); + aliased_buffers.push_back(GetBufferForValue(while_value)); + } + } + } + + // Value is the root of a while body. + for (const HloPosition& position : value.positions()) { + const HloComputation* computation = position.instruction->parent(); + const CallGraphNode& call_graph_node = + dataflow_.call_graph().GetNode(computation); + if (position.instruction == computation->root_instruction()) { + for (const CallSite& callsite : call_graph_node.caller_callsites()) { + if (callsite.instruction()->opcode() == HloOpcode::kWhile && + callsite.instruction()->while_body() == computation) { + // Call graph must have been flattened. + CHECK_EQ(call_graph_node.caller_callsites().size(), 1); + + const HloValue& while_value = dataflow_.GetUniqueValueAt( + callsite.instruction(), position.index); + VLOG(3) << " value is root the body computation of a while; must " + "share buffer with while value " + << while_value.ToShortString(); + aliased_buffers.push_back(GetBufferForValue(while_value)); + } + } + } + } + + // Value is the output of the while instruction itself. + if (value.defining_instruction()->opcode() == HloOpcode::kWhile) { + VLOG(3) << " value is output of a while instruction"; + aliased_buffers.push_back(GetBufferForValue(value)); + } + + // Uniquify aliased buffers. + std::sort(aliased_buffers.begin(), aliased_buffers.end()); + aliased_buffers.erase( + std::unique(aliased_buffers.begin(), aliased_buffers.end()), + aliased_buffers.end()); + + return aliased_buffers; + } + + // Dataflow analysis used to construct the buffer map. + const HloDataflowAnalysis& dataflow_; + + // A map containing the set of values contained in each buffer. + tensorflow::gtl::FlatMap> + buffers_; + + // A map indicating which buffer each value is contained in. + tensorflow::gtl::FlatMap + value_to_buffer_number_; + + // The buffer number of the next buffer to be created. + BufferNumber next_buffer_number_ = 0; +}; + HloAliasAnalysis::HloAliasAnalysis(HloModule* module) : module_(module) {} const HloBuffer& HloAliasAnalysis::GetUniqueBufferAt( @@ -99,10 +323,11 @@ bool HloAliasAnalysis::InstructionBuffersAreDistinct( } } else { // It's possible for multiple values at this index to have the same - // HloBuffer. This does not result in non-distictness. To account for this - // case, add all of the buffers at this index after checking whether each - // buffer exists at an earlier index. This is a corner case, however, as - // the number of values at an index is almost always one. + // HloBuffer. This does not result in non-distictness. To account for + // this case, add all of the buffers at this index after checking + // whether each buffer exists at an earlier index. This is a corner + // case, however, as the number of values at an index is almost always + // one. 
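On the BufferValueMap introduced above: MergeBuffers merges by copying one set of values into another, and its NOTE suggests a union-find structure might be faster. The sketch below shows that alternative over plain integer value ids, with path compression and union by size; it is an illustration of the suggestion only, not code from this patch.

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

// Hypothetical union-find over value ids; each root identifies one buffer.
class ValueBufferUnionFind {
 public:
  explicit ValueBufferUnionFind(int64_t num_values)
      : parent_(num_values), size_(num_values, 1) {
    std::iota(parent_.begin(), parent_.end(), 0);  // each value is its own buffer
  }

  // Find the buffer representative for a value, with path halving.
  int64_t Find(int64_t v) {
    while (parent_[v] != v) {
      parent_[v] = parent_[parent_[v]];
      v = parent_[v];
    }
    return v;
  }

  // Merge the buffers containing values a and b (union by size).
  void Merge(int64_t a, int64_t b) {
    a = Find(a);
    b = Find(b);
    if (a == b) return;
    if (size_[a] < size_[b]) std::swap(a, b);
    parent_[b] = a;
    size_[a] += size_[b];
  }

 private:
  std::vector<int64_t> parent_;
  std::vector<int64_t> size_;
};

int main() {
  ValueBufferUnionFind uf(5);
  uf.Merge(0, 3);  // e.g. a while init value aliasing the while output value
  uf.Merge(3, 4);  // ... which aliases the body parameter value
  std::printf("0 and 4 share a buffer: %d\n", uf.Find(0) == uf.Find(4));
  std::printf("1 and 4 share a buffer: %d\n", uf.Find(1) == uf.Find(4));
  return 0;
}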
std::vector buffers_at_this_index; for (const HloValue* value : value_set.values()) { const HloBuffer* buffer = &GetBufferContainingValue(*value); @@ -118,15 +343,6 @@ bool HloAliasAnalysis::InstructionBuffersAreDistinct( return true; } -void HloAliasAnalysis::InitializeBufferSets() { - // Initially define a buffer for every HloValue in the module. - for (const HloValue& value : dataflow_analysis_->values()) { - HloBuffer& buffer = NewHloBuffer(); - buffer.AddValue(value); - value_to_buffer_[&value] = &buffer; - } -} - Status HloAliasAnalysis::Verify() const { // Verify consistency between the value_to_buffer_ map and // HloBuffer::values(). @@ -137,9 +353,8 @@ Status HloAliasAnalysis::Verify() const { value) != buffer.values().end()); } - for (const auto& pair : buffers_) { - const HloBuffer::Id id = pair.first; - const HloBuffer& buffer = pair.second; + for (HloBuffer::Id id = 0; id < buffers_.size(); ++id) { + const HloBuffer& buffer = buffers_[id]; TF_RET_CHECK(buffer.id() == id); HloValue::Id last_value_id = -1; @@ -152,116 +367,9 @@ Status HloAliasAnalysis::Verify() const { } } - if (!buffers_vector_.empty()) { - // buffers_vector_ should be a vector of all HloBuffers sorted by id. - std::vector buffers; - for (const auto& id_buffer : buffers_) { - buffers.push_back(&id_buffer.second); - } - std::sort(buffers.begin(), buffers.end(), HloBuffer::IdLessThan); - TF_RET_CHECK(buffers_vector_ == buffers); - } - return Status::OK(); } -Status HloAliasAnalysis::VerifyAgainstReference() const { - TF_RETURN_IF_ERROR(Verify()); - - TF_ASSIGN_OR_RETURN(std::unique_ptr reference, - Run(module_)); - TF_RETURN_IF_ERROR(reference->Verify()); - - VLOG(2) << "This analysis:"; - XLA_VLOG_LINES(2, ToString()); - VLOG(2) << "Reference:"; - XLA_VLOG_LINES(2, reference->ToString()); - - // Create map from HloValue in the reference analysis to HloValue in this - // analysis and vice versa. - tensorflow::gtl::FlatMap reference_to_this; - tensorflow::gtl::FlatMap this_to_reference; - for (const HloValue& value : dataflow_analysis().values()) { - const HloValue& reference_value = - reference->dataflow_analysis().GetValueDefinedAt( - value.defining_instruction(), value.defining_index()); - reference_to_this[&reference_value] = &value; - this_to_reference[&value] = &reference_value; - } - - TF_RET_CHECK(buffers_.size() == reference->buffers_.size()) - << "Different number of buffers (" << buffers_.size() - << " != " << reference->buffers_.size() << ")"; - for (const auto& pair : reference->buffers_) { - const HloBuffer& reference_buffer = pair.second; - - // Find the corresponding buffer in the reference by taking the first value - // in the buffer, finding the corresponding value in the reference, and then - // finding the buffer holding that value. - TF_RET_CHECK(!reference_buffer.values().empty()); - const HloValue* reference_value = reference_buffer.values()[0]; - const HloValue* value = reference_to_this.at(reference_value); - const HloBuffer& buffer = GetBufferContainingValue(*value); - - // The buffer and the reference should have the exact same values. To make - // comparison easy, sort the values in the reference buffer identically to - // the values in the non-reference buffer (ie, by the corresponding id of - // the non-reference value). 
- std::vector reference_values = reference_buffer.values(); - std::sort(reference_values.begin(), reference_values.end(), - [&reference_to_this](const HloValue* a, const HloValue* b) { - return reference_to_this.at(a)->id() < - reference_to_this.at(b)->id(); - }); - TF_RET_CHECK(reference_values.size() == buffer.values().size()); - for (int i = 0; i < buffer.values().size(); ++i) { - TF_RET_CHECK(*reference_values[i] == *buffer.values()[i]) - << "Buffer:\n " << buffer - << "\ndoes not have the same values as reference buffer:\n " - << reference_buffer; - } - } - - return Status::OK(); -} - -HloBuffer& HloAliasAnalysis::NewHloBuffer() { - HloBuffer::Id buffer_id = next_buffer_id_++; - auto emplaced = buffers_.emplace(std::piecewise_construct, - std::forward_as_tuple(buffer_id), - std::forward_as_tuple(buffer_id)); - CHECK(emplaced.second); - - buffers_vector_.clear(); - - return emplaced.first->second; -} - -void HloAliasAnalysis::MoveValueToNewBuffer(const HloValue& value) { - HloBuffer& new_buffer = NewHloBuffer(); - MoveValueToBuffer(value, &new_buffer); - - VLOG(3) << "Moved value " << value.ToShortString() << " into new buffer " - << new_buffer.id(); -} - -void HloAliasAnalysis::MoveValueToBuffer(const HloValue& value, - HloBuffer* buffer) { - HloBuffer& old_buffer = GetBufferContainingValue(value); - CHECK_NE(buffer, &old_buffer); - VLOG(3) << "Moved value " << value.ToShortString() << " from buffer " - << old_buffer.id() << " into buffer " << buffer->id(); - old_buffer.RemoveValue(value); - if (old_buffer.values().empty()) { - VLOG(3) << "Buffer " << old_buffer.id() << " now empty. Removing."; - buffers_.erase(old_buffer.id()); - buffers_vector_.clear(); - } - - buffer->AddValue(value); - value_to_buffer_[&value] = buffer; -} - string HloAliasAnalysis::ToString() const { string out = StrCat("HloAliasAnalysis, module ", module_->name(), "\n"); StrAppend(&out, " Buffers at each position:\n"); @@ -290,10 +398,10 @@ string HloAliasAnalysis::ToString() const { } StrAppend(&out, " Buffers:\n"); - for (const HloBuffer* buffer : buffers()) { - StrAppend(&out, " ", buffer->ToString(), "\n"); + for (const HloBuffer& buffer : buffers()) { + StrAppend(&out, " ", buffer.ToString(), "\n"); StrAppend(&out, " positions:\n"); - for (const HloPosition& position : buffer->ComputePositions()) { + for (const HloPosition& position : buffer.ComputePositions()) { StrAppend(&out, " ", position.ToString(), "\n"); } } @@ -301,217 +409,6 @@ string HloAliasAnalysis::ToString() const { return out; } -const std::vector& HloAliasAnalysis::buffers() const { - if (buffers_vector_.empty()) { - // Lazily construct vector of buffers. 
- buffers_vector_.reserve(buffers_.size()); - for (auto& pair : buffers_) { - buffers_vector_.push_back(&pair.second); - } - std::sort(buffers_vector_.begin(), buffers_vector_.end(), - HloBuffer::IdLessThan); - } else { - CHECK_EQ(buffers_vector_.size(), buffers_.size()); - for (const HloBuffer* buffer : buffers_vector_) { - DCHECK(ContainsKey(buffers_, buffer->id())); - DCHECK(&GetBuffer(buffer->id()) == buffer); - } - } - return buffers_vector_; -} - -void HloAliasAnalysis::UpdateAtInstructions( - tensorflow::gtl::ArraySlice instructions) { - VLOG(4) << "Updated HLO module:"; - XLA_VLOG_LINES(4, module_->ToString()); - - VLOG(3) << "Before update:"; - XLA_VLOG_LINES(3, ToString()); - - std::vector values_to_update; - for (const HloInstruction* instruction : instructions) { - for (auto& pair : dataflow_analysis().GetInstructionValueSet(instruction)) { - for (const HloValue* value : pair.second.values()) { - values_to_update.push_back(value); - } - } - } - - UpdateBuffersForValues(values_to_update); - - VLOG(3) << "After update:"; - XLA_VLOG_LINES(3, ToString()); -} - -void HloAliasAnalysis::UpdateAfterChangingOperand(HloInstruction* instruction, - HloInstruction* old_operand, - HloInstruction* new_operand) { - VLOG(1) << "UpdateAfterChangingOperand(" << instruction->name() << ", " - << old_operand->name() << " => " << new_operand->name() << ")"; - - dataflow_analysis_->UpdateAfterChangingOperand(instruction, old_operand, - new_operand); - TF_DCHECK_OK(dataflow_analysis_->VerifyAgainstReference()); - - VLOG(4) << "Updated dataflow:"; - XLA_VLOG_LINES(4, dataflow_analysis_->ToString()); - - UpdateAtInstructions({instruction, old_operand, new_operand}); -} - -void HloAliasAnalysis::UpdateAfterChangingRoot(HloInstruction* old_root, - HloInstruction* new_root) { - VLOG(1) << "UpdateAfterChangingRoot(" << old_root->name() << " => " - << new_root->name() << ")"; - - dataflow_analysis_->UpdateAfterChangingRoot(old_root, new_root); - TF_DCHECK_OK(dataflow_analysis_->VerifyAgainstReference()); - - VLOG(4) << "Updated dataflow:"; - XLA_VLOG_LINES(4, dataflow_analysis_->ToString()); - - UpdateAtInstructions({old_root, new_root}); -} - -std::vector HloAliasAnalysis::ComputeAliasedBuffers( - const HloValue& value) { - std::vector aliased_buffers; - - // Value is init of a while (use is while). - for (const HloUse& use : value.uses()) { - VLOG(1) << "use of value " << value.ToShortString() << ": " << use; - if (use.instruction->opcode() == HloOpcode::kWhile) { - // Determine the while value that this shares a buffer with. - const HloValue& while_value = dataflow_analysis().GetUniqueValueAt( - use.instruction, use.operand_index); - aliased_buffers.push_back(&GetBufferContainingValue(while_value)); - VLOG(3) << " value is init value to a while; must share buffer with " - "while value " - << while_value.ToShortString(); - } - } - - // Value is a parameter of a while body/condition. - if (value.defining_instruction()->opcode() == HloOpcode::kParameter) { - const HloComputation* computation = value.defining_instruction()->parent(); - const CallGraphNode& call_graph_node = - dataflow_analysis().call_graph().GetNode(computation); - for (const CallSite& callsite : call_graph_node.caller_callsites()) { - if (callsite.instruction()->opcode() == HloOpcode::kWhile) { - // Call graph must have been flattened. 
- CHECK_EQ(call_graph_node.caller_callsites().size(), 1); - - const HloValue& while_value = dataflow_analysis().GetUniqueValueAt( - callsite.instruction(), value.defining_index()); - VLOG(3) << " value is parameter value of the body or condition of a " - "while; must share buffer with while value " - << while_value.ToShortString(); - aliased_buffers.push_back(&GetBufferContainingValue(while_value)); - } - } - } - - // Value is the root of a while body. - for (const HloPosition& position : value.positions()) { - const HloComputation* computation = position.instruction->parent(); - const CallGraphNode& call_graph_node = - dataflow_analysis().call_graph().GetNode(computation); - if (position.instruction == computation->root_instruction()) { - for (const CallSite& callsite : call_graph_node.caller_callsites()) { - if (callsite.instruction()->opcode() == HloOpcode::kWhile && - callsite.instruction()->while_body() == computation) { - // Call graph must have been flattened. - CHECK_EQ(call_graph_node.caller_callsites().size(), 1); - - // If the value appears in the root of a while body, then - // necessarily the value is defined in the body as well. - CHECK_EQ(value.defining_instruction()->parent(), computation); - - const HloValue& while_value = dataflow_analysis().GetUniqueValueAt( - callsite.instruction(), position.index); - VLOG(3) << " value is root the body computation of a while; must " - "share buffer with while value " - << while_value.ToShortString(); - aliased_buffers.push_back(&GetBufferContainingValue(while_value)); - } - } - } - } - - // Value is in the while instruction itself. - if (value.defining_instruction()->opcode() == HloOpcode::kWhile) { - VLOG(3) << " value is output of a while instruction"; - aliased_buffers.push_back(&GetUniqueBufferAt(value.defining_instruction(), - value.defining_index())); - } - - // Uniquify aliased buffers. - std::sort(aliased_buffers.begin(), aliased_buffers.end(), - HloBuffer::IdLessThan); - aliased_buffers.erase( - std::unique(aliased_buffers.begin(), aliased_buffers.end()), - aliased_buffers.end()); - - return aliased_buffers; -} - -// This method recomputes the HloBuffer for each of the given HloValues. The -// method does not necessarily update the HloBuffer of values which share a -// buffer with the given values, but are not explicitly passed in -// 'values'. Therefore, the caller must pass in all values which may require an -// update according to the kind of HLO graph change which occurred: operand -// changed (UpdateAfterChangingOperand), or root of computation changed -// (UpdateAfterChangingRoot). -void HloAliasAnalysis::UpdateBuffersForValues( - tensorflow::gtl::ArraySlice values) { - for (const HloValue* value : values) { - VLOG(3) << "Updating buffer for value: " << value->ToShortString(); - - // Gather the set of buffer with aliasing rules (eg, kWhile) which this - // value must be contained in due. - std::vector aliased_buffers = ComputeAliasedBuffers(*value); - - HloBuffer& current_buffer = GetBufferContainingValue(*value); - if (aliased_buffers.empty()) { - // The buffer containing 'value' aliases no other buffers. If the buffer - // containing 'value' already only contains 'value', then no change is - // necessary. 
If the buffer containing 'value' does contain other values, - // then remove 'value' from the buffer and create a new buffer containing - // only 'value' - if (current_buffer.values().size() == 1) { - CHECK_EQ(current_buffer.values()[0], value); - } else { - MoveValueToNewBuffer(*value); - } - } else { - // If multiple buffers are aliased merge these buffers together into a - // single buffer (arbitrarily chosen as the first buffer in the vector). - if (aliased_buffers.size() > 1) { - for (int64 i = 1; i < aliased_buffers.size(); ++i) { - // Make copy of values vector because MoveValueToBuffer invalidates - // the values iterator. The could be done more efficiently by moving - // all values and once. - std::vector values = aliased_buffers[i]->values(); - for (const HloValue* value : values) { - MoveValueToBuffer(*value, aliased_buffers[0]); - } - } - aliased_buffers.resize(1); - } - - CHECK_EQ(aliased_buffers.size(), 1); - HloBuffer* new_buffer = aliased_buffers[0]; - - if (¤t_buffer != new_buffer) { - MoveValueToBuffer(*value, new_buffer); - } - } - - VLOG(4) << "Analysis after update:"; - XLA_VLOG_LINES(4, ToString()); - } -} - /* static */ StatusOr> HloAliasAnalysis::Run( HloModule* module) { @@ -524,18 +421,28 @@ StatusOr> HloAliasAnalysis::Run( HloDataflowAnalysis::Run(module, /*ssa_form=*/true, /*bitcast_defines_value=*/false)); - alias_analysis->InitializeBufferSets(); + BufferValueMap buffer_map(alias_analysis->dataflow_analysis()); + buffer_map.MergeAliasedBuffers(); - VLOG(3) << "After initialization:"; - XLA_VLOG_LINES(3, alias_analysis->ToString()); - - std::vector all_values; - for (const HloValue& value : alias_analysis->dataflow_analysis().values()) { - all_values.push_back(&value); + // Create a vector of HloBuffers, one for each set of values in the + // BufferValueMap. Create the HloBuffers as a vector of contiguously numbered + // buffers. + std::vector sorted_buffer_numbers = + buffer_map.ComputeSortedBufferNumbers(); + alias_analysis->buffers_.reserve(sorted_buffer_numbers.size()); + HloBuffer::Id next_id = 0; + for (BufferValueMap::BufferNumber buffer_number : sorted_buffer_numbers) { + auto& value_set = buffer_map.GetValuesInBuffer(buffer_number); + std::vector sorted_values(value_set.begin(), + value_set.end()); + std::sort(sorted_values.begin(), sorted_values.end(), HloValue::IdLessThan); + alias_analysis->buffers_.emplace_back(next_id++, sorted_values); + for (const HloValue* value : sorted_values) { + alias_analysis->value_to_buffer_[value] = + &alias_analysis->buffers_.back(); + } } - alias_analysis->UpdateBuffersForValues(all_values); - TF_DCHECK_OK(alias_analysis->Verify()); XLA_VLOG_LINES(1, alias_analysis->ToString()); diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h index 1b538f6d1cf..39554e46648 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h @@ -74,7 +74,7 @@ class HloAliasAnalysis { // Return a vector of all HloBuffers stabily sorted by HloBuffer::Id. This // vector is lazily computed. Mutating operations on HloAliasAnalysis may // invalidate the underlying vector requiring recomputation. - const std::vector& buffers() const; + const std::vector& buffers() const { return buffers_; } // Returns the underlying dataflow analysis used by this alias analysis. const HloDataflowAnalysis& dataflow_analysis() const { @@ -90,50 +90,13 @@ class HloAliasAnalysis { // output of the given instruction. 
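A short illustration of the rewritten Run() above: after MergeAliasedBuffers finishes, the surviving buffer numbers (which may be sparse) are sorted and renumbered into dense, contiguous HloBuffer ids, and each value is pointed at the buffer that now owns it. The stand-alone sketch below imitates that renumbering with ordinary containers and made-up value names; it is not the real XLA code.

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Sparse buffer numbers -> values, as a BufferValueMap-style partition
  // might leave them after merging (numbers in between were consumed).
  std::map<int64_t, std::vector<std::string>> partition = {
      {0, {"constant1", "while{0}"}},
      {3, {"constant2"}},
      {7, {"add"}},
  };

  // Renumber into dense, contiguous buffer ids in sorted-number order.
  struct Buffer {
    int64_t id;
    std::vector<std::string> values;
  };
  std::vector<Buffer> buffers;
  std::map<std::string, int64_t> value_to_buffer;
  int64_t next_id = 0;
  for (const auto& [number, values] : partition) {  // std::map iterates sorted
    buffers.push_back({next_id, values});
    for (const auto& v : values) value_to_buffer[v] = next_id;
    ++next_id;
  }

  for (const auto& b : buffers) {
    std::printf("buffer %lld holds %zu value(s)\n",
                static_cast<long long>(b.id), b.values.size());
  }
  std::printf("'constant2' lives in buffer %lld\n",
              static_cast<long long>(value_to_buffer["constant2"]));
  return 0;
}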
bool InstructionBuffersAreDistinct(const HloInstruction* instruction) const; - // Updates the analysis after the operands of 'instruction' have changed or if - // 'instruction' has been made the root of a computation. Analysis update is - // not possible if instructions have been added or removed from the graph. - void UpdateAfterChangingOperand(HloInstruction* instruction, - HloInstruction* old_operand, - HloInstruction* new_operand); - void UpdateAfterChangingRoot(HloInstruction* old_root, - HloInstruction* new_root); - // Compare the dataflow analysis against a clean recomputation of the // analysis. Returns an error status if there is a mismatch. Useful for // verifying the correctness after updates to the analysis. Status VerifyAgainstReference() const; protected: - HloAliasAnalysis(HloModule* module); - - // Create a new empty HloBuffer. - HloBuffer& NewHloBuffer(); - - // Move the given value to the given buffer. The value is removed from it's - // current buffer. - void MoveValueToBuffer(const HloValue& value, HloBuffer* buffer); - - // Move the given value to a newly created buffer. The value is removed from - // it's current buffer. - void MoveValueToNewBuffer(const HloValue& value); - - // Construct the initial set of buffer sets where an HloBuffer is created for - // each HloValue in the module. - void InitializeBufferSets(); - - // Compute and return the buffers with aliasing rules (eg, kWhile) which the - // given value must be contained in. - std::vector ComputeAliasedBuffers(const HloValue& value); - - // Recompute the HloBuffers for the given values. - void UpdateBuffersForValues( - tensorflow::gtl::ArraySlice values); - - // Recompute the HloBuffers for all the values which appear in the output of - // the given instructions. - void UpdateAtInstructions( - tensorflow::gtl::ArraySlice instructions); + explicit HloAliasAnalysis(HloModule* module); // Verify various invariants of the alias analysis. Status Verify() const; @@ -143,20 +106,12 @@ class HloAliasAnalysis { // The underlying dataflow analysis used by this alias analysis. std::unique_ptr dataflow_analysis_; - // The map of all HloBuffers in the module. We pass around pointers to the - // mapped HloBuffers, so the underlying container must keep them valid despite - // mutations touching other map entries. - std::unordered_map buffers_; - // A map indicating which buffer a value is contained in. tensorflow::gtl::FlatMap value_to_buffer_; // A lazily constructed vector containing all HloBuffers sorted by // HloBuffer::Id. - mutable std::vector buffers_vector_; - - // The Id to use for the next HloBuffer. - int64 next_buffer_id_ = 0; + std::vector buffers_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc index e2815d6e648..6e311e25fb9 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc @@ -87,14 +87,13 @@ class HloAliasAnalysisTest : public HloTestBase { // constructed. 
bool AnyValuesInSameBufferInterfere() { DependencyHloOrdering ordering(module_.get()); - for (const HloBuffer* buffer : analysis_->buffers()) { - for (const HloValue* value_a : buffer->values()) { - for (const HloValue* value_b : buffer->values()) { + for (const HloBuffer& buffer : analysis_->buffers()) { + for (const HloValue* value_a : buffer.values()) { + for (const HloValue* value_b : buffer.values()) { if (*value_a != *value_b && - analysis_->dataflow_analysis().MayInterfere(*value_a, *value_b, - ordering)) { + ordering.MayInterfere(*value_a, *value_b)) { VLOG(1) << *value_a << " interferes with " << *value_b - << " in buffer: " << *buffer; + << " in buffer: " << buffer; return true; } } @@ -384,10 +383,7 @@ TEST_F(HloAliasAnalysisTest, SingleWhile) { EXPECT_THAT( GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{0})), - UnorderedElementsAre(GetValueDefinedAt(xla_while, /*index=*/{0}), - GetValueDefinedAt(body_param, /*index=*/{0}), - GetValueDefinedAt(cond_param, /*index=*/{0}), - GetValueDefinedAt(constant1))); + UnorderedElementsAre(GetValueDefinedAt(constant1))); EXPECT_THAT( GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{1})), UnorderedElementsAre(GetValueDefinedAt(constant2), @@ -631,9 +627,9 @@ TEST_F(HloAliasAnalysisTest, SwizzlingWhile) { // HloBuffers. EXPECT_THAT( analysis.buffers(), - UnorderedElementsAre(&analysis.GetUniqueBufferAt(constant1), - &analysis.GetUniqueBufferAt(tuple, /*index=*/{}), - &analysis.GetUniqueBufferAt(cond_constant))); + UnorderedElementsAre(analysis.GetUniqueBufferAt(constant1), + analysis.GetUniqueBufferAt(tuple, /*index=*/{}), + analysis.GetUniqueBufferAt(cond_constant))); // The tuple elements of the while and the three constant inputs should all be // smooshed into the same buffer. @@ -820,127 +816,5 @@ TEST_F(HloAliasAnalysisTest, Bitcast) { analysis.GetUniqueBufferAt(bitcast)); } -TEST_F(HloAliasAnalysisTest, UpdateAnalysisForWhile) { - // Test updating alias analysis after modifying a module with an array shaped - // while: - // - // body(F32[] %param): - // %negate = Negate(%param) - // - // condition(F32[] %param): - // return Constant(false) - // - // entry: - // %constant = Constant(1.0) - // %exp = Exp(%constant) - // return While(%exp, body, condition) - // - auto body_builder = HloComputation::Builder("body"); - auto body_param = body_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kNegate, body_param)); - HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build()); - - // Condition computation trivially returns a constant "false". 
- auto cond_builder = HloComputation::Builder("condition"); - auto cond_param = cond_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); - HloComputation* condition = - module_->AddEmbeddedComputation(cond_builder.Build()); - - auto builder = HloComputation::Builder(TestName()); - auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto exp = builder.AddInstruction( - HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kExp, constant)); - auto xla_while = builder.AddInstruction( - HloInstruction::CreateWhile(scalar_shape_, condition, body, exp)); - module_->AddEntryComputation(builder.Build()); - - HloAliasAnalysis& analysis = RunAnalysis(); - - // Sanity check some alias information. - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(body_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(cond_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(negate)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(xla_while)); - - // Set the body root to the body_param. Previously it was Negate(body_param). - body->set_root_instruction(body_param); - - // Prior to updating, verify that the analysis is no longer valid. - Status verify_status = analysis.VerifyAgainstReference(); - EXPECT_FALSE(verify_status.ok()); - - analysis.UpdateAfterChangingRoot(/*old_root=*/negate, - /*new_root*/ body_param); - - // Analysis should be valid after the update. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - // The exponential should now pass through the body transparently. - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(body_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(cond_param)); - EXPECT_NE(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(negate)); - EXPECT_EQ(analysis.GetUniqueBufferAt(exp), - analysis.GetUniqueBufferAt(xla_while)); - - // Now replace the operand of the while with %constant (was %exp). - TF_ASSERT_OK(exp->ReplaceUseWith(xla_while, constant)); - analysis.UpdateAfterChangingOperand(xla_while, /*old_operand=*/exp, - /*new_operand=*/constant); - - // Analysis should be valid after the update. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - EXPECT_EQ(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(body_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(cond_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(xla_while)); - EXPECT_NE(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(exp)); - EXPECT_NE(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(negate)); - - // And finally make the negate the root of the body again. - body->set_root_instruction(negate); - analysis.UpdateAfterChangingRoot(/*old_root=*/body_param, - /*new_root*/ negate); - - // Analysis should be valid after the update. 
- TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - EXPECT_EQ(analysis.GetUniqueBufferAt(negate), - analysis.GetUniqueBufferAt(body_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(negate), - analysis.GetUniqueBufferAt(cond_param)); - EXPECT_EQ(analysis.GetUniqueBufferAt(negate), - analysis.GetUniqueBufferAt(xla_while)); - EXPECT_EQ(analysis.GetUniqueBufferAt(constant), - analysis.GetUniqueBufferAt(negate)); - - auto value_of = [&analysis](const HloInstruction* instruction) { - return &analysis.dataflow_analysis().GetValueDefinedAt(instruction); - }; - EXPECT_THAT(analysis.GetUniqueBufferAt(negate).values(), - UnorderedElementsAre(value_of(body_param), value_of(cond_param), - value_of(negate), value_of(constant), - value_of(xla_while))); -} - -// Test update tuple element. - } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc index 2bfdd9156ad..e16413f361f 100644 --- a/tensorflow/compiler/xla/service/hlo_buffer.cc +++ b/tensorflow/compiler/xla/service/hlo_buffer.cc @@ -36,22 +36,6 @@ namespace xla { using ::tensorflow::str_util::Join; using ::tensorflow::strings::StrCat; -void HloBuffer::AddValue(const HloValue& value) { - values_.push_back(&value); - // Sort vector and remove duplicates. - std::sort(values_.begin(), values_.end(), HloValue::IdLessThan); - values_.erase(std::unique(values_.begin(), values_.end(), HloValue::IdEqual), - values_.end()); -} - -void HloBuffer::RemoveValue(const HloValue& value) { - // The values are sorted, so finding the value could be done in log(n) time - // with a binary search. - auto it = std::find(values_.begin(), values_.end(), &value); - CHECK(it != values_.end()); - values_.erase(it); -} - bool HloBuffer::operator==(const HloBuffer& other) const { bool equal = id() == other.id(); if (equal) { diff --git a/tensorflow/compiler/xla/service/hlo_buffer.h b/tensorflow/compiler/xla/service/hlo_buffer.h index cb961e1601c..4873463b2ea 100644 --- a/tensorflow/compiler/xla/service/hlo_buffer.h +++ b/tensorflow/compiler/xla/service/hlo_buffer.h @@ -84,22 +84,15 @@ class HloBuffer { return a->id() == b->id(); } - HloBuffer(Id id) : id_(id) {} + HloBuffer(Id id, tensorflow::gtl::ArraySlice values) + : id_(id), values_(values.begin(), values.end()) {} // Return the unique identifier for this HloBuffer. Id id() const { return id_; } - // Add a value to the set of values held by this buffer. Also adds the - // HloPositions of the value to the positions vector of the buffer. If the - // buffer already contains this value, then this method is a nop. - void AddValue(const HloValue& value); - void RemoveValue(const HloValue& value); - // Return all values contained in this buffer. const std::vector& values() const { return values_; } - std::vector ComputePositions() const; - // Return the unique HLO value in the buffer. CHECK fails if the buffer does // not contain exactly one value. const HloValue& GetUniqueValue() const { @@ -107,6 +100,8 @@ class HloBuffer { return *values_[0]; } + std::vector ComputePositions() const; + string ToString() const; bool operator==(const HloBuffer& other) const; @@ -118,7 +113,7 @@ class HloBuffer { // The set of values contained in this buffer. Vector contains no duplicates // and is sorted stably by HloValue::Id. 
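On the hlo_buffer.h change above: AddValue/RemoveValue disappear and an HloBuffer is now built exactly once from an already-sorted list of values, which is what allows the values_ member to become const. A rough stand-alone sketch of that shape, using simplified stand-in types rather than HloValue and ArraySlice.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

struct FakeValue {  // stand-in for HloValue
  int64_t id;
};

class Buffer {  // stand-in for the reworked, immutable HloBuffer
 public:
  using Id = int64_t;

  // Values arrive fully formed and sorted by id; no mutation afterwards.
  Buffer(Id id, std::vector<const FakeValue*> values)
      : id_(id), values_(std::move(values)) {}

  Id id() const { return id_; }
  const std::vector<const FakeValue*>& values() const { return values_; }

 private:
  const Id id_;
  const std::vector<const FakeValue*> values_;  // fixed at construction
};

int main() {
  FakeValue a{1}, b{4};
  Buffer buffer(/*id=*/0, {&a, &b});
  std::printf("buffer %lld holds %zu value(s)\n",
              static_cast<long long>(buffer.id()), buffer.values().size());
  return 0;
}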
- std::vector values_; + const std::vector values_; }; std::ostream& operator<<(std::ostream& out, const HloBuffer& buffer); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 9dbde0ec243..f6b764732b4 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -118,13 +118,11 @@ Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) { } } -Status HloCostAnalysis::HandleElementwiseUnary(HloInstruction* hlo, - HloOpcode opcode) { +Status HloCostAnalysis::HandleElementwiseUnary(HloInstruction* hlo) { return HandleElementwiseOp(hlo); } -Status HloCostAnalysis::HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) { +Status HloCostAnalysis::HandleElementwiseBinary(HloInstruction* hlo) { return HandleElementwiseOp(hlo); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 6d8fdfa64b5..eeb3d4edd1b 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -49,9 +49,8 @@ class HloCostAnalysis : public DfsHloVisitor { using ShapeSizeFunction = std::function; explicit HloCostAnalysis(const ShapeSizeFunction& shape_size); - Status HandleElementwiseUnary(HloInstruction* hlo, HloOpcode opcode) override; - Status HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) override; + Status HandleElementwiseUnary(HloInstruction* hlo) override; + Status HandleElementwiseBinary(HloInstruction* hlo) override; Status HandleConstant(HloInstruction* constant, const Literal& literal) override; Status HandleGetTupleElement(HloInstruction* get_tuple_element, diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index ea8b239e100..2be1645f1b0 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -67,6 +67,22 @@ HloValue& HloDataflowAnalysis::GetValueDefinedAt( return GetUniqueValueAt(instruction, index); } +HloValue* HloDataflowAnalysis::NewHloValue(HloInstruction* instruction, + const ShapeIndex& index, + bool is_phi) { + const int64 value_id = next_value_id_++; + auto emplaced = values_.emplace( + std::piecewise_construct, std::forward_as_tuple(value_id), + std::forward_as_tuple(value_id, instruction, index, is_phi)); + CHECK(emplaced.second); + + return &emplaced.first->second; +} + +void HloDataflowAnalysis::DeleteHloValue(HloValue::Id value_id) { + values_.erase(value_id); +} + string HloDataflowAnalysis::ToString() const { string out = StrCat("HloDataflowAnalysis, module ", module_->name(), "\n"); StrAppend(&out, " Instruction value sets:\n"); @@ -99,22 +115,98 @@ string HloDataflowAnalysis::ToString() const { } } StrAppend(&out, " HloValues:\n"); - for (const HloValue& value : values()) { - StrAppend(&out, value.ToString(/*indent=*/4)); - } - StrAppend(&out, " Phi resolutions:\n"); - for (const HloValue& value : values()) { - if (value.is_phi()) { - const HloValue* resolved_value = ResolvePhi(value); - StrAppend(&out, " ", value.ToShortString(), " => ", - resolved_value == nullptr ? 
"UNKNOWN" - : resolved_value->ToShortString(), - "\n"); - } + for (const HloValue* value : values()) { + StrAppend(&out, value->ToString(/*indent=*/4)); } return out; } +bool HloDataflowAnalysis::Phi( + HloInstruction* instruction, + tensorflow::gtl::ArraySlice inputs) { + CHECK(ssa_form_); + + for (const InstructionValueSet* input : inputs) { + DCHECK(ShapeUtil::Compatible(instruction->shape(), input->shape())); + } + + bool changed = false; + for (auto& pair : GetInstructionValueSet(instruction)) { + const ShapeIndex& index = pair.first; + HloValueSet& value_set = pair.second; + + // Positions with phi values should never have more than one value in the + // value set. + CHECK_LE(value_set.values().size(), 1); + const HloValue* current_value = + value_set.values().size() == 1 ? value_set.values()[0] : nullptr; + + // Construct a vector of unique value IDs of the inputs. + std::vector input_value_ids; + for (const InstructionValueSet* input : inputs) { + for (const HloValue* value : input->element(index).values()) { + input_value_ids.push_back(value->id()); + } + } + std::sort(input_value_ids.begin(), input_value_ids.end()); + input_value_ids.erase( + std::unique(input_value_ids.begin(), input_value_ids.end()), + input_value_ids.end()); + + // Remove the existing phi value (if it exists). The phi can be its own + // input, for example, in while body parameters where the body passes + // through the parameter value. + bool current_value_defined_here = + (current_value != nullptr && + current_value->defining_instruction() == instruction && + current_value->defining_index() == index); + if (current_value_defined_here) { + CHECK(current_value->is_phi()); + auto it = std::find(input_value_ids.begin(), input_value_ids.end(), + current_value->id()); + if (it != input_value_ids.end()) { + input_value_ids.erase(it); + } + } + + if (input_value_ids.empty()) { + // A value set which has at least one element should never have its value + // set reduced to zero elements. During dataflow value sets only can go + // from empty to non-empty, not the reverse. + CHECK_EQ(value_set.values().size(), 0) + << "Instruction " << instruction->name() << " at index " << index + << " previously had non-empty value set. Value set: " << value_set; + } else if (input_value_ids.size() == 1) { + // Only a single value reaches this point. There should be no phi, and + // this value set should contain this single value. + const HloValue& new_value = GetValue(input_value_ids[0]); + if (current_value == nullptr) { + value_set.Clear(); + value_set.AddValue(&new_value); + changed = true; + } else if (current_value != &new_value) { + if (current_value_defined_here) { + // Remove the existing phi. + DeleteHloValue(current_value->id()); + } + value_set.Clear(); + value_set.AddValue(&new_value); + changed = true; + } + } else { + // Multiple distinct values reach this point. A phi value is + // necessary. 
+ CHECK_GT(input_value_ids.size(), 1); + if (current_value == nullptr || !current_value->is_phi()) { + value_set.Clear(); + value_set.AddValue(NewHloValue(instruction, index, /*is_phi=*/true)); + changed = true; + } + } + } + return changed; +} + const HloValue& HloDataflowAnalysis::GetValue(HloValue::Id value_id) const { return values_.at(value_id); } @@ -142,129 +234,6 @@ HloValueSet& HloDataflowAnalysis::GetValueSet(const HloPosition& position) { return GetValueSet(position.instruction, position.index); } -void HloDataflowAnalysis::UpdateAfterChangingOperand( - HloInstruction* instruction, HloInstruction* old_operand, - HloInstruction* new_operand) { - CHECK(std::find(instruction->operands().begin(), - instruction->operands().end(), - new_operand) != instruction->operands().end()); - VLOG(1) << "UpdateAfterChangingOperand(" << instruction->name() << ", " - << old_operand->name() << " => " << new_operand->name() << ")"; - - std::vector to_update = {instruction}; - - // If the instruction calls any computations then add the parameters of called - // computation to capture any changes to the dataflow into the subcomputation - // introduced by the new operand. - for (HloComputation* computation : instruction->called_computations()) { - to_update.insert(to_update.end(), - computation->parameter_instructions().begin(), - computation->parameter_instructions().end()); - } - - UpdateInstructionsAndPropagate(to_update); - - // The uses of the values in the old and new operand may have changed. Uses of - // other HloValues are updated in UpdateInstructionsAndPropagate. - for (auto& pair : GetInstructionValueSet(old_operand)) { - for (const HloValue* value : pair.second.values()) { - GetValue(value->id()).RecomputeUses(); - } - } - for (auto& pair : GetInstructionValueSet(new_operand)) { - for (const HloValue* value : pair.second.values()) { - GetValue(value->id()).RecomputeUses(); - } - } - - TF_DCHECK_OK(VerifyAgainstReference()); -} - -void HloDataflowAnalysis::UpdateAfterChangingRoot(HloInstruction* old_root, - HloInstruction* new_root) { - VLOG(1) << "UpdateAfterChangingRoot(" << old_root->name() << " => " - << new_root->name() << ")"; - - CHECK_EQ(new_root, new_root->parent()->root_instruction()); - CHECK_EQ(new_root->parent(), old_root->parent()); - - std::vector to_update = {old_root, new_root}; - - const CallGraphNode& call_graph_node = - call_graph_->GetNode(new_root->parent()); - for (const CallSite& callsite : call_graph_node.caller_callsites()) { - if (callsite.instruction()->opcode() == HloOpcode::kCall) { - to_update.push_back(callsite.instruction()); - } else if (callsite.instruction()->opcode() == HloOpcode::kWhile) { - // Add the while itself, and the body and condition parameters. - to_update.push_back(callsite.instruction()); - to_update.push_back( - callsite.instruction()->while_body()->parameter_instruction(0)); - to_update.push_back( - callsite.instruction()->while_condition()->parameter_instruction(0)); - } - } - - UpdateInstructionsAndPropagate(to_update); - - TF_DCHECK_OK(VerifyAgainstReference()); -} - -const HloValue* HloDataflowAnalysis::ResolvePhi(const HloValue& phi) const { - CHECK(phi.is_phi()); - - tensorflow::gtl::FlatSet visited; - std::queue worklist; - auto add_to_worklist = [&worklist, &visited](const HloValue* v) { - if (visited.insert(v).second) { - // 'v' was not previously in visited. 
- worklist.push(v); - } - }; - add_to_worklist(&phi); - - const HloValue* resolved_value = nullptr; - while (!worklist.empty()) { - const HloValue* value = worklist.front(); - worklist.pop(); - - if (!value->is_phi()) { - if (resolved_value == nullptr) { - resolved_value = value; - } else if (resolved_value != value) { - return nullptr; - } - } else { - for (const HloValue* input : phi_inputs_.at(value)) { - add_to_worklist(input); - } - } - } - return resolved_value; -} - -void HloDataflowAnalysis::UpdatePhiInputs( - const HloInstruction* instruction, - tensorflow::gtl::ArraySlice inputs) { - CHECK(ssa_form_); - for (auto& pair : GetInstructionValueSet(instruction)) { - const ShapeIndex& index = pair.first; - const HloValue& phi_value = GetUniqueValueAt(instruction, index); - auto& phi_inputs = phi_inputs_.at(&phi_value); - phi_inputs.clear(); - for (const InstructionValueSet* input : inputs) { - for (const HloValue* value : input->element(index).values()) { - // The number of phi inputs is typically 2, and virtually always very - // small. - if (std::find(phi_inputs.begin(), phi_inputs.end(), value) == - phi_inputs.end()) { - phi_inputs.push_back(value); - } - } - } - } -} - bool HloDataflowAnalysis::UpdateBitcastValueSet(HloInstruction* bitcast) { CHECK_EQ(bitcast->opcode(), HloOpcode::kBitcast); const InstructionValueSet& operand_set = @@ -380,8 +349,7 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) { } if (ssa_form_ && called_from_while) { - UpdatePhiInputs(parameter, inputs); - return false; + return Phi(parameter, inputs); } else { return GetInstructionValueSet(parameter).AssignUnionOf(inputs); } @@ -439,8 +407,7 @@ bool HloDataflowAnalysis::UpdateWhileValueSet(HloInstruction* xla_while) { &GetInstructionValueSet(xla_while->while_body()->root_instruction()), &GetInstructionValueSet(xla_while->operand(0))}; if (ssa_form_) { - UpdatePhiInputs(xla_while, inputs); - return false; + return Phi(xla_while, inputs); } else { return GetInstructionValueSet(xla_while).AssignUnionOf(inputs); } @@ -487,38 +454,7 @@ void HloDataflowAnalysis::UpdateInstructionsAndPropagate( VLOG(3) << "Worklist top: " << instruction->name(); VLOG(3) << ToString(); - // The updating of the instruction value set below in - // UpdateInstructionValueSet does not update HloValue::positions(). To - // perform the positions() update remove all positions in 'instruction' from - // the HloValues in 'instruction's value set prior to the update, then after - // the update add the new positions back in. There is likely a more - // efficient way of doing this. - for (auto& pair : GetInstructionValueSet(instruction)) { - const ShapeIndex& index = pair.first; - HloValueSet& value_set = pair.second; - for (const HloValue* value : value_set.values()) { - if (value->defining_instruction() != instruction) { - // Use GetValue for a non-const HloValue reference. - GetValue(value->id()).RemovePosition(instruction, index); - } - } - } - - bool changed = UpdateInstructionValueSet(instruction); - - // Add the positions back in. - for (auto& pair : GetInstructionValueSet(instruction)) { - const ShapeIndex& index = pair.first; - HloValueSet& value_set = pair.second; - for (const HloValue* value : value_set.values()) { - if (value->defining_instruction() != instruction) { - // Use GetValue for a non-const HloValue reference. - GetValue(value->id()).AddPosition(instruction, index); - } - } - } - - if (!changed) { + if (!UpdateInstructionValueSet(instruction)) { // No change to the instruction's value set. 
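An aside on the Phi() routine above, which UpdateParameterValueSet and UpdateWhileValueSet now call directly instead of UpdatePhiInputs: for each shape index it deduplicates the ids of the values flowing in, removes the position's own phi from that input list (a phi can legitimately feed itself, e.g. a while body that passes its parameter straight through), and then either leaves the set empty, forwards the single remaining value, or installs a phi when more than one distinct value arrives. The toy function below exercises just that decision over integer value ids; the names are hypothetical.

#include <algorithm>
#include <cstdio>
#include <optional>
#include <vector>

enum class PhiDecision { kKeepEmpty, kForwardSingleValue, kNeedPhi };

// Decide what a phi position should hold, given the value ids reaching it.
PhiDecision DecidePhi(std::vector<int> input_value_ids,
                      std::optional<int> existing_phi_id) {
  std::sort(input_value_ids.begin(), input_value_ids.end());
  input_value_ids.erase(
      std::unique(input_value_ids.begin(), input_value_ids.end()),
      input_value_ids.end());
  // The position's own phi does not count as a distinct input.
  if (existing_phi_id) {
    auto it = std::find(input_value_ids.begin(), input_value_ids.end(),
                        *existing_phi_id);
    if (it != input_value_ids.end()) input_value_ids.erase(it);
  }
  if (input_value_ids.empty()) return PhiDecision::kKeepEmpty;
  if (input_value_ids.size() == 1) return PhiDecision::kForwardSingleValue;
  return PhiDecision::kNeedPhi;
}

int main() {
  std::printf("{7, 9} with phi 9    -> %d\n",
              static_cast<int>(DecidePhi({7, 9}, 9)));   // forwards value 7
  std::printf("{7, 8} without a phi -> %d\n",
              static_cast<int>(DecidePhi({7, 8}, {})));  // phi needed
  return 0;
}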
VLOG(4) << "No change."; continue; @@ -531,12 +467,16 @@ void HloDataflowAnalysis::UpdateInstructionsAndPropagate( for (HloInstruction* user : instruction->users()) { worklist.push(user); - // If user calls a computation, then the respective parameter(s) of the - // computation need to be updated. + // If user sequentially calls a computation, then the respective + // parameter(s) of the computation need to be updated. for (HloComputation* called_computation : user->called_computations()) { - for (int64 operand_number : user->OperandIndices(instruction)) { - worklist.push( - called_computation->parameter_instruction(operand_number)); + const CallGraphNode& call_graph_node = + call_graph_->GetNode(called_computation); + if (call_graph_node.context() == CallContext::kSequential) { + for (int64 operand_number : user->OperandIndices(instruction)) { + worklist.push( + called_computation->parameter_instruction(operand_number)); + } } } } @@ -574,25 +514,10 @@ InstructionValueSet& HloDataflowAnalysis::GetInstructionValueSet( } Status HloDataflowAnalysis::InitializeInstructionValueSets() { - // Gather the values to create before creating them. This is done because we - // want to allocate the vector of values only once so references to elements - // are stable. - struct ValueToCreate { - HloInstruction* instruction; - ShapeIndex index; - bool is_phi; - }; - std::vector values_to_create; - for (const std::unique_ptr& computation : module_->computations()) { const CallGraphNode& call_graph_node = call_graph_->GetNode(computation.get()); - bool called_from_while = std::any_of( - call_graph_node.caller_callsites().begin(), - call_graph_node.caller_callsites().end(), [](const CallSite& cs) { - return cs.instruction()->opcode() == HloOpcode::kWhile; - }); for (const std::unique_ptr& instruction : computation->instructions()) { @@ -603,20 +528,22 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { // Lambda to set the value set to define all values in the output of the // instruction. - auto define_all_values = [this, &instruction, - &values_to_create](bool is_phi = false) { + auto define_all_values = [this, &instruction](bool is_phi = false) { for (auto& pair : GetInstructionValueSet(instruction.get())) { const ShapeIndex& index = pair.first; - values_to_create.push_back({instruction.get(), index, is_phi}); + HloValue* value = + NewHloValue(instruction.get(), index, /*is_phi=*/false); + GetValueSet(instruction.get(), index).AddValue(value); } }; // Lambda to set the value set to define only the top-level buffer in the // output of the instruction. Any other values flow from the operands of // the instruction (or from cross-computation dataflow). - auto define_top_level_only = [this, &instruction, &values_to_create]() { - values_to_create.push_back( - {instruction.get(), /*index=*/{}, /*is_phi=*/false}); + auto define_top_level_only = [this, &instruction]() { + HloValue* value = + NewHloValue(instruction.get(), /*index=*/{}, /*is_phi=*/false); + GetValueSet(instruction.get(), /*index=*/{}).AddValue(value); }; switch (instruction->opcode()) { @@ -626,10 +553,6 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { } break; case HloOpcode::kWhile: - if (ssa_form_) { - define_all_values(/*is_phi=*/true); - } - break; case HloOpcode::kCall: case HloOpcode::kGetTupleElement: // These instructions define no values. The values in their output @@ -654,10 +577,6 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { // values in their output. 
Otherwise the values of the parameter // come from the caller (eg, operands to the kCall instruction). define_all_values(); - } else if (call_graph_node.context() == CallContext::kSequential && - called_from_while && ssa_form_) { - // Parameters of while bodies and conditions are phis. - define_all_values(/*is_phi=*/true); } break; case HloOpcode::kCopy: @@ -674,164 +593,9 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { } } - // Reserve the vector ahead of time so references to elements are stable. - values_.reserve(values_to_create.size()); - for (int64 i = 0; i < values_to_create.size(); ++i) { - const ValueToCreate& to_create = values_to_create[i]; - values_.emplace_back(/*id=*/i, to_create.instruction, to_create.index, - to_create.is_phi); - const HloValue& value = values_.back(); - GetValueSet(to_create.instruction, to_create.index).AddValue(&value); - if (value.is_phi()) { - phi_inputs_[&value] = {}; - } - } return Status::OK(); } -bool HloDataflowAnalysis::IsDefinedBefore(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const { - // If 'b' is an entry param then 'a' cannot be defined before 'b' because 'b' - // is live into the module. - if (b.defining_instruction()->parent() == module_->entry_computation() && - b.defining_instruction()->opcode() == HloOpcode::kParameter) { - return false; - } - - // Phi values require special handling. Because XLA does not have a phi - // instruction, the definition instruction of the phis values are - // placeholders: either the subcomputation parameter (body or condition) or - // the while instruction. However, the program point where these values are - // logically defined does not necessarily coincide exactly with program point - // of these place-holder instructions. So we explicitly define the following - // order for phi values: - // - // body/condition parameter phi: - // Defined before all values defined in its computation excepting other - // phis. - // - // while phi: - // defined after all values defined in the condition or body. - // - auto is_body_or_condition_phi = [](const HloValue& v) { - return v.is_phi() && - v.defining_instruction()->opcode() == HloOpcode::kParameter; - }; - if (is_body_or_condition_phi(a) && !is_body_or_condition_phi(b) && - call_graph_->InstructionIsNestedIn(b.defining_instruction(), - a.defining_instruction()->parent())) { - return true; - } - if (is_body_or_condition_phi(b) && - call_graph_->InstructionIsNestedIn(a.defining_instruction(), - b.defining_instruction()->parent())) { - return false; - } - - // If 'b' is a while phi and 'a' is in the body or condition, then 'a' - // executes before 'b'. - if (b.is_phi() && b.defining_instruction()->opcode() == HloOpcode::kWhile && - (call_graph_->InstructionIsNestedIn( - a.defining_instruction(), b.defining_instruction()->while_body()) || - call_graph_->InstructionIsNestedIn( - a.defining_instruction(), - b.defining_instruction()->while_condition()))) { - return true; - } - - return ordering.ExecutesBefore(a.defining_instruction(), - b.defining_instruction()); -} - -bool HloDataflowAnalysis::UseIsBeforeValueDefinition( - const HloUse& use, const HloValue& value, - const HloOrdering& ordering) const { - if (ordering.ExecutesBefore(use.instruction, value.defining_instruction())) { - return true; - } - - // If the use is at the instruction where the value is defined, then the use - // is before the def if the instruction allows buffer sharing (in place - // computation). 
- if (use.instruction == value.defining_instruction() && - CanShareOperandBufferWithUser( - use.instruction->mutable_operand(use.operand_number), - use.operand_index, value.defining_instruction(), - value.defining_index())) { - return true; - } - - // The use at a while is an input to a phi, and logically occurs before values - // are defined in the body or condition computations. - if (use.instruction->opcode() == HloOpcode::kWhile) { - const HloInstruction* xla_while = use.instruction; - if (call_graph_->InstructionIsNestedIn(value.defining_instruction(), - xla_while->while_body()) || - call_graph_->InstructionIsNestedIn(value.defining_instruction(), - xla_while->while_condition())) { - return true; - } - } - - // Similarly if the value is defined at a while, it logically occurs after any - // uses in the body or condition computations. - if (value.defining_instruction()->opcode() == HloOpcode::kWhile) { - CHECK(ssa_form_); - const HloInstruction* xla_while = value.defining_instruction(); - if (call_graph_->InstructionIsNestedIn(use.instruction, - xla_while->while_body()) || - call_graph_->InstructionIsNestedIn(use.instruction, - xla_while->while_condition())) { - return true; - } - } - return false; -} - -bool HloDataflowAnalysis::LiveRangeStrictlyBefore( - const HloValue& a, const HloValue& b, const HloOrdering& ordering) const { - VLOG(4) << "LiveRangeStrictlyBefore(a = " << a.ToShortString() - << ", b = " << b.ToShortString() << ")"; - if (!IsDefinedBefore(a, b, ordering)) { - VLOG(4) << "a not defined before b"; - return false; - } - - // Live-out values from the module can never have ranges strictly before any - // other value. - if (a.live_out_of_module()) { - VLOG(4) << "a is live out of module"; - return false; - } - - // Live-out values of computations can never have ranges strictly before any - // other value in the computation (including values nested in - // subcomputations). - if (a.live_out_of_computation() && - call_graph_->InstructionIsNestedIn(b.defining_instruction(), - a.defining_instruction()->parent())) { - VLOG(4) << "a is live out of computation containing b"; - return false; - } - - // All uses of 'a' must be before 'b' is defined. - for (const HloUse& use : a.uses()) { - if (!UseIsBeforeValueDefinition(use, b, ordering)) { - VLOG(4) << "use of a (" << use << ") not before b is defined"; - return false; - } - } - - return true; -} - -bool HloDataflowAnalysis::MayInterfere(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const { - // Buffers without disjoint liveness may interfere. - return !LiveRangeStrictlyBefore(a, b, ordering) && - !LiveRangeStrictlyBefore(b, a, ordering); -} - /* static */ StatusOr> HloDataflowAnalysis::Run( HloModule* module, bool ssa_form, bool bitcast_defines_value) { @@ -855,6 +619,33 @@ StatusOr> HloDataflowAnalysis::Run( } dataflow_analysis->UpdateInstructionsAndPropagate(all_instructions); + // Add in positions to all values. + for (const std::unique_ptr& computation : + module->computations()) { + for (const std::unique_ptr& instruction : + computation->instructions()) { + for (const auto& pair : + dataflow_analysis->GetInstructionValueSet(instruction.get())) { + const ShapeIndex& index = pair.first; + const HloValueSet& value_set = pair.second; + for (const HloValue* value : value_set.values()) { + if (value->defining_instruction() != instruction.get()) { + dataflow_analysis->GetValue(value->id()) + .AddPosition(instruction.get(), index); + } + } + } + } + } + + // Construct vector of values. 
+ dataflow_analysis->values_vector_.reserve(dataflow_analysis->values_.size()); + for (auto& pair : dataflow_analysis->values_) { + dataflow_analysis->values_vector_.push_back(&pair.second); + } + std::sort(dataflow_analysis->values_vector_.begin(), + dataflow_analysis->values_vector_.end(), HloValue::IdLessThan); + TF_DCHECK_OK(dataflow_analysis->Verify()); XLA_VLOG_LINES(1, dataflow_analysis->ToString()); @@ -865,14 +656,14 @@ StatusOr> HloDataflowAnalysis::Run( Status HloDataflowAnalysis::Verify() const { // Verify each HloValue appears in the value sets that the value's positions() // indicate. - for (const HloValue& value : values()) { - for (const HloPosition& position : value.positions()) { + for (const HloValue* value : values()) { + for (const HloPosition& position : value->positions()) { const HloValueSet& value_set = GetValueSet(position); TF_RET_CHECK(std::find(value_set.values().begin(), value_set.values().end(), - &value) != value_set.values().end()) + value) != value_set.values().end()) << "Value set at position " << position << " does not contain value " - << value.ToShortString(); + << value->ToShortString(); } } @@ -898,75 +689,4 @@ Status HloDataflowAnalysis::Verify() const { return Status::OK(); } -Status HloDataflowAnalysis::VerifyAgainstReference() const { - TF_RETURN_IF_ERROR(Verify()); - - TF_ASSIGN_OR_RETURN(std::unique_ptr reference, - Run(module_, ssa_form_, bitcast_defines_value_)); - TF_RETURN_IF_ERROR(reference->Verify()); - - VLOG(2) << "This analysis:"; - XLA_VLOG_LINES(2, ToString()); - VLOG(2) << "Reference:"; - XLA_VLOG_LINES(2, reference->ToString()); - - // Verify value sets in each position are identical. - for (const auto& computation : module_->computations()) { - for (const auto& instruction : computation->instructions()) { - for (const auto& pair : GetInstructionValueSet(instruction.get())) { - const ShapeIndex& index = pair.first; - const HloValueSet& value_set = pair.second; - const HloValueSet& reference_value_set = - reference->GetValueSet(instruction.get(), index); - - auto value_in_set = [](const HloValue& v, const HloValueSet& vset) { - return std::find_if(vset.values().begin(), vset.values().end(), - [&v](const HloValue* w) { return *w == v; }) != - vset.values().end(); - }; - - for (const HloValue* value : value_set.values()) { - TF_RET_CHECK(value_in_set(*value, reference_value_set)) - << "Value " << value->ToShortString() - << " does not exist in reference"; - } - for (const HloValue* reference_value : reference_value_set.values()) { - TF_RET_CHECK(value_in_set(*reference_value, value_set)) - << "Value " << reference_value->ToShortString() - << " only exists in reference"; - } - } - } - } - - // Verify all phis resolve identically and uses are identical. 
- for (const HloValue& value : values()) { - const HloValue& reference_value = reference->GetValueDefinedAt( - value.defining_instruction(), value.defining_index()); - TF_RET_CHECK(value.is_phi() == reference_value.is_phi()); - if (value.is_phi()) { - const HloValue* resolved_value = ResolvePhi(value); - const HloValue* reference_resolved_value = - reference->ResolvePhi(reference_value); - if (resolved_value == nullptr) { - TF_RET_CHECK(reference_resolved_value == nullptr); - } else { - TF_RET_CHECK(reference_resolved_value != nullptr); - TF_RET_CHECK(*reference_resolved_value == *resolved_value); - } - } - - for (const HloUse& use : value.uses()) { - TF_RET_CHECK(std::find(reference_value.uses().begin(), - reference_value.uses().end(), - use) != reference_value.uses().end()); - } - for (const HloUse& reference_use : reference_value.uses()) { - TF_RET_CHECK(std::find(value.uses().begin(), value.uses().end(), - reference_use) != value.uses().end()); - } - } - return Status::OK(); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index 7781cc58a3a..aae257dd09e 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -88,10 +88,10 @@ class HloDataflowAnalysis { // given position. const HloValueSet& GetValueSet(const HloInstruction* instruction, const ShapeIndex& index = {}) const; - HloValueSet& GetValueSet(const HloInstruction* instruction, - const ShapeIndex& index = {}); const HloValueSet& GetValueSet(const HloPosition& position) const; HloValueSet& GetValueSet(const HloPosition& position); + HloValueSet& GetValueSet(const HloInstruction* instruction, + const ShapeIndex& index = {}); // Return the unique value in the HloValueSet at the given instruction and // shape index. CHECKs if the value set does not contain a exactly one value. @@ -108,49 +108,11 @@ class HloDataflowAnalysis { const HloValue& GetValue(HloValue::Id value_id) const; HloValue& GetValue(HloValue::Id value_id); - // Returns whether the given values interfere assuming the given HLO - // ordering. Two values interfere if they may both be simultaneously live. - bool MayInterfere(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const; - - // Overload which takes HloValue:Ids. - bool MayInterfere(HloValue::Id a, HloValue::Id b, - const HloOrdering& ordering) const { - return MayInterfere(GetValue(a), GetValue(b), ordering); - } - // Return the total number of HloValues. int64 value_count() const { return values_.size(); } - // Return a vector of all HloValues. - const std::vector& values() const { return values_; } - - // Updates the dataflow after the changing an operand of - // 'instruction'. Dataflow update is not possible if instructions have been - // added or removed from the graph. - void UpdateAfterChangingOperand(HloInstruction* instruction, - HloInstruction* old_operand, - HloInstruction* new_operand); - - // Updates the dataflow after the changing the root of a computation from - // 'old_root' to 'new_root'. - void UpdateAfterChangingRoot(HloInstruction* old_root, - HloInstruction* new_root); - - // Returns the non-phi HloValue that is the unique (transitive) input to the - // given phi. If no such HloValue exists (there are multiple inputs to the - // phi) then nullptr is returned. This is computed by all walking the inputs - // of the given phi value until non-phi HloValue(s) are encountered. 
- const HloValue* ResolvePhi(const HloValue& phi) const; - const HloValue* ResolvePhi(const HloInstruction* instruction, - const ShapeIndex& index = {}) const { - return ResolvePhi(GetValueDefinedAt(instruction, index)); - } - - // Compare the dataflow analysis against a clean recomputation of the - // analysis. Returns an error status if there is a mismatch. Useful for - // verifying the correctness after updates to the analysis. - Status VerifyAgainstReference() const; + // Return a vector of all HloValues stabily sorted by HloValue::Id. + const std::vector& values() const { return values_vector_; } // Return the call graph used for computing the dataflow. const CallGraph& call_graph() const { return *call_graph_; } @@ -161,6 +123,13 @@ class HloDataflowAnalysis { HloDataflowAnalysis(HloModule* module, bool ssa_form, bool bitcast_defines_value = false); + // Returns a new HloValue defined at the given instruction and shape index. + HloValue* NewHloValue(HloInstruction* instruction, const ShapeIndex& index, + bool is_phi = false); + + // Delete the HloValue with the given ID. + void DeleteHloValue(HloValue::Id value_id); + // Constructs and initializes the InstructionValueSets of all instructions to // contain exactly the HloValues defined by each instruction. These values can // then propagated throughout the HLO graph by calling @@ -187,10 +156,11 @@ class HloDataflowAnalysis { void UpdateInstructionsAndPropagate( tensorflow::gtl::ArraySlice instructions); - // Sets the inputs of the given phi to given value(s). - void UpdatePhiInputs( - const HloInstruction* instruction, - tensorflow::gtl::ArraySlice inputs); + // Return the result of the SSA Phi function applied to the given inputs at + // the given instruction. If skip_top_level is true, then the top level of the + // value set of 'instruction' is not modified. + bool Phi(HloInstruction* instruction, + tensorflow::gtl::ArraySlice inputs); // Updates the positions of the HloValues in the output of the given // instruction. This should be called after the instruction value set of @@ -203,20 +173,6 @@ class HloDataflowAnalysis { HloInstruction* instruction, const InstructionValueSet& new_value_set, const InstructionValueSet* prev_value_set = nullptr); - // Returns true if the live range of the given value 'a' is strictly before - // the live range of value 'b' using the given HLO ordering. - bool LiveRangeStrictlyBefore(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const; - - // Returns whether the value 'a' is defined before the value 'b' under the - // given ordering. - bool IsDefinedBefore(const HloValue& a, const HloValue& b, - const HloOrdering& ordering) const; - - // Returns whether the given use is before the given value definition. - bool UseIsBeforeValueDefinition(const HloUse& use, const HloValue& value, - const HloOrdering& ordering) const; - // Verify various invariants of the dataflow analysis. Status Verify() const; @@ -226,19 +182,19 @@ class HloDataflowAnalysis { std::unique_ptr call_graph_; - // Array of all values in the module. This is allocated once at analysis - // construction time so HloValue references are stable. Updates to the - // analysis via UpdateAfterChangingOperand and UpdateAfterChangingRoot do not - // result in the creation or destruction of any HloValues. - std::vector values_; - - // Map hold the inputs to each phi value in the module. Used by ResolvePhi. - tensorflow::gtl::FlatMap> - phi_inputs_; + // The map of all HloValues in the module. 
We pass around pointers to the + // mapped HloValues, so the underlying container must keep them valid despite + // mutations touching other map entries. + std::unordered_map values_; // A map from instruction to InstructionValueSet. std::unordered_map value_sets_; + + // A vector containing all HloValues sorted by HloValue::Id. + std::vector values_vector_; + + // The Id to use for the next HloValue. + HloValue::Id next_value_id_ = 0; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 9f3dd539efe..ef0fa1d745a 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -44,8 +43,8 @@ class HloDataflowAnalysisTest : public HloTestBase, // Run dataflow analysis on the member module. For convenience returns a // reference to the generated analysis stored in analysis_. - HloDataflowAnalysis& RunAnalysis(bool ssa_form, - bool bitcast_defines_value = false) { + const HloDataflowAnalysis& RunAnalysis(bool ssa_form, + bool bitcast_defines_value = false) { analysis_ = HloDataflowAnalysis::Run(module_.get(), ssa_form, bitcast_defines_value) .ConsumeValueOrDie(); @@ -71,8 +70,8 @@ class HloDataflowAnalysisTest : public HloTestBase, const HloInstruction* b) { EXPECT_FALSE(ShapeUtil::IsTuple(a->shape())); EXPECT_FALSE(ShapeUtil::IsTuple(b->shape())); - return analysis_->MayInterfere(analysis_->GetValueDefinedAt(a), - analysis_->GetValueDefinedAt(b), ordering); + return ordering.MayInterfere(analysis_->GetValueDefinedAt(a), + analysis_->GetValueDefinedAt(b)); } std::unique_ptr module_; @@ -499,37 +498,26 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) { EXPECT_FALSE(analysis.GetValueDefinedAt(cond_constant).live_out_of_module()); if (ssa_form) { - // While instruction should define phi values. The value at index {0} is a - // degenerate phi with a single input 'constant1'. - EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{0})); - EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{0}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(xla_while, /*index=*/{0}), - &analysis.GetValueDefinedAt(constant1)); - EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param, /*index=*/{0})); - EXPECT_TRUE(analysis.GetValueDefinedAt(body_param, /*index=*/{0}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(body_param, /*index=*/{0}), - &analysis.GetValueDefinedAt(constant1)); - EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{0})); - EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param, /*index=*/{0}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(cond_param, /*index=*/{0}), - &analysis.GetValueDefinedAt(constant1)); + // Element 0 of the tuple passed through the body so no phi value is + // defined. + EXPECT_FALSE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(body_param, /*index=*/{0})); + EXPECT_FALSE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{0})); + // Element 1 of the tuple should be a phi value. 
EXPECT_TRUE(analysis.ValueIsDefinedAt(xla_while, /*index=*/{1})); EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(xla_while, /*index=*/{1}), nullptr); EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param, /*index=*/{1})); EXPECT_TRUE(analysis.GetValueDefinedAt(body_param, /*index=*/{1}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(body_param, /*index=*/{1}), nullptr); EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param, /*index=*/{1})); EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param, /*index=*/{1}).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(cond_param, /*index=*/{1}), nullptr); - EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(), - UnorderedElementsAre(HloUse{xla_while, 0, {0}})); + EXPECT_THAT( + analysis.GetValueDefinedAt(constant1).uses(), + UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{xla_while, 0, {0}})); - EXPECT_FALSE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{0}) - .live_out_of_module()); + // Constant1 passes through the body and out of the module. + EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1}) .live_out_of_module()); @@ -613,20 +601,15 @@ TEST_P(HloDataflowAnalysisTest, SequentialWhiles) { bool ssa_form = GetParam(); const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - if (ssa_form) { - EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while2).live_out_of_module()); - EXPECT_FALSE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); - } else { - // Element 0 is passed through all the while instructions and out of the - // module. - EXPECT_EQ(analysis.GetUniqueValueAt(xla_while0, /*index=*/{0}), - analysis.GetValueDefinedAt(constant1)); - EXPECT_EQ(analysis.GetUniqueValueAt(xla_while1, /*index=*/{0}), - analysis.GetValueDefinedAt(constant1)); - EXPECT_EQ(analysis.GetUniqueValueAt(xla_while2, /*index=*/{0}), - analysis.GetValueDefinedAt(constant1)); - EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); - } + // Element 0 is passed through all the while instructions and out of the + // module.. + EXPECT_EQ(analysis.GetUniqueValueAt(xla_while0, /*index=*/{0}), + analysis.GetValueDefinedAt(constant1)); + EXPECT_EQ(analysis.GetUniqueValueAt(xla_while1, /*index=*/{0}), + analysis.GetValueDefinedAt(constant1)); + EXPECT_EQ(analysis.GetUniqueValueAt(xla_while2, /*index=*/{0}), + analysis.GetValueDefinedAt(constant1)); + EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module()); } TEST_P(HloDataflowAnalysisTest, NestedWhiles) { @@ -705,13 +688,18 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) { bool ssa_form = GetParam(); const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); + EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(negate))); if (ssa_form) { EXPECT_TRUE(analysis.ValueIsDefinedAt(inner_param, /*index=*/{1})); EXPECT_TRUE( analysis.GetValueDefinedAt(inner_param, /*index=*/{1}).is_phi()); - EXPECT_TRUE(analysis.ValueIsDefinedAt(nested_while, /*index=*/{0})); - EXPECT_TRUE( - analysis.GetValueDefinedAt(inner_param, /*index=*/{1}).is_phi()); + + // Element 0 of the nested while is %negate. + EXPECT_FALSE(analysis.ValueIsDefinedAt(nested_while, /*index=*/{0})); + EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{0}), + UnorderedElementsAre(analysis.GetValueDefinedAt(negate))); + // Element 1 is a phi value (join of %add and %constant2). 
EXPECT_TRUE(analysis.ValueIsDefinedAt(nested_while, /*index=*/{1})); EXPECT_TRUE( analysis.GetValueDefinedAt(nested_while, /*index=*/{1}).is_phi()); @@ -724,8 +712,6 @@ TEST_P(HloDataflowAnalysisTest, NestedWhiles) { EXPECT_TRUE( analysis.GetValueDefinedAt(entry_while, /*index=*/{1}).is_phi()); } else { - EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(negate))); EXPECT_THAT(HloValuesAt(inner_param, /*index=*/{1}), UnorderedElementsAre(analysis.GetValueDefinedAt(add), analysis.GetValueDefinedAt(constant2))); @@ -1496,256 +1482,6 @@ TEST_P(HloDataflowAnalysisTest, EmbeddedComputationInterference) { EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, embedded_log)); } -TEST_P(HloDataflowAnalysisTest, UpdateAnalysisForWhile) { - // Test updating dataflow after modifying a module with an array shaped while: - // - // body(F32[] %param): - // %negate = Negate(%param) - // - // condition(F32[] %param): - // return Constant(false) - // - // entry: - // %constant = Constant(1.0) - // %exp = Exp(%constant) - // return While(%exp, body, condition) - // - auto body_builder = HloComputation::Builder("body"); - auto body_param = body_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kNegate, body_param)); - HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build()); - - // Condition computation trivially returns a constant "false". - auto cond_builder = HloComputation::Builder("condition"); - auto cond_param = cond_builder.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape_, "param")); - cond_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); - HloComputation* condition = - module_->AddEmbeddedComputation(cond_builder.Build()); - - auto builder = HloComputation::Builder(TestName()); - auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto exp = builder.AddInstruction( - HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kExp, constant)); - auto xla_while = builder.AddInstruction( - HloInstruction::CreateWhile(scalar_shape_, condition, body, exp)); - module_->AddEntryComputation(builder.Build()); - - bool ssa_form = GetParam(); - HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - - // Sanity check the initial dataflow analysis before transforming the HLO - // graph. 
- if (ssa_form) { - EXPECT_TRUE(analysis.ValueIsDefinedAt(body_param)); - EXPECT_TRUE(analysis.GetValueDefinedAt(body_param).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(body_param), nullptr); - - EXPECT_TRUE(analysis.ValueIsDefinedAt(cond_param)); - EXPECT_TRUE(analysis.GetValueDefinedAt(cond_param).is_phi()); - EXPECT_EQ(analysis.ResolvePhi(cond_param), nullptr); - - EXPECT_FALSE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - EXPECT_FALSE(analysis.GetValueDefinedAt(negate).live_out_of_module()); - } else { - EXPECT_THAT(HloValuesAt(body_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp), - analysis.GetValueDefinedAt(negate))); - EXPECT_THAT(HloValuesAt(cond_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp), - analysis.GetValueDefinedAt(negate))); - EXPECT_THAT(HloValuesAt(xla_while), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp), - analysis.GetValueDefinedAt(negate))); - - EXPECT_TRUE(analysis.GetValueDefinedAt(negate).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - } - - // Set the body root to the body_param. Previously it was Negate(body_param). - body->set_root_instruction(body_param); - - // Prior to updating, verify that the dataflow analysis is no longer valid. - Status verify_status = analysis.VerifyAgainstReference(); - EXPECT_FALSE(verify_status.ok()); - - analysis.UpdateAfterChangingRoot(/*old_root=*/negate, - /*new_root=*/body_param); - - // Analysis should be valid after the update. - TF_EXPECT_OK(analysis.VerifyAgainstReference()); - - if (ssa_form) { - // The phis should now be resolvable as 'exp' is passed through the body - // transparently. - EXPECT_EQ(analysis.ResolvePhi(body_param), - &analysis.GetValueDefinedAt(exp)); - EXPECT_EQ(analysis.ResolvePhi(cond_param), - &analysis.GetValueDefinedAt(exp)); - EXPECT_EQ(analysis.ResolvePhi(xla_while), &analysis.GetValueDefinedAt(exp)); - EXPECT_FALSE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - } else { - EXPECT_THAT(HloValuesAt(body_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp))); - EXPECT_THAT(HloValuesAt(cond_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp))); - EXPECT_THAT(HloValuesAt(xla_while), - UnorderedElementsAre(analysis.GetValueDefinedAt(exp))); - EXPECT_TRUE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - } - EXPECT_FALSE(analysis.GetValueDefinedAt(negate).live_out_of_module()); - - // Now replace the operand of the while with %constant (was %exp). - TF_ASSERT_OK(exp->ReplaceUseWith(xla_while, constant)); - analysis.UpdateAfterChangingOperand(xla_while, /*old_operand=*/exp, - /*new_operand=*/constant); - - // Verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - if (ssa_form) { - // The phis now resolve to 'constant'. - EXPECT_EQ(analysis.ResolvePhi(body_param), - &analysis.GetValueDefinedAt(constant)); - EXPECT_EQ(analysis.ResolvePhi(cond_param), - &analysis.GetValueDefinedAt(constant)); - EXPECT_EQ(analysis.ResolvePhi(xla_while), - &analysis.GetValueDefinedAt(constant)); - } else { - EXPECT_THAT(HloValuesAt(body_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant))); - EXPECT_THAT(HloValuesAt(cond_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant))); - EXPECT_THAT(HloValuesAt(xla_while), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant))); - EXPECT_TRUE(analysis.GetValueDefinedAt(constant).live_out_of_module()); - } - - // And finally make the negate the root of the body again. 
- body->set_root_instruction(negate); - analysis.UpdateAfterChangingRoot(/*old_root=*/body_param, - /*new_root=*/negate); - - // Verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - if (ssa_form) { - // Phis should no longer be resolvable. - EXPECT_EQ(analysis.ResolvePhi(body_param), nullptr); - EXPECT_EQ(analysis.ResolvePhi(cond_param), nullptr); - EXPECT_EQ(analysis.ResolvePhi(xla_while), nullptr); - } else { - EXPECT_THAT(HloValuesAt(body_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant), - analysis.GetValueDefinedAt(negate))); - EXPECT_THAT(HloValuesAt(cond_param), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant), - analysis.GetValueDefinedAt(negate))); - EXPECT_THAT(HloValuesAt(xla_while), - UnorderedElementsAre(analysis.GetValueDefinedAt(constant), - analysis.GetValueDefinedAt(negate))); - - EXPECT_FALSE(analysis.GetValueDefinedAt(exp).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(negate).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(constant).live_out_of_module()); - } - - // After the updates, verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); -} - -TEST_P(HloDataflowAnalysisTest, UpdateOfATupleSelect) { - // Test changing the operands of kSelects of a tuple value and updating the - // dataflow. - auto builder = HloComputation::Builder(TestName()); - auto pred = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); - auto a = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto b = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(2.0))); - auto c = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(3.0))); - auto d = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(4.0))); - auto tuple_a = builder.AddInstruction(HloInstruction::CreateTuple({a})); - auto tuple_b = builder.AddInstruction(HloInstruction::CreateTuple({b})); - auto tuple_c = builder.AddInstruction(HloInstruction::CreateTuple({c})); - auto tuple_d = builder.AddInstruction(HloInstruction::CreateTuple({d})); - const Shape tuple_shape = tuple_a->shape(); - auto select_aa = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple_a, tuple_a)); - auto select_ab = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple_a, tuple_b)); - auto select_cd = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, tuple_c, tuple_d)); - auto select_abcd = builder.AddInstruction(HloInstruction::CreateTernary( - tuple_shape, HloOpcode::kSelect, pred, select_ab, select_cd)); - - module_->AddEntryComputation(builder.Build()); - - bool ssa_form = GetParam(); - HloDataflowAnalysis& analysis = RunAnalysis(ssa_form); - - // Sanity check dataflow before changing the graph and updating. 
- EXPECT_THAT(HloValuesAt(select_aa, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a))); - EXPECT_THAT(HloValuesAt(select_ab, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(b))); - EXPECT_THAT(HloValuesAt(select_cd, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(c), - analysis.GetValueDefinedAt(d))); - EXPECT_THAT(HloValuesAt(select_abcd, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(b), - analysis.GetValueDefinedAt(c), - analysis.GetValueDefinedAt(d))); - EXPECT_TRUE(analysis.GetValueDefinedAt(a).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(b).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(c).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(d).live_out_of_module()); - - // Set the rhs of 'select_aa' to be 'd'. - TF_ASSERT_OK(select_aa->ReplaceOperandWith(2, tuple_d)); - analysis.UpdateAfterChangingOperand(select_aa, /*old_operand=*/tuple_a, - /*new_operand=*/tuple_d); - - // Verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - EXPECT_THAT(HloValuesAt(select_aa, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(d))); - - // Set the lhs of 'select_cd' to be 'a'. - TF_ASSERT_OK(select_cd->ReplaceOperandWith(1, tuple_a)); - analysis.UpdateAfterChangingOperand(select_cd, /*old_operand=*/tuple_c, - /*new_operand=*/tuple_a); - - // Verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); - - EXPECT_THAT(HloValuesAt(select_cd, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(d))); - EXPECT_THAT(HloValuesAt(select_abcd, /*index=*/{0}), - UnorderedElementsAre(analysis.GetValueDefinedAt(a), - analysis.GetValueDefinedAt(b), - analysis.GetValueDefinedAt(d))); - EXPECT_TRUE(analysis.GetValueDefinedAt(a).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(b).live_out_of_module()); - EXPECT_FALSE(analysis.GetValueDefinedAt(c).live_out_of_module()); - EXPECT_TRUE(analysis.GetValueDefinedAt(d).live_out_of_module()); - - // After the updates, verify that the dataflow is correct. - TF_ASSERT_OK(analysis.VerifyAgainstReference()); -} - INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation, HloDataflowAnalysisTest, ::testing::Values(false, true)); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index dfb111d1d0b..07b3369d5c1 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -561,13 +561,21 @@ tooltip = " "; } string comp_body = DumpComputation(subcomp); - string computation = - Printf(computation_fmt, id, style, subcomp_label, comp_body, id); - // Add an edge from the subcomputation to its parent node. If subcomp - // belongs to a fusion node, it's drawn in place of the fusion instruction, so - // there's no need to link those. - if (parent_instr->opcode() != HloOpcode::kFusion) { + if (parent_instr->opcode() == HloOpcode::kFusion) { + // Dump any nested fusion nodes. + for (const auto& subcomp_instr : subcomp->instructions()) { + if (subcomp_instr->opcode() == HloOpcode::kFusion) { + StrAppend( + &comp_body, + DumpSubcomputation(subcomp_instr->fused_instructions_computation(), + subcomp_instr.get())); + } + } + } else { + // Add an edge from the subcomputation to its parent node. 
If subcomp + // belongs to a fusion node, it's drawn in place of the fusion instruction, + // so there's no need to link those. edge_ids_.insert( {{subcomp->root_instruction(), parent_instr}, next_edge_id_++}); const char* edge_fmt = @@ -578,6 +586,9 @@ tooltip = " "; subcomp->name(), parent_instr->name())); } + string computation = + Printf(computation_fmt, id, style, subcomp_label, comp_body, id); + return computation; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 3bdb67ba923..24ef4e09e7c 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -793,13 +793,6 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( } } - for (HloComputation* computation : - instruction_to_fuse->called_computations()) { - if (std::find(called_computations_.begin(), called_computations_.end(), - computation) == called_computations_.end()) { - called_computations_.push_back(computation); - } - } VLOG(2) << "New clone:\n" << clone->ToString(); return clone; } @@ -1669,6 +1662,21 @@ string HloInstruction::ExtendedOpcodeStr() const { string HloInstruction::ToString(bool compact_operands, bool include_metadata) const { + string result = + StrCat(name(), " = ", ShapeUtil::HumanStringWithLayout(shape()), " ", + ExtendedOpcodeStr(), "(", OperandsToString(compact_operands), ")"); + for (const string& extra : ExtraAttributesToString()) { + StrAppend(&result, ", ", extra); + } + if (include_metadata && + (!metadata_.op_type().empty() || !metadata_.op_name().empty() || + !metadata_.source_file().empty())) { + StrAppend(&result, " # metadata=", metadata_.ShortDebugString()); + } + return result; +} + +string HloInstruction::OperandsToString(bool compact) const { string operands; if (opcode() == HloOpcode::kConstant) { // For constants, show the actual value in place of an empty operand list. 
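As a rough usage sketch, not part of this patch: with OperandsToString() and ExtraAttributesToString() factored out of ToString(), a caller can render the pieces independently. `LabelForGraphNode` below is a hypothetical helper name, shown only to illustrate the new accessors.

#include <string>
#include <vector>

#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/strings/strcat.h"

namespace xla {

string LabelForGraphNode(const HloInstruction& instr) {
  // A compact operand list keeps the label short; each extra attribute
  // (dimensions, window, padding, calls, ...) goes on its own line.
  string label = tensorflow::strings::StrCat(
      instr.name(), "(", instr.OperandsToString(/*compact=*/true), ")");
  for (const string& attr : instr.ExtraAttributesToString()) {
    tensorflow::strings::StrAppend(&label, "\n", attr);
  }
  return label;
}

}  // namespace xla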
@@ -1697,12 +1705,12 @@ string HloInstruction::ToString(bool compact_operands, } else { tensorflow::gtl::ArraySlice slice(operands_); const int64 kMaxOperandsToShowIfCompact = 4; - if (compact_operands && slice.size() > kMaxOperandsToShowIfCompact) { + if (compact && slice.size() > kMaxOperandsToShowIfCompact) { slice.remove_suffix(slice.size() - kMaxOperandsToShowIfCompact); } operands = Join(slice, ", ", [&](string* out, HloInstruction* operand) { *out += ShapeUtil::HumanStringWithLayout(operand->shape()); - if (!compact_operands) { + if (!compact) { StrAppend(out, " ", operand->name()); } }); @@ -1711,15 +1719,19 @@ string HloInstruction::ToString(bool compact_operands, StrAppend(&operands, ", ...(+", remaining, ")"); } } - string extra; + return operands; +} + +std::vector HloInstruction::ExtraAttributesToString() const { + std::vector extra; if (CanHaveDimensionsField()) { - StrAppend(&extra, ", dimensions={", Join(dimensions(), ","), "}"); + extra.push_back(StrCat("dimensions={", Join(dimensions(), ","), "}")); } if (window_ != nullptr) { - StrAppend(&extra, ", ", window_util::ToString(*window_)); + extra.push_back(window_util::ToString(*window_)); } if (padding_config_ != nullptr) { - StrAppend(&extra, ", padding=", padding_config_->ShortDebugString()); + extra.push_back(StrCat("padding=", padding_config_->ShortDebugString())); } if (!slice_starts_.empty() && !slice_limits_.empty()) { std::vector bounds; @@ -1728,45 +1740,38 @@ string HloInstruction::ToString(bool compact_operands, bounds.push_back( StrCat("[", slice_starts_[i], ":", slice_limits_[i], "]")); } - StrAppend(&extra, ", slice={", Join(bounds, ", "), "}"); + extra.push_back(StrCat("slice={", Join(bounds, ", "), "}")); } if (convolution_dimension_numbers_ != nullptr) { - StrAppend(&extra, ", ", ConvolutionDimensionNumbersToString()); + extra.push_back(ConvolutionDimensionNumbersToString()); } if (opcode() == HloOpcode::kWhile) { - StrAppend(&extra, ", condition=", while_condition()->name()); - StrAppend(&extra, ", body=", while_body()->name()); + extra.push_back(StrCat("condition=", while_condition()->name())); + extra.push_back(StrCat("body=", while_body()->name())); } else if (opcode() == HloOpcode::kSelectAndScatter) { - StrAppend(&extra, ", select=", select()->name()); - StrAppend(&extra, ", scatter=", scatter()->name()); + extra.push_back(StrCat("select=", select()->name())); + extra.push_back(StrCat("scatter=", scatter()->name())); } else if (!called_computations().empty()) { - StrAppend(&extra, ", calls=", - Join(called_computations(), ", ", - [](string* out, const HloComputation* computation) { - StrAppend(out, computation->name()); - })); + extra.push_back(StrCat( + "calls=", Join(called_computations(), ", ", + [](string* out, const HloComputation* computation) { + StrAppend(out, computation->name()); + }))); } if (opcode() == HloOpcode::kGetTupleElement) { - StrAppend(&extra, ", index=", tuple_index()); + extra.push_back(StrCat("index=", tuple_index())); } if (!control_successors_.empty()) { - StrAppend( - &extra, ", control-successors=", + extra.push_back(StrCat( + "control-successors=", Join(control_successors_, ", ", [](string* out, HloInstruction* succ) { StrAppend(out, succ->name()); - })); + }))); } - if (include_metadata && - (!metadata_.op_type().empty() || !metadata_.op_name().empty() || - !metadata_.source_file().empty())) { - StrAppend(&extra, " # metadata=", metadata_.ShortDebugString()); - } - - return StrCat(name(), " = ", ShapeUtil::HumanStringWithLayout(shape()), " ", - ExtendedOpcodeStr(), "(", 
operands, ")", extra); + return extra; } string HloInstruction::ToShortString() const { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 5688fcc4255..ca6f27bd40e 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -548,6 +548,14 @@ class HloInstruction { string ToString(bool compact_operands = false, bool include_metadata = true) const; + // Components of the ToString() representation: + + // Returns a string representation of the operand list. + string OperandsToString(bool compact) const; + + // Returns string representation of op-specific attributes. + std::vector ExtraAttributesToString() const; + string ToStringNoMetadata() const { return ToString(false, false); } // As ToString, but returns a shorter string. @@ -797,8 +805,7 @@ class HloInstruction { const Shape& shape, tensorflow::gtl::ArraySlice operands); - // Returns the computations this instruction calls (if any). This includes - // computations called by fused instructions inside of a fusion instruction. + // Returns the computations this instruction directly calls (if any). const std::vector& called_computations() const { return called_computations_; } diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index ea5749581b5..2e1eeee36b5 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -758,16 +758,13 @@ TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { auto* fusion = computation->CreateFusionInstruction( {map_3_y}, HloInstruction::FusionKind::kLoop); auto* fused_computation = fusion->fused_instructions_computation(); - EXPECT_THAT(fusion->called_computations(), - ElementsAre(fused_computation, computation_y)); + EXPECT_THAT(fusion->called_computations(), ElementsAre(fused_computation)); fusion->FuseInstruction(map_2_x); - EXPECT_THAT(fusion->called_computations(), - ElementsAre(fused_computation, computation_y, computation_x)); + EXPECT_THAT(fusion->called_computations(), ElementsAre(fused_computation)); fusion->FuseInstruction(map_1_x); - EXPECT_THAT(fusion->called_computations(), - ElementsAre(fused_computation, computation_y, computation_x)); + EXPECT_THAT(fusion->called_computations(), ElementsAre(fused_computation)); } TEST_F(HloInstructionTest, ComplexFusionOp) { diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc index ad6070a9c1b..c95e44bd5d9 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_scheduling.h" @@ -218,6 +219,94 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) { EXPECT_FALSE(ordering.ExecutesBefore(body_param, cond_param)); } +TEST_F(HloOrderingTest, ValuesInWhileComputations) { + // Tests the ordering of values (defined by dataflow analysis) in the body and + // condition of a while instruction. 
HLO code: + // + // body(F32[]) %param): + // %negate = Negate(%param) + // + // condition(F32[] %param): + // %convert = Convert(%param) + // + // entry: + // %constant = Constant(1.0) + // %while = While(%constant, body, condition) + // %add = Add(%constant, %while) + // + auto module = CreateNewModule(); + const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + + auto body_builder = HloComputation::Builder("body"); + auto body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "body_param")); + auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary( + scalar_shape, HloOpcode::kNegate, body_param)); + HloComputation* body = module->AddEmbeddedComputation(body_builder.Build()); + + auto cond_builder = HloComputation::Builder("condition"); + auto cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "cond_param")); + auto convert = cond_builder.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(xla::PRED, {}), cond_param)); + HloComputation* condition = + module->AddEmbeddedComputation(cond_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + auto xla_while = builder.AddInstruction( + HloInstruction::CreateWhile(scalar_shape, condition, body, constant)); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kAdd, constant, xla_while)); + module->AddEntryComputation(builder.Build()); + + TF_ASSERT_OK_AND_ASSIGN( + auto dataflow, HloDataflowAnalysis::Run(module.get(), /*ssa_form=*/true)); + DependencyHloOrdering ordering(module.get()); + + // Init value is defined before the while, but live range is not before the + // while because of the use of the init value in the add. + EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(constant), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_FALSE( + ordering.LiveRangeStrictlyBefore(dataflow->GetValueDefinedAt(constant), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_TRUE(ordering.MayInterfere(dataflow->GetValueDefinedAt(constant), + dataflow->GetValueDefinedAt(xla_while))); + + // Any value defined in the body or condition is defined before the while, and + // has a live range strictly before the while. + EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(negate), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_TRUE( + ordering.LiveRangeStrictlyBefore(dataflow->GetValueDefinedAt(negate), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_FALSE(ordering.MayInterfere(dataflow->GetValueDefinedAt(negate), + dataflow->GetValueDefinedAt(xla_while))); + + EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(convert), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_TRUE( + ordering.LiveRangeStrictlyBefore(dataflow->GetValueDefinedAt(convert), + dataflow->GetValueDefinedAt(xla_while))); + EXPECT_FALSE(ordering.MayInterfere(dataflow->GetValueDefinedAt(convert), + dataflow->GetValueDefinedAt(xla_while))); + + // The live range of the while should be before the add. 
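The assertions that follow exercise the ordering queries directly. Outside a test, a client pass might combine the dataflow analysis with an ordering in the same way; here is a minimal sketch under that assumption, where `ValuesMayInterfere` is a hypothetical helper and `a`/`b` are the instructions of interest.

#include <memory>

#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_ordering.h"

namespace xla {

StatusOr<bool> ValuesMayInterfere(HloModule* module, const HloInstruction* a,
                                  const HloInstruction* b) {
  // SSA-form dataflow gives one HloValue per (instruction, index) definition.
  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
                      HloDataflowAnalysis::Run(module, /*ssa_form=*/true));
  DependencyHloOrdering ordering(module);
  // Two values interfere unless one live range is strictly before the other.
  return ordering.MayInterfere(dataflow->GetValueDefinedAt(a),
                               dataflow->GetValueDefinedAt(b));
}

}  // namespace xla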
+ EXPECT_TRUE(ordering.IsDefinedBefore(dataflow->GetValueDefinedAt(xla_while), + dataflow->GetValueDefinedAt(add))); + ASSERT_EQ(dataflow->GetValueDefinedAt(xla_while).uses().size(), 1); + + const HloUse& while_use = dataflow->GetValueDefinedAt(xla_while).uses()[0]; + EXPECT_EQ(while_use.instruction, add); + EXPECT_TRUE(ordering.UseIsBeforeValueDefinition( + while_use, dataflow->GetValueDefinedAt(add))); + EXPECT_TRUE( + ordering.LiveRangeStrictlyBefore(dataflow->GetValueDefinedAt(xla_while), + dataflow->GetValueDefinedAt(add))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index eb3da111a24..7ad33c8947c 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; namespace xla { @@ -54,11 +55,18 @@ StatusOr HloPassPipeline::Run(HloModule* module) { << tensorflow::str_util::Join(disabled_passes, ", "); } - auto run_invariant_checkers = [this, module]() -> Status { + auto run_invariant_checkers = [this, + module](const string& message) -> Status { for (auto& invariant_checker : invariant_checkers_) { VLOG(1) << " Invariant checker " << invariant_checker->name(); - TF_ASSIGN_OR_RETURN(bool changed, invariant_checker->Run(module)); - TF_RET_CHECK(!changed) << "invariant checkers must not change the graph"; + StatusOr changed_status = invariant_checker->Run(module); + if (!changed_status.ok()) { + return Status(changed_status.status().code(), + StrCat(changed_status.status().error_message(), + "\n\nFailed ", message)); + } + TF_RET_CHECK(!changed_status.ValueOrDie()) + << "invariant checkers must not change the graph"; } return Status::OK(); }; @@ -66,6 +74,8 @@ StatusOr HloPassPipeline::Run(HloModule* module) { string prefix = name().ToString() + ": pipeline start"; bool changed = false; string message; + TF_RETURN_IF_ERROR( + run_invariant_checkers(StrCat("before running pipeline: ", name()))); for (auto& pass : passes_) { if (disabled_passes.count(pass->name().ToString()) > 0) { VLOG(1) << " Skipping HLO pass " << pass->name() @@ -80,14 +90,14 @@ StatusOr HloPassPipeline::Run(HloModule* module) { StrAppend(&message, prefix, ", before ", pass->name()); DumpModule(*module, message); - TF_RETURN_IF_ERROR(run_invariant_checkers()); TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module)); + TF_RETURN_IF_ERROR( + run_invariant_checkers(StrCat("after running pass: ", pass->name()))); changed |= changed_this_pass; prefix.clear(); StrAppend(&prefix, name(), ": after ", pass->name()); } - TF_RETURN_IF_ERROR(run_invariant_checkers()); DumpModule(*module, prefix + ", pipeline end"); return changed; } diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 278a1d7efad..6e5d7bca75c 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -1202,7 +1202,7 @@ StatusOr HloRematerialization::RematerializeComputation( StatusOr HloRematerialization::Run( HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence, - int64 memory_limit_bytes) { + int64 memory_limit_bytes, RematerializationSizes* sizes) { // The sequence is constructed entirely by this method. 
TF_RET_CHECK(sequence->empty()); @@ -1248,7 +1248,8 @@ StatusOr HloRematerialization::Run( sequence->at(node.computation()))); } return Status::OK(); - })); + }, + /*visit_unreachable_nodes=*/false)); // The peak memory usage of the module equals the peak memory use of the entry // computation plus the output size of the computation. This is because the @@ -1318,13 +1319,20 @@ StatusOr HloRematerialization::Run( << HumanReadableNumBytes(reduced_peak_memory) << " (" << reduced_peak_memory << " bytes)"; + if (sizes != nullptr) { + sizes->before_bytes = before_peak_memory; + sizes->after_bytes = current_peak_memory; + } + XLA_VLOG_LINES(3, "After HloRematerialization:\n" + module->ToString()); if (current_peak_memory > memory_limit_bytes) { - LOG(WARNING) << "Can't reduce memory use below " - << HumanReadableNumBytes(memory_limit_bytes) - << " by rematerialization (only reduced to " - << HumanReadableNumBytes(current_peak_memory) << ")"; + LOG(WARNING) << tensorflow::strings::Printf( + "Can't reduce memory use below %s (%lld bytes) by rematerialization; " + "only reduced to %s (%lld bytes)", + HumanReadableNumBytes(memory_limit_bytes).c_str(), memory_limit_bytes, + HumanReadableNumBytes(current_peak_memory).c_str(), + current_peak_memory); } return changed; @@ -1333,9 +1341,10 @@ StatusOr HloRematerialization::Run( /* static */ StatusOr HloRematerialization::RematerializeAndSchedule( const HloRematerialization::ShapeSizeFunction& size_function, int64 memory_limit_bytes, HloModule* hlo_module, - SequentialHloOrdering::HloModuleSequence* sequence) { + SequentialHloOrdering::HloModuleSequence* sequence, + RematerializationSizes* sizes) { HloRematerialization remat(size_function); - return remat.Run(hlo_module, sequence, memory_limit_bytes); + return remat.Run(hlo_module, sequence, memory_limit_bytes, sizes); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h index 42c279d440b..11f79a6d415 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.h +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h @@ -28,6 +28,13 @@ class HloRematerialization { public: using ShapeSizeFunction = std::function; + // Helper struct that communicates the before / after sizes for the + // rematerialization process. + struct RematerializationSizes { + int64 before_bytes; + int64 after_bytes; + }; + // Rematerialize HLO instructions in the given module to reduce peak memory // use below memory_limit_bytes where memory use is defined as the total size // of all live HLO instruction values. Parameters and constants are included @@ -46,6 +53,9 @@ class HloRematerialization { // rematerialization. This is the order in which HLO instructions should // be emitted to minimize memory use. // + // sizes: Optional outparam that indicates the peak memory usage of the HLO + // module before/after rematerialization. + // // Returns whether any instructions were rematerialized. If memory use is // already below the given limit then no instructions are rematerialized and // false is returned. @@ -55,8 +65,8 @@ class HloRematerialization { // code generation. 
static StatusOr RematerializeAndSchedule( const ShapeSizeFunction& size_function, int64 memory_limit_bytes, - HloModule* hlo_module, - SequentialHloOrdering::HloModuleSequence* sequence); + HloModule* hlo_module, SequentialHloOrdering::HloModuleSequence* sequence, + RematerializationSizes* sizes = nullptr); protected: HloRematerialization(const ShapeSizeFunction& size_function) @@ -69,7 +79,7 @@ class HloRematerialization { // contains the memory-minimizing order in which to emit the HLO instructions. StatusOr Run(HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence, - int64 memory_limit); + int64 memory_limit, RematerializationSizes* sizes); // Rematerializes instructions within the given computation. 'order' is the // order in which the computation's instructions will be emitted in the diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index f85d8ec50de..e6cf0d37b8a 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -159,12 +159,6 @@ void HloValue::AddPosition(HloInstruction* instruction, for (const HloPosition& position : positions_) { DCHECK_NE(position, new_position); } - // The shape of the new position must match existing positions. - if (!positions_.empty()) { - CHECK( - ShapeUtil::Compatible(positions_.front().shape(), new_position.shape())) - << "front: " << positions_.front() << " new: " << new_position; - } positions_.push_back(std::move(new_position)); diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h index 63ecc25020b..6872bc76a82 100644 --- a/tensorflow/compiler/xla/service/hlo_value.h +++ b/tensorflow/compiler/xla/service/hlo_value.h @@ -225,6 +225,9 @@ class HloValueSet { // already exist in the set. bool AddValue(const HloValue* value); + // Clear all values from the set. + void Clear() { values_.clear(); } + // Return the unique HLO value in the set. CHECKs if the set does not contain // exactly one value. 
const HloValue& GetUniqueValue() const { diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 9ba2d54d024..c44be716cdf 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -32,13 +32,11 @@ class ShapeVerifier : public DfsHloVisitor { const std::function& shape_size_fn) : shape_size_fn_(shape_size_fn) {} - Status HandleElementwiseUnary(HloInstruction* hlo, - HloOpcode opcode) override { + Status HandleElementwiseUnary(HloInstruction* hlo) override { return CheckUnaryShape(hlo); } - Status HandleElementwiseBinary(HloInstruction* hlo, - HloOpcode opcode) override { + Status HandleElementwiseBinary(HloInstruction* hlo) override { return CheckBinaryShape(hlo); } @@ -282,6 +280,14 @@ class ShapeVerifier : public DfsHloVisitor { const std::function shape_size_fn_; }; +string ComputationsToString( + tensorflow::gtl::ArraySlice computations) { + return tensorflow::str_util::Join( + computations, ",", [](string* s, const HloComputation* computation) { + s->append(computation->name()); + }); +} + } // namespace StatusOr HloVerifier::Run(HloModule* module) { @@ -292,6 +298,17 @@ StatusOr HloVerifier::Run(HloModule* module) { for (const auto& instruction : computation->instructions()) { TF_RET_CHECK(instruction->parent() == computation.get()); if (instruction->opcode() == HloOpcode::kFusion) { + TF_RET_CHECK( + ContainersEqual(instruction->called_computations(), + {instruction->fused_instructions_computation()})) + << "Fusion HLO calls computations other than the " + "fused_instructions_computation: " + << instruction->ToString() + << " instruction->fused_instructions_computation(): " + << instruction->fused_instructions_computation()->ToString() + << " instruction->called_computations(): " + << ComputationsToString(instruction->called_computations()); + for (const auto& fused : instruction->fused_instructions()) { TF_RET_CHECK(fused->parent() == instruction->fused_instructions_computation()) diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc index 01dbb7e8663..8275531111c 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc @@ -122,7 +122,9 @@ StatusOr ReducePrecisionInsertion::insert_on_inputs( continue; } - if (instruction->opcode() == HloOpcode::kFusion) { + if (instruction->opcode() == HloOpcode::kFusion && + (instruction->fusion_kind() == HloInstruction::FusionKind::kLoop || + instruction->fusion_kind() == HloInstruction::FusionKind::kInput)) { // Insert the reduce-precision operation inside the fusion computation, // after the corresponding parameter instruction. TF_ASSIGN_OR_RETURN( @@ -171,7 +173,9 @@ StatusOr ReducePrecisionInsertion::insert_on_outputs( continue; } - if (instruction->opcode() == HloOpcode::kFusion) { + if (instruction->opcode() == HloOpcode::kFusion && + (instruction->fusion_kind() == HloInstruction::FusionKind::kLoop || + instruction->fusion_kind() == HloInstruction::FusionKind::kOutput)) { // Insert the reduce-precision operation as the last operation inside // the fusion computation. 
HloInstruction* fusion_root = instruction->fused_expression_root(); diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 52b2027aece..9f7ae4ae873 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -215,6 +215,7 @@ cc_library( ], deps = [ "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:test", diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc index 407b5f4ada5..b61544466a1 100644 --- a/tensorflow/compiler/xla/tests/filecheck.cc +++ b/tensorflow/compiler/xla/tests/filecheck.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/io/path.h" diff --git a/tensorflow/compiler/xla/tests/filecheck.h b/tensorflow/compiler/xla/tests/filecheck.h index 599bf57ad32..493ff7414bd 100644 --- a/tensorflow/compiler/xla/tests/filecheck.h +++ b/tensorflow/compiler/xla/tests/filecheck.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" namespace xla { diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index a946d335ca6..da39ba3ffc3 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -111,6 +111,11 @@ cc_binary( deps = [ ":replay_computation_library", "//tensorflow/compiler/plugin/executor:plugin_lib", + # TODO: This dependency is a workaround for a linking error with clang. + # Without it, the linker complains about missing symbols from + # 'xla_device_launch_op'. This dependency should be propagated from + # plugin_lib instead, but no targets other than this break without it.
+ "//tensorflow/compiler/jit", ], ) diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index bd93e114b73..89b26b8916b 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -144,7 +144,7 @@ int RealMain(tensorflow::gtl::ArraySlice args, int main(int argc, char** argv) { // Flags - string fake_infeed_shape; + xla::string fake_infeed_shape; bool use_fake_data = false; const std::vector flag_list = { tensorflow::Flag("use_fake_data", &use_fake_data, diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 47a0f54a023..84fcc0d0149 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -28,6 +28,7 @@ py_library( "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py", "//tensorflow/contrib/framework:framework_py", "//tensorflow/contrib/fused_conv:fused_conv_py", + "//tensorflow/contrib/gan", "//tensorflow/contrib/graph_editor:graph_editor_py", "//tensorflow/contrib/grid_rnn:grid_rnn_py", "//tensorflow/contrib/hooks", @@ -72,6 +73,7 @@ py_library( "//tensorflow/contrib/staging", "//tensorflow/contrib/stat_summarizer:stat_summarizer_py", "//tensorflow/contrib/stateless", + "//tensorflow/contrib/summary:summary_ops", "//tensorflow/contrib/tensor_forest:init_py", "//tensorflow/contrib/tensorboard", "//tensorflow/contrib/testing:testing_py", diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index 315ea943cf3..d1d0e2823ad 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -31,6 +31,7 @@ from tensorflow.contrib import deprecated from tensorflow.contrib import distributions from tensorflow.contrib import factorization from tensorflow.contrib import framework +from tensorflow.contrib import gan from tensorflow.contrib import graph_editor from tensorflow.contrib import grid_rnn from tensorflow.contrib import image diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py index c377c50e9fe..a8b60460c8f 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py @@ -18,6 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections +import os + from tensorflow.contrib.boosted_trees.proto import tree_config_pb2 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch from tensorflow.contrib.decision_trees.proto import generic_tree_model_extensions_pb2 @@ -26,18 +29,21 @@ from tensorflow.contrib.learn.python.learn import export_strategy from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils from tensorflow.python.client import session as tf_session from tensorflow.python.framework import ops +from tensorflow.python.platform import gfile from tensorflow.python.saved_model import loader as saved_model_loader from tensorflow.python.saved_model import tag_constants -def make_custom_export_strategy(name, convert_fn, feature_columns, +def make_custom_export_strategy(name, + convert_fn, + feature_columns, export_input_fn): """Makes custom exporter of GTFlow tree format. Args: name: A string, for the name of the export strategy. convert_fn: A function that converts the tree proto to desired format and - saves it to the desired location. 
+ saves it to the desired location. Can be None to skip conversion. feature_columns: A list of feature columns. export_input_fn: A function that takes no arguments and returns an `InputFnOps`. @@ -68,9 +74,22 @@ def make_custom_export_strategy(name, convert_fn, feature_columns, dtec = tree_config_pb2.DecisionTreeEnsembleConfig() dtec.ParseFromString(dfec_str) # Export the result in the same folder as the saved model. - convert_fn(dtec, sorted_feature_names, len(dense_floats), - len(sparse_float_indices), len(sparse_int_indices), - result_dir, eval_result) + if convert_fn: + convert_fn(dtec, sorted_feature_names, + len(dense_floats), + len(sparse_float_indices), + len(sparse_int_indices), result_dir, eval_result) + feature_importances = _get_feature_importances( + dtec, sorted_feature_names, + len(dense_floats), + len(sparse_float_indices), len(sparse_int_indices)) + sorted_by_importance = sorted( + feature_importances.items(), key=lambda x: -x[1]) + assets_dir = os.path.join(result_dir, "assets.extra") + gfile.MakeDirs(assets_dir) + with gfile.GFile(os.path.join(assets_dir, "feature_importances"), + "w") as f: + f.write("\n".join("%s, %f" % (k, v) for k, v in sorted_by_importance)) return result_dir return export_strategy.ExportStrategy(name, export_fn) @@ -157,3 +176,41 @@ def convert_to_universal_format(dtec, sorted_feature_names, node.left_child_id.value = split.left_id node.right_child_id.value = split.right_id return model_and_features + + +def _get_feature_importances(dtec, feature_names, num_dense_floats, + num_sparse_float, num_sparse_int): + """Export the feature importance per feature column.""" + del num_sparse_int # Unused. + sums = collections.defaultdict(lambda: 0) + for tree_idx in range(len(dtec.trees)): + tree = dtec.trees[tree_idx] + for tree_node in tree.nodes: + node_type = tree_node.WhichOneof("node") + if node_type == "dense_float_binary_split": + split = tree_node.dense_float_binary_split + split_column = feature_names[split.feature_column] + elif node_type == "sparse_float_binary_split_default_left": + split = tree_node.sparse_float_binary_split_default_left.split + split_column = feature_names[split.feature_column + num_dense_floats] + elif node_type == "sparse_float_binary_split_default_right": + split = tree_node.sparse_float_binary_split_default_right.split + split_column = feature_names[split.feature_column + num_dense_floats] + elif node_type == "categorical_id_binary_split": + split = tree_node.categorical_id_binary_split + split_column = feature_names[split.feature_column + num_dense_floats + + num_sparse_float] + elif node_type == "categorical_id_set_membership_binary_split": + split = tree_node.categorical_id_set_membership_binary_split + split_column = feature_names[split.feature_column + num_dense_floats + + num_sparse_float] + elif node_type == "leaf": + assert tree_node.node_metadata.gain == 0 + continue + else: + raise ValueError("Unexpected split type %s", node_type) + # Apply shrinkage factor. It is important since it is not always uniform + # across different trees. 
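The comment above folds the per-tree shrinkage weight into the importance sum because shrinkage is not uniform across trees. As a toy illustration of that weighting (hypothetical numbers and column names, not taken from the library), two identical splits contribute different amounts once their trees' weights are applied:

    # Toy sketch: importance = sum over splits of (gain * weight of the tree
    # containing the split), mirroring the accumulation in
    # _get_feature_importances.
    import collections

    tree_weights = [1.0, 0.1]              # the second tree was shrunk more
    split_gains = [[("feature_a", 50.0)],  # (column, gain) pairs per tree
                   [("feature_a", 50.0)]]

    importances = collections.defaultdict(float)
    for tree_idx, splits in enumerate(split_gains):
      for column, gain in splits:
        importances[column] += gain * tree_weights[tree_idx]

    print(dict(importances))  # {'feature_a': 55.0}, not 100.0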
+ sums[split_column] += ( + tree_node.node_metadata.gain * dtec.tree_weights[tree_idx]) + return dict(sums) diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py index 8d801fa1f38..4ed18b2d34c 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py @@ -27,7 +27,7 @@ from tensorflow.python.platform import googletest class ConvertModelTest(test_util.TensorFlowTestCase): - def testConvertModel(self): + def _make_trees(self): dtec_str = """ trees { nodes { @@ -108,8 +108,12 @@ class ConvertModelTest(test_util.TensorFlowTestCase): """ dtec = tree_config_pb2.DecisionTreeEnsembleConfig() text_format.Merge(dtec_str, dtec) - # The feature columns in the order they were added. feature_columns = ["feature_b", "feature_a", "feature_d"] + return dtec, feature_columns + + def testConvertModel(self): + dtec, feature_columns = self._make_trees() + # The feature columns in the order they were added. out = custom_export_strategy.convert_to_universal_format( dtec, feature_columns, 1, 1, 1) @@ -273,6 +277,16 @@ class ConvertModelTest(test_util.TensorFlowTestCase): }""" self.assertProtoEquals(expected_tree, out) + def testFeatureImportance(self): + dtec, feature_columns = self._make_trees() + feature_importances = custom_export_strategy._get_feature_importances( + dtec, feature_columns, 1, 1, 1) + self.assertItemsEqual(["feature_b", "feature_a", "feature_d"], + feature_importances.keys()) + self.assertAlmostEqual(50.0, feature_importances["feature_b"], places=4) + self.assertAlmostEqual(50.0, feature_importances["feature_a"], places=4) + self.assertAlmostEqual(50.0, feature_importances["feature_d"], places=4) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py index e28adad53ec..f8028acbdb0 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py @@ -61,11 +61,19 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator): logits_modifier_function: A modifier function for the logits. center_bias: Whether a separate tree should be created for first fitting the bias. + + Raises: + ValueError: If learner_config is not valid. """ head = head_lib.multi_class_head( n_classes=n_classes, weight_column_name=weight_column_name, enable_centered_bias=False) + if learner_config.num_classes == 0: + learner_config.num_classes = n_classes + elif learner_config.num_classes != n_classes: + raise ValueError("n_classes (%d) doesn't match learner_config (%d)." 
% + (learner_config.num_classes, n_classes)) super(GradientBoostedDecisionTreeClassifier, self).__init__( model_fn=model.model_builder, params={ @@ -129,6 +137,10 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator): label_dimension=label_dimension, weight_column_name=weight_column_name, enable_centered_bias=False) + if label_dimension == 1: + learner_config.num_classes = 2 + else: + learner_config.num_classes = label_dimension super(GradientBoostedDecisionTreeRegressor, self).__init__( model_fn=model.model_builder, params={ diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py index 2d517f78111..8cda5c8f2b1 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py @@ -92,6 +92,7 @@ def model_builder(features, labels, mode, params, config): examples_per_layer=examples_per_layer, learner_config=learner_config, feature_columns=feature_columns, + logits_dimension=head.logits_dimension, features=features) with ops.name_scope("gbdt", "gbdt_optimizer"): predictions_dict = gbdt_model.predict(mode) diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc index 42112c586a5..f4ad99f779e 100644 --- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc @@ -74,7 +74,7 @@ class TreeEnsembleStampTokenOp : public OpKernel { decision_tree_ensemble_resource; OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &decision_tree_ensemble_resource)); - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); core::ScopedUnref unref_me(decision_tree_ensemble_resource); Tensor* output_stamp_token_t = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(), @@ -95,7 +95,7 @@ class TreeEnsembleSerializeOp : public OpKernel { decision_tree_ensemble_resource; OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &decision_tree_ensemble_resource)); - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); core::ScopedUnref unref_me(decision_tree_ensemble_resource); Tensor* output_stamp_token_t = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(), diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc index daca0495481..8ffd7f120b4 100644 --- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc @@ -143,7 +143,7 @@ class GradientTreesPredictionOp : public OpKernel { // Release the reference to the resource once we're done using it. core::ScopedUnref unref_me(decision_tree_ensemble_resource); if (use_locking_) { - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); DoCompute(context, decision_tree_ensemble_resource); } else { DoCompute(context, decision_tree_ensemble_resource); @@ -334,7 +334,7 @@ class GradientTreesPartitionExamplesOp : public OpKernel { // Release the reference to the resource once we're done using it. 
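The estimator change above also tightens the contract between the head and the learner config: GradientBoostedDecisionTreeClassifier now fills in learner_config.num_classes from n_classes when it is left at 0 and rejects a conflicting value. A standalone paraphrase of that check (a sketch only; _FakeConfig is a stand-in for a learner_pb2.LearnerConfig):

    def reconcile_num_classes(learner_config, n_classes):
      """Fill in or validate learner_config.num_classes against n_classes."""
      if learner_config.num_classes == 0:
        learner_config.num_classes = n_classes
      elif learner_config.num_classes != n_classes:
        raise ValueError("n_classes (%d) doesn't match learner_config (%d)." %
                         (learner_config.num_classes, n_classes))
      return learner_config

    class _FakeConfig(object):  # stand-in with just the field we touch
      num_classes = 0

    cfg = reconcile_num_classes(_FakeConfig(), n_classes=3)
    print(cfg.num_classes)  # 3
    # The regressor takes the other route in the diff: num_classes is derived
    # from label_dimension (2 when label_dimension == 1, else label_dimension).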
core::ScopedUnref unref_me(decision_tree_ensemble_resource); if (use_locking_) { - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); DoCompute(context, decision_tree_ensemble_resource); } else { DoCompute(context, decision_tree_ensemble_resource); diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc index 9e9ef1738cd..d528757cf99 100644 --- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc @@ -656,7 +656,8 @@ class GrowTreeEnsembleOp : public OpKernel { CHECK(split->split_info.split_node().node_case() != TreeNode::NODE_NOT_SET); CHECK(tree_config->nodes(node_id).node_case() == TreeNode::kLeaf) << "Unexpected node type to split " - << tree_config->nodes(node_id).node_case(); + << tree_config->nodes(node_id).node_case() << " for node_id " << node_id + << ". Tree config: " << tree_config->DebugString(); // Add left leaf. int32 left_id = tree_config->nodes_size(); @@ -767,7 +768,7 @@ class TreeEnsembleStatsOp : public OpKernel { OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0), &decision_tree_ensemble_resource)); core::ScopedUnref unref_me(decision_tree_ensemble_resource); - mutex_lock l(*decision_tree_ensemble_resource->get_mutex()); + tf_shared_lock l(*decision_tree_ensemble_resource->get_mutex()); // Get the stamp token. const Tensor* stamp_token_t; diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc index 82664aed72d..f4c7df7fabd 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc @@ -42,6 +42,7 @@ class BiasFeatureColumnHandlerTest : public ::testing::Test { example_partitions_({0, 0, 1, 3}) { // Set L2 regularization. learner_config_.mutable_regularization()->set_l2(2.0f); + learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); // Create handler. handler_.reset(new BiasFeatureColumnHandler(kClassId, kSlotId, kBatchSize)); diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc index abd72384648..ea82b3f086d 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc @@ -51,7 +51,7 @@ class CategoricalFeatureColumnHandlerTest : public ::testing::Test { values_(test::AsTensor({1, 2, 2, 0}, {4})) { // Set L2 regularization. learner_config_.mutable_regularization()->set_l2(2.0f); - + learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); // Create handler. 
handler_.reset(new CategoricalFeatureColumnHandler( kClassId, kSlotId, kBatchSize, kFeatureColumn, indices_.matrix(), diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc index 396f48e5321..1bc9d733ad3 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc @@ -51,7 +51,7 @@ class DenseQuantizedFeatureColumnHandlerTest : public ::testing::Test { dense_quantized_values_(test::AsTensor({1, 1, 0, 1}, {4})) { // Set L2 regularization. learner_config_.mutable_regularization()->set_l2(2.0f); - + learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); // Create handler. handler_.reset(new DenseQuantizedFeatureColumnHandler( kClassId, kSlotId, kBatchSize, kFeatureColumn, diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc index db8c64a617f..643d936ad23 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc @@ -53,7 +53,7 @@ class SparseQuantizedFeatureColumnHandlerTest : public ::testing::Test { sparse_quantized_values_(test::AsTensor({1, 0, 1}, {3})) { // Set L2 regularization. learner_config_.mutable_regularization()->set_l2(2.0f); - + learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); // Create handler. 
handler_.reset(new SparseQuantizedFeatureColumnHandler( kClassId, kSlotId, kBatchSize, kFeatureColumn, diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc index f99b6826a78..ecb7a04efb9 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc @@ -30,6 +30,7 @@ const double kDelta = 1e-5; TEST(NodeStatsTest, AlmostZero) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); NodeStats node_stats(learner_config, GradientStats(1e-8f, 1e-8f)); EXPECT_EQ(0, node_stats.weight_contribution[0]); EXPECT_EQ(0, node_stats.gain); @@ -37,6 +38,7 @@ TEST(NodeStatsTest, AlmostZero) { TEST(NodeStatsTest, LessThanMinWeightConstraint) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_constraints()->set_min_node_weight(3.2f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); EXPECT_EQ(0, node_stats.weight_contribution[0]); @@ -45,6 +47,7 @@ TEST(NodeStatsTest, LessThanMinWeightConstraint) { TEST(NodeStatsTest, L1RegSquashed) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l1(10.0f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); EXPECT_EQ(0, node_stats.weight_contribution[0]); @@ -53,6 +56,7 @@ TEST(NodeStatsTest, L1RegSquashed) { TEST(NodeStatsTest, L1RegPos) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l1(5.0f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); const float expected_clipped_grad = 7.32f - 5.0f; @@ -66,6 +70,7 @@ TEST(NodeStatsTest, L1RegPos) { TEST(NodeStatsTest, L1RegNeg) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l1(5.0f); NodeStats node_stats(learner_config, GradientStats(-7.32f, 1.63f)); const float expected_clipped_grad = -7.32f + 5.0f; @@ -79,6 +84,7 @@ TEST(NodeStatsTest, L1RegNeg) { TEST(NodeStatsTest, L2Reg) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l2(8.0f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); const float expected_denom = 1.63f + 8.0f; @@ -91,6 +97,7 @@ TEST(NodeStatsTest, L2Reg) { TEST(NodeStatsTest, L1L2Reg) { LearnerConfig learner_config; + learner_config.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS); learner_config.mutable_regularization()->set_l1(5.0f); learner_config.mutable_regularization()->set_l2(8.0f); NodeStats node_stats(learner_config, GradientStats(7.32f, 1.63f)); diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h index 314c44fddc5..dad3b4e10de 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h @@ -15,6 +15,7 @@ #ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_ #define 
THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_ +#include #include #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h" @@ -34,10 +35,27 @@ class WeightedQuantilesSummary { struct SummaryEntry { SummaryEntry(const ValueType& v, const WeightType& w, const WeightType& min, - const WeightType& max) - : value(v), weight(w), min_rank(min), max_rank(max) {} + const WeightType& max) { + // Explicitely initialize all of memory (including padding from memory + // alignment) to allow the struct to be msan-resistant "plain old data". + // + // POD = http://en.cppreference.com/w/cpp/concept/PODType + memset(this, 0, sizeof(*this)); - SummaryEntry() : value(0), weight(0), min_rank(0), max_rank(0) {} + value = v; + weight = w; + min_rank = min; + max_rank = max; + } + + SummaryEntry() { + memset(this, 0, sizeof(*this)); + + value = 0; + weight = 0; + min_rank = 0; + max_rank = 0; + } bool operator==(const SummaryEntry& other) const { return value == other.value && weight == other.weight && diff --git a/tensorflow/contrib/boosted_trees/proto/learner.proto b/tensorflow/contrib/boosted_trees/proto/learner.proto index 06ee223467b..919e7cd8142 100644 --- a/tensorflow/contrib/boosted_trees/proto/learner.proto +++ b/tensorflow/contrib/boosted_trees/proto/learner.proto @@ -17,7 +17,7 @@ message TreeRegularizationConfig { // Tree constraints config. message TreeConstraintsConfig { - // Maximum depth of the trees. + // Maximum depth of the trees. The default value is 6 if not specified. uint32 max_tree_depth = 1; // Min hessian weight per node. @@ -86,20 +86,22 @@ message LearningRateDropoutDrivenConfig { message LearnerConfig { enum PruningMode { - PRE_PRUNE = 0; - POST_PRUNE = 1; + PRUNING_MODE_UNSPECIFIED = 0; + PRE_PRUNE = 1; + POST_PRUNE = 2; } enum GrowingMode { - WHOLE_TREE = 0; - // Layer by layer is only supported by the batch learner. - LAYER_BY_LAYER = 1; + GROWING_MODE_UNSPECIFIED = 0; + WHOLE_TREE = 1; + LAYER_BY_LAYER = 2; } enum MultiClassStrategy { - TREE_PER_CLASS = 0; - FULL_HESSIAN = 1; - DIAGONAL_HESSIAN = 2; + MULTI_CLASS_STRATEGY_UNSPECIFIED = 0; + TREE_PER_CLASS = 1; + FULL_HESSIAN = 2; + DIAGONAL_HESSIAN = 3; } // Number of classes. @@ -118,16 +120,18 @@ message LearnerConfig { // Constraints. TreeConstraintsConfig constraints = 5; - // Pruning. + // Pruning. POST_PRUNE is the default pruning mode. PruningMode pruning_mode = 8; - // Growing Mode. + // Growing Mode. LAYER_BY_LAYER is the default growing mode. GrowingMode growing_mode = 9; - // Learning rate. + // Learning rate. By default we use fixed learning rate of 0.1. LearningRateConfig learning_rate_tuner = 6; - // Multi-class strategy. + // Multi-class strategy. By default we use TREE_PER_CLASS for binary + // classification and linear regression. For other cases, we use + // DIAGONAL_HESSIAN as the default. MultiClassStrategy multi_class_strategy = 10; // If you want to average the ensembles (for regularization), provide the diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py index 51e084b79c6..37595f1c75d 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py @@ -344,6 +344,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase): # Prepare learner config. 
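Because the enums above were renumbered so that 0 now means *_UNSPECIFIED rather than PRE_PRUNE / WHOLE_TREE / TREE_PER_CLASS, callers that relied on the old zero defaults (like the test below) now set these fields explicitly. A minimal sketch of building a config under the new scheme, assuming the usual contrib import path for learner_pb2; the values are illustrative:

    from tensorflow.contrib.boosted_trees.proto import learner_pb2

    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 2
    # 0 is now *_UNSPECIFIED, so name the old defaults explicitly if you want them.
    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
    learner_config.pruning_mode = learner_pb2.LearnerConfig.POST_PRUNE
    learner_config.multi_class_strategy = learner_pb2.LearnerConfig.TREE_PER_CLASS
    # Fields left unset fall back to the documented defaults, e.g.
    # constraints.max_tree_depth == 6 and a fixed learning rate of 0.1.
    print(learner_config)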
learner_config = learner_pb2.LearnerConfig() learner_config.num_classes = 2 + learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE result, result_no_dropout, dropout_info = ( prediction_ops.gradient_trees_prediction( diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 6f85874a33a..2d28e0a9f16 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -261,6 +261,7 @@ class GradientBoostedDecisionTreeModel(object): examples_per_layer, learner_config, features, + logits_dimension, feature_columns=None): """Construct a new GradientBoostedDecisionTreeModel function. @@ -273,8 +274,8 @@ class GradientBoostedDecisionTreeModel(object): a tree layer. It can also be a function that computes the number of examples based on the depth of the layer that's being built. learner_config: A learner config. - print split, sorted_feature_names[split.feature_column] features: `dict` of `Tensor` objects. + logits_dimension: An int, the dimension of logits. feature_columns: A list of feature columns. Raises: @@ -289,11 +290,39 @@ class GradientBoostedDecisionTreeModel(object): if learner_config.num_classes < 2: raise ValueError("Number of classes must be >=2") + self._logits_dimension = logits_dimension self._is_chief = is_chief self._num_ps_replicas = num_ps_replicas self._ensemble_handle = ensemble_handle self._center_bias = center_bias self._examples_per_layer = examples_per_layer + + # Fill in the defaults. + if (learner_config.multi_class_strategy == + learner_pb2.LearnerConfig.MULTI_CLASS_STRATEGY_UNSPECIFIED): + if logits_dimension == 1: + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.TREE_PER_CLASS) + else: + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.DIAGONAL_HESSIAN) + + if (learner_config.growing_mode == + learner_pb2.LearnerConfig.GROWING_MODE_UNSPECIFIED): + learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER + + if (learner_config.pruning_mode == + learner_pb2.LearnerConfig.PRUNING_MODE_UNSPECIFIED): + learner_config.pruning_mode = learner_pb2.LearnerConfig.POST_PRUNE + + if learner_config.constraints.max_tree_depth == 0: + # Use 6 as the default maximum depth. + learner_config.constraints.max_tree_depth = 6 + + tuner = learner_config.learning_rate_tuner.WhichOneof("tuner") + if not tuner: + learner_config.learning_rate_tuner.fixed.learning_rate = 0.1 + self._learner_config = learner_config self._feature_columns = feature_columns self._learner_config_serialized = learner_config.SerializeToString() @@ -378,75 +407,81 @@ class GradientBoostedDecisionTreeModel(object): local_stamp), _refresh_local_ensemble_fn, lambda: (control_flow_ops.no_op(), ensemble_stamp)) - # Once updated, Use the the local model for prediction. + # Once updated, use the local model for prediction. with ops.control_dependencies([refresh_local_ensemble]): ensemble_stats = training_ops.tree_ensemble_stats( local_ensemble_handle, ensemble_stamp) - apply_dropout, seed = _dropout_params(mode, ensemble_stats) # We don't need dropout info - we can always restore it based on the # seed. 
- predictions, predictions_no_dropout, _ = ( - prediction_ops.gradient_trees_prediction( - local_ensemble_handle, - seed, - self._dense_floats, - self._sparse_float_indices, - self._sparse_float_values, - self._sparse_float_shapes, - self._sparse_int_indices, - self._sparse_int_values, - self._sparse_int_shapes, - learner_config=self._learner_config_serialized, - apply_dropout=apply_dropout, - apply_averaging=apply_averaging, - use_locking=False, - center_bias=self._center_bias, - reduce_dim=self._reduce_dim)) - partition_ids = prediction_ops.gradient_trees_partition_examples( - local_ensemble_handle, - self._dense_floats, - self._sparse_float_indices, - self._sparse_float_values, - self._sparse_float_shapes, - self._sparse_int_indices, - self._sparse_int_values, - self._sparse_int_shapes, - use_locking=False) + apply_dropout, seed = _dropout_params(mode, ensemble_stats) + # Make sure ensemble stats run. This will check that the ensemble has + # the right stamp. + with ops.control_dependencies(ensemble_stats): + predictions, predictions_no_dropout, _ = ( + prediction_ops.gradient_trees_prediction( + local_ensemble_handle, + seed, + self._dense_floats, + self._sparse_float_indices, + self._sparse_float_values, + self._sparse_float_shapes, + self._sparse_int_indices, + self._sparse_int_values, + self._sparse_int_shapes, + learner_config=self._learner_config_serialized, + apply_dropout=apply_dropout, + apply_averaging=apply_averaging, + use_locking=True, + center_bias=self._center_bias, + reduce_dim=self._reduce_dim)) + partition_ids = prediction_ops.gradient_trees_partition_examples( + local_ensemble_handle, + self._dense_floats, + self._sparse_float_indices, + self._sparse_float_values, + self._sparse_float_shapes, + self._sparse_int_indices, + self._sparse_int_values, + self._sparse_int_shapes, + use_locking=True) else: with ops.device(self._ensemble_handle.device): ensemble_stats = training_ops.tree_ensemble_stats( self._ensemble_handle, ensemble_stamp) - apply_dropout, seed = _dropout_params(mode, ensemble_stats) # We don't need dropout info - we can always restore it based on the # seed. - predictions, predictions_no_dropout, _ = ( - prediction_ops.gradient_trees_prediction( - self._ensemble_handle, - seed, - self._dense_floats, - self._sparse_float_indices, - self._sparse_float_values, - self._sparse_float_shapes, - self._sparse_int_indices, - self._sparse_int_values, - self._sparse_int_shapes, - learner_config=self._learner_config_serialized, - apply_dropout=apply_dropout, - apply_averaging=apply_averaging, - use_locking=False, - center_bias=self._center_bias, - reduce_dim=self._reduce_dim)) - partition_ids = prediction_ops.gradient_trees_partition_examples( - self._ensemble_handle, - self._dense_floats, - self._sparse_float_indices, - self._sparse_float_values, - self._sparse_float_shapes, - self._sparse_int_indices, - self._sparse_int_values, - self._sparse_int_shapes, - use_locking=False) + apply_dropout, seed = _dropout_params(mode, ensemble_stats) + # Make sure ensemble stats run. This will check that the ensemble has + # the right stamp. 
+ with ops.control_dependencies(ensemble_stats): + predictions, predictions_no_dropout, _ = ( + prediction_ops.gradient_trees_prediction( + self._ensemble_handle, + seed, + self._dense_floats, + self._sparse_float_indices, + self._sparse_float_values, + self._sparse_float_shapes, + self._sparse_int_indices, + self._sparse_int_values, + self._sparse_int_shapes, + learner_config=self._learner_config_serialized, + apply_dropout=apply_dropout, + apply_averaging=apply_averaging, + use_locking=True, + center_bias=self._center_bias, + reduce_dim=self._reduce_dim)) + partition_ids = prediction_ops.gradient_trees_partition_examples( + self._ensemble_handle, + self._dense_floats, + self._sparse_float_indices, + self._sparse_float_values, + self._sparse_float_shapes, + self._sparse_int_indices, + self._sparse_int_values, + self._sparse_int_shapes, + use_locking=True) return _make_predictions_dict(ensemble_stamp, predictions, predictions_no_dropout, partition_ids, diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py index 9ce434edf8b..16e24d97dde 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py @@ -164,7 +164,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -268,7 +268,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=num_examples_fn, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -371,7 +371,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -442,7 +442,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -505,7 +505,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -588,7 +588,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=1, features=features) # Create predict op. 
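The new logits_dimension argument threaded through the tests above is what lets GradientBoostedDecisionTreeModel choose a multi-class strategy when the config leaves it unspecified. A compact paraphrase of the default-filling block added to gbdt_batch.py (a reference sketch collecting the scattered "+" lines, not a separate API):

    from tensorflow.contrib.boosted_trees.proto import learner_pb2

    def fill_learner_config_defaults(learner_config, logits_dimension):
      """Mirrors the defaults GradientBoostedDecisionTreeModel now fills in."""
      if (learner_config.multi_class_strategy ==
          learner_pb2.LearnerConfig.MULTI_CLASS_STRATEGY_UNSPECIFIED):
        # One logit keeps the classic tree-per-class behavior; wider logits
        # default to the diagonal-Hessian strategy.
        if logits_dimension == 1:
          learner_config.multi_class_strategy = (
              learner_pb2.LearnerConfig.TREE_PER_CLASS)
        else:
          learner_config.multi_class_strategy = (
              learner_pb2.LearnerConfig.DIAGONAL_HESSIAN)
      if (learner_config.growing_mode ==
          learner_pb2.LearnerConfig.GROWING_MODE_UNSPECIFIED):
        learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
      if (learner_config.pruning_mode ==
          learner_pb2.LearnerConfig.PRUNING_MODE_UNSPECIFIED):
        learner_config.pruning_mode = learner_pb2.LearnerConfig.POST_PRUNE
      if learner_config.constraints.max_tree_depth == 0:
        learner_config.constraints.max_tree_depth = 6
      if not learner_config.learning_rate_tuner.WhichOneof("tuner"):
        learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
      return learner_config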
mode = model_fn.ModeKeys.EVAL @@ -627,7 +627,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=5, features=features) predictions = array_ops.constant( [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0], @@ -730,7 +730,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=5, features=features) predictions = array_ops.constant( [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0], @@ -833,7 +833,7 @@ class GbdtTest(test_util.TensorFlowTestCase): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - features=features) + logits_dimension=5, features=features) batch_size = 3 predictions = array_ops.constant( diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 422df3063ee..c249a285562 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -33,6 +33,7 @@ option(tensorflow_BUILD_MORE_PYTHON_TESTS "Build more python unit tests for cont option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF) option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON) option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions") +option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON) if (NOT WIN32) # Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option @@ -204,6 +205,12 @@ if(tensorflow_ENABLE_JEMALLOC_SUPPORT) list(APPEND tensorflow_EXTERNAL_DEPENDENCIES jemalloc) include_directories(${jemalloc_INCLUDE_DIRS}) endif() +if(tensorflow_ENABLE_SNAPPY_SUPPORT) + include(snappy) + list(APPEND tensorflow_EXTERNAL_LIBRARIES ${snappy_STATIC_LIBRARIES}) + list(APPEND tensorflow_EXTERNAL_DEPENDENCIES snappy) + include_directories(${snappy_INCLUDE_DIR}) +endif() if(WIN32) list(APPEND tensorflow_EXTERNAL_LIBRARIES wsock32 ws2_32 shlwapi) endif() diff --git a/tensorflow/contrib/cmake/external/boringssl.cmake b/tensorflow/contrib/cmake/external/boringssl.cmake index 04a9664701c..dc27eadaca1 100644 --- a/tensorflow/contrib/cmake/external/boringssl.cmake +++ b/tensorflow/contrib/cmake/external/boringssl.cmake @@ -17,7 +17,7 @@ include (ExternalProject) set(boringssl_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/boringssl/src/boringssl/include) #set(boringssl_EXTRA_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/boringssl/src) set(boringssl_URL https://boringssl.googlesource.com/boringssl) -set(boringssl_TAG 17cf2cb1d226b0ba2401304242df7ddd3b6f1ff2) +set(boringssl_TAG ee7aa02) set(boringssl_BUILD ${CMAKE_BINARY_DIR}/boringssl/src/boringssl-build) #set(boringssl_LIBRARIES ${boringssl_BUILD}/obj/so/libboringssl.so) set(boringssl_STATIC_LIBRARIES diff --git a/tensorflow/contrib/cmake/external/cub.cmake b/tensorflow/contrib/cmake/external/cub.cmake index 477572d5881..d98579d2077 100644 --- a/tensorflow/contrib/cmake/external/cub.cmake +++ b/tensorflow/contrib/cmake/external/cub.cmake @@ -14,8 +14,8 @@ # ============================================================================== include (ExternalProject) -set(cub_URL http://mirror.bazel.build/github.com/NVlabs/cub/archive/69ceda618313df8e9cac6659d607b08949455d14.tar.gz) -set(cub_HASH SHA256=87e856522c283b8ea887c3b61d7d5b252d2dd74abac4f1d756d776e721223e82) +set(cub_URL 
http://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.3.zip) +set(cub_HASH SHA256=b7ead9e291d34ffa8074243541c1380d63be63f88de23de8ee548db573b72ebe) set(cub_BUILD ${CMAKE_CURRENT_BINARY_DIR}/cub/src/cub) set(cub_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub/src/cub) set(cub_ARCHIVE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/cub_archive) diff --git a/tensorflow/contrib/cmake/external/snappy.cmake b/tensorflow/contrib/cmake/external/snappy.cmake new file mode 100644 index 00000000000..a35d8654fb6 --- /dev/null +++ b/tensorflow/contrib/cmake/external/snappy.cmake @@ -0,0 +1,50 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +include (ExternalProject) + +set(snappy_URL https://github.com/google/snappy.git) +set(snappy_TAG "55924d11095df25ab25c405fadfe93d0a46f82eb") +set(snappy_BUILD ${CMAKE_CURRENT_BINARY_DIR}/snappy/src/snappy) +set(snappy_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/snappy/src/snappy) + +if(WIN32) + set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/$(Configuration)/snappy.lib) +else() + set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/libsnappy.a) +endif() + +set(snappy_HEADERS + "${snappy_INCLUDE_DIR}/snappy.h" +) + +ExternalProject_Add(snappy + PREFIX snappy + GIT_REPOSITORY ${snappy_URL} + GIT_TAG ${snappy_TAG} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "" + LOG_DOWNLOAD ON + LOG_CONFIGURE ON + LOG_BUILD ON + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DSNAPPY_BUILD_TESTS:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON +) + +# actually enables snappy in the source code +add_definitions(-DSNAPPY) \ No newline at end of file diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index 87d946c3462..c5a10181271 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -18,6 +18,7 @@ set(tf_c_srcs "${tensorflow_source_dir}/tensorflow/c/c_api.cc" "${tensorflow_source_dir}/tensorflow/c/c_api.h" + "${tensorflow_source_dir}/tensorflow/c/c_api_function.cc" "${tensorflow_source_dir}/tensorflow/c/eager/c_api.cc" "${tensorflow_source_dir}/tensorflow/c/eager/c_api.h" "${tensorflow_source_dir}/tensorflow/c/eager/runtime.cc" diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 48023099379..1b706159a3d 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -315,6 +315,7 @@ add_python_module("tensorflow/contrib/framework/ops") add_python_module("tensorflow/contrib/framework/python") add_python_module("tensorflow/contrib/framework/python/framework") add_python_module("tensorflow/contrib/framework/python/ops") +add_python_module("tensorflow/contrib/gan") add_python_module("tensorflow/contrib/graph_editor") add_python_module("tensorflow/contrib/graph_editor/examples") 
add_python_module("tensorflow/contrib/graph_editor/tests") diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake index 76531add312..eb02f20457e 100644 --- a/tensorflow/contrib/cmake/tf_tests.cmake +++ b/tensorflow/contrib/cmake/tf_tests.cmake @@ -240,6 +240,8 @@ if (tensorflow_BUILD_PYTHON_TESTS) "${tensorflow_source_dir}/tensorflow/python/training/quantize_training_test.py" # Needs quantization ops to be included in windows. "${tensorflow_source_dir}/tensorflow/python/training/supervisor_test.py" # Flaky I/O error on rename. "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py" # Needs portpicker. + "${tensorflow_source_dir}/tensorflow/python/training/server_lib_test.py" # Test occasionally deadlocks. + "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py" # depends on python/framework/test_ops # Broken tensorboard test due to cmake issues. "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py" @@ -291,6 +293,8 @@ if (tensorflow_BUILD_PYTHON_TESTS) # Failing with TF 1.3 (TODO) "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py" "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_test.py" + # Test should only be run manually + "${tensorflow_source_dir}/tensorflow/python/kernel_tests/reduction_ops_test_big.py" ) endif() list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude}) diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index 694bd507d97..bc4fd10cac6 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -716,6 +716,482 @@ _cudnn_rnn_common_doc_string = """ """ +def _check_direction(direction): + if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION): + raise ValueError("Invalid direction: %s, expect %s or %s" % + (direction, CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION)) + + +def _check_rnn_mode(rnn_mode): + if rnn_mode not in (CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, CUDNN_RNN_RELU): + raise ValueError("Invalid rnn_mode: %s, expect one of (%s, %s, %s, %s)" % + (rnn_mode, CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, + CUDNN_RNN_RELU)) + + +def _get_seed(seed): + seed, seed2 = random_seed.get_seed(seed) + if seed is None and seed2 is None: + seed, seed2 = 0, 0 + return seed, seed2 + + +def _get_num_params(rnn_mode, num_layers, direction): + """Return num params for given Cudnn config.""" + if rnn_mode == CUDNN_LSTM: + num_params_per_layer = 8 + elif rnn_mode == CUDNN_GRU: + num_params_per_layer = 6 + elif rnn_mode in (CUDNN_RNN_RELU, CUDNN_RNN_TANH): + num_params_per_layer = 2 + else: + raise ValueError("Invalid \'rnn_mode\': %s", rnn_mode) + num_params = num_layers * num_params_per_layer + if direction != CUDNN_RNN_UNIDIRECTION: + num_params *= 2 + return num_params + + +def _cudnn_rnn(inputs, + input_h, + input_c, + params, + is_training, + rnn_mode, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn RNN. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. + input_c: the initial hidden state for c. This is only relevant for LSTM. 
+ A Tensor of the same shape as input_h. + params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh'). + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. (standard RNN behavior). + 'skip_input' is only allowed when input_size == num_units; + 'auto_select' implies 'skip_input' when input_size == num_units; + otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + dropout: whether to enable dropout. With it is 0, dropout is disabled. + seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + for behavior. + name: name of the operation. + Returns: + outputs, output_h, output_c + """ + _check_rnn_mode(rnn_mode) + _check_direction(direction) + seed, seed2 = random_seed.get_seed(seed) + outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn( + input=inputs, + input_h=input_h, + input_c=input_c, + params=params, + is_training=is_training, + rnn_mode=rnn_mode, + input_mode=input_mode, + direction=direction, + dropout=dropout, + seed=seed, + seed2=seed2, + name=name) + return (outputs, output_h, output_c) + + +def cudnn_lstm(inputs, + input_h, + input_c, + params, + is_training, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn LSTM. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. + input_c: the initial hidden state for c. This is only relevant for LSTM. + A Tensor of the same shape as input_h. + params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. (standard RNN behavior). + 'skip_input' is only allowed when input_size == num_units; + 'auto_select' implies 'skip_input' when input_size == num_units; + otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + dropout: whether to enable dropout. With it is 0, dropout is disabled. + seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + for behavior. + name: name of the operation. + Returns: + outputs, output_h, output_c + """ + return _cudnn_rnn(inputs, input_h, input_c, params, is_training, CUDNN_LSTM, + input_mode, direction, dropout, seed, name) + + +def _cudnn_rnn_no_input_c(inputs, + input_h, + params, + is_training, + rnn_mode, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn RNN w/o input_c. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. 
+ params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh'). + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. (standard RNN behavior). + 'skip_input' is only allowed when input_size == num_units; + 'auto_select' implies 'skip_input' when input_size == num_units; + otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + dropout: whether to enable dropout. With it is 0, dropout is disabled. + seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + for behavior. + name: name of the operation. + Returns: + outputs, output_h + """ + input_c = array_ops.constant([], dtype=input_h.dtype) + outputs, output_h, _ = _cudnn_rnn(inputs, input_h, input_c, params, + is_training, rnn_mode, input_mode, + direction, dropout, seed, name) + return outputs, output_h + + +def cudnn_gru(inputs, + input_h, + params, + is_training, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn GRU. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. + params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. (standard RNN behavior). + 'skip_input' is only allowed when input_size == num_units; + 'auto_select' implies 'skip_input' when input_size == num_units; + otherwise, it implies 'linear_input'. + direction: the direction model that the model operates. Could be either + 'unidirectional' or 'bidirectional' + dropout: whether to enable dropout. With it is 0, dropout is disabled. + seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + for behavior. + name: name of the operation. + Returns: + outputs, output_h + """ + return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training, CUDNN_GRU, + input_mode, direction, dropout, seed, name) + + +def cudnn_rnn_relu(inputs, + input_h, + params, + is_training, + input_mode=CUDNN_INPUT_LINEAR_MODE, + direction=CUDNN_RNN_UNIDIRECTION, + dropout=0., + seed=0, + name=None): + """Cudnn RNN Relu. + + Args: + inputs: the input sequence to the RNN model. A Tensor of shape [?, + batch_size, input_size]. + input_h: the initial hidden state for h. A Tensor of shape [num_layers, + batch_size, num_units]. + params: the parameter buffer created for this model. + is_training: whether this operation will be used in training or inference + input_mode: indicate whether there is a linear projection between the + input and the actual computation before the first layer. It could be + 'linear_input', 'skip_input' or 'auto_select'. + 'linear_input' (default) always applies a linear projection of input + onto RNN hidden state. 
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    outputs, output_h
+  """
+  return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
+                               CUDNN_RNN_RELU, input_mode, direction, dropout,
+                               seed, name)
+
+
+def cudnn_rnn_tanh(inputs,
+                   input_h,
+                   params,
+                   is_training,
+                   input_mode=CUDNN_INPUT_LINEAR_MODE,
+                   direction=CUDNN_RNN_UNIDIRECTION,
+                   dropout=0.,
+                   seed=0,
+                   name=None):
+  """Cudnn RNN Tanh.
+
+  Args:
+    inputs: the input sequence to the RNN model. A Tensor of shape [?,
+      batch_size, input_size].
+    input_h: the initial hidden state for h. A Tensor of shape [num_layers,
+      batch_size, num_units].
+    params: the parameter buffer created for this model.
+    is_training: whether this operation will be used in training or inference.
+    input_mode: indicates whether there is a linear projection between the
+      input and the actual computation before the first layer. It can be
+      'linear_input', 'skip_input' or 'auto_select'.
+      'linear_input' (default) always applies a linear projection of the input
+      onto the RNN hidden state (standard RNN behavior).
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    outputs, output_h
+  """
+  return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
+                               CUDNN_RNN_TANH, input_mode, direction, dropout,
+                               seed, name)
+
+
+def cudnn_rnn_params_to_canonical(rnn_mode,
+                                  num_layers,
+                                  num_units,
+                                  input_size,
+                                  params,
+                                  input_mode=CUDNN_INPUT_LINEAR_MODE,
+                                  direction=CUDNN_RNN_UNIDIRECTION,
+                                  dropout=0,
+                                  seed=0,
+                                  name=None):
+  """Convert cudnn opaque params to canonical.
+
+  Args:
+    rnn_mode: a string that specifies the mode under which this RNN model runs.
+      Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
+    num_layers: the number of layers for the RNN model.
+    num_units: the number of units within the RNN model.
+    input_size: the size of the input; it can be different from num_units.
+    params: opaque cudnn params var.
+    input_mode: indicates whether there is a linear projection between the
+      input and the actual computation before the first layer. It can be
+      'linear_input', 'skip_input' or 'auto_select'.
+      'linear_input' (default) always applies a linear projection of the input
+      onto the RNN hidden state (standard RNN behavior).
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    weights list and bias list
+  Raises:
+    ValueError: if rnn_mode or direction is invalid.
+  """
+
+  _check_rnn_mode(rnn_mode)
+  _check_direction(direction)
+  num_params = _get_num_params(rnn_mode, num_layers, direction)
+  seed, seed2 = random_seed.get_seed(seed)
+  weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical(
+      rnn_mode=rnn_mode,
+      num_layers=num_layers,
+      num_units=num_units,
+      input_size=input_size,
+      params=params,
+      input_mode=input_mode,
+      direction=direction,
+      dropout=dropout,
+      seed=seed,
+      seed2=seed2,
+      num_params=num_params,
+      name=name)
+  return weights, biases
+
+
+def cudnn_rnn_canonical_to_params(rnn_mode,
+                                  num_layers,
+                                  num_units,
+                                  input_size,
+                                  weights,
+                                  biases,
+                                  input_mode=CUDNN_INPUT_LINEAR_MODE,
+                                  direction=CUDNN_RNN_UNIDIRECTION,
+                                  dropout=0,
+                                  seed=0,
+                                  name=None):
+  """Converts params from the canonical format to a specific format of cuDNN.
+
+  Args:
+    rnn_mode: a string that specifies the mode under which this RNN model runs.
+      Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
+    num_layers: the number of layers for the RNN model.
+    num_units: the number of units within the RNN model.
+    input_size: the size of the input; it can be different from num_units.
+    weights: a Tensor for weight parameters.
+    biases: a Tensor for bias parameters.
+    input_mode: indicates whether there is a linear projection between the
+      input and the actual computation before the first layer. It can be
+      'linear_input', 'skip_input' or 'auto_select'.
+      'linear_input' (default) always applies a linear projection of the input
+      onto the RNN hidden state (standard RNN behavior).
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    an opaque Cudnn param.
+  Raises:
+    ValueError: if rnn_mode or direction is invalid.
+  """
+  _check_rnn_mode(rnn_mode)
+  _check_direction(direction)
+  seed, seed2 = random_seed.get_seed(seed)
+  return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params(
+      rnn_mode=rnn_mode,
+      num_layers=num_layers,
+      num_units=num_units,
+      input_size=input_size,
+      weights=weights,
+      biases=biases,
+      input_mode=input_mode,
+      direction=direction,
+      dropout=dropout,
+      seed=seed,
+      seed2=seed2,
+      name=name)
+
+
+def cudnn_opaque_params_size(rnn_mode,
+                             num_layers,
+                             num_units,
+                             input_size,
+                             input_mode=CUDNN_INPUT_LINEAR_MODE,
+                             direction=CUDNN_RNN_UNIDIRECTION,
+                             dtype=dtypes.float32,
+                             dropout=0,
+                             seed=0,
+                             name=None):
+  """Returns opaque params size for specific Cudnn config.
+
+  Args:
+    rnn_mode: a string that specifies the mode under which this RNN model runs.
+      Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
+    num_layers: the number of layers for the RNN model.
+    num_units: the number of units within the RNN model.
+    input_size: the size of the input; it can be different from num_units.
+    input_mode: indicates whether there is a linear projection between the
+      input and the actual computation before the first layer. It can be
+      'linear_input', 'skip_input' or 'auto_select'.
+      'linear_input' (default) always applies a linear projection of the input
+      onto the RNN hidden state (standard RNN behavior).
+      'skip_input' is only allowed when input_size == num_units;
+      'auto_select' implies 'skip_input' when input_size == num_units;
+      otherwise, it implies 'linear_input'.
+    direction: the direction in which the model operates. Can be either
+      'unidirectional' or 'bidirectional'.
+    dtype: one of tf.float32 or tf.float64.
+    dropout: the dropout probability. When it is 0, dropout is disabled.
+    seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+      for behavior.
+    name: name of the operation.
+  Returns:
+    an int, the size of the Cudnn opaque params.
+  Raises:
+    ValueError: if rnn_mode or direction is invalid.
+  """
+  _check_rnn_mode(rnn_mode)
+  _check_direction(direction)
+  seed, seed2 = random_seed.get_seed(seed)
+  return gen_cudnn_rnn_ops.cudnn_rnn_params_size(
+      rnn_mode=rnn_mode,
+      num_layers=num_layers,
+      num_units=num_units,
+      input_size=input_size,
+      T=dtype,
+      S=dtypes.int32,
+      dropout=dropout,
+      seed=seed,
+      seed2=seed2,
+      input_mode=input_mode,
+      direction=direction,
+      name=name)[0]
+
+
 class _CudnnRNN(object):
   """Creates an RNN model using the underlying Cudnn implementation.
@@ -761,9 +1237,6 @@ class _CudnnRNN(object):
     Raises:
       ValueError: if direction is invalid.
     """
-    if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION):
-      raise ValueError("Invalid direction: %s, expect %s or %s",
-                       direction, CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION)
     self._num_layers = num_layers
     self._num_units = num_units
     self._input_size = input_size
@@ -772,10 +1245,7 @@ class _CudnnRNN(object):
     self._direction = direction
     self._dtype = dtype
     self._dropout = dropout
-    # get graph and op seed.
-    self._seed, self._seed2 = random_seed.get_seed(seed)
-    if self._seed is None and self._seed2 is None:
-      self._seed, self._seed2 = 0, 0
+    self._seed = seed
 
   @property
   def input_mode(self):
@@ -807,18 +1277,16 @@ class _CudnnRNN(object):
     Returns:
       The calculated parameter buffer size.
     """
-    return gen_cudnn_rnn_ops.cudnn_rnn_params_size(
+    return cudnn_opaque_params_size(
+        rnn_mode=self._rnn_mode,
         num_layers=self._num_layers,
         num_units=self._num_units,
         input_size=self._input_size,
-        T=self._dtype,
-        S=dtypes.int32,
+        dtype=self._dtype,
         dropout=self._dropout,
         seed=self._seed,
-        seed2=self._seed2,
-        rnn_mode=self._rnn_mode,
         input_mode=self._input_mode,
-        direction=self._direction)[0]
+        direction=self._direction)
 
   def __call__(self, input_data, input_h, input_c, params, is_training=True):
     """Runs the forward step for the RNN model.
@@ -837,22 +1305,17 @@ class _CudnnRNN(object):
       output_h: the final state for h.
       output_c: the final state for c. This is only relevant for LSTM.
     """
-    if self._rnn_mode != CUDNN_LSTM:
-      # For model that doesn't take input_c, replace with a dummy tensor.
-      input_c = array_ops.constant([], dtype=self._dtype)
-    output, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
-        input=input_data,
-        input_h=input_h,
-        input_c=input_c,
-        params=params,
-        rnn_mode=self._rnn_mode,
+    return _cudnn_rnn(
+        input_data,
+        input_h,
+        input_c,
+        params,
+        is_training,
+        self._rnn_mode,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
-        seed=self._seed,
-        seed2=self._seed2,
-        is_training=is_training)
-    return (output, output_h, output_c)
+        seed=self._seed)
 
   def params_to_canonical(self, params):
     """Converts params from a specific format of cuDNN to the canonical format.
@@ -863,22 +1326,16 @@ class _CudnnRNN(object):
 
     Returns:
       A function for the specific-to-canonical conversion.
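+
+    For example (a minimal sketch; `model` is an illustrative `CudnnLSTM`
+    instance and `params` an opaque parameter buffer of size
+    `model.params_size()`):
+
+      weights, biases = model.params_to_canonical(params)
+      params_restored = model.canonical_to_params(weights, biases)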
""" - num_params = self._num_layers * self._NUM_PARAMS_PER_LAYER - if self._direction != CUDNN_RNN_UNIDIRECTION: - num_params *= 2 - weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( + return cudnn_rnn_params_to_canonical( + rnn_mode=self._rnn_mode, num_layers=self._num_layers, num_units=self._num_units, input_size=self._input_size, params=params, - dropout=self._dropout, - seed=self._seed, - seed2=self._seed2, - num_params=num_params, - rnn_mode=self._rnn_mode, input_mode=self._input_mode, - direction=self._direction) - return weights, biases + direction=self._direction, + dropout=self._dropout, + seed=self._seed) def canonical_to_params(self, weights, biases): """Converts params from the canonical format to a specific format of cuDNN. @@ -890,18 +1347,17 @@ class _CudnnRNN(object): Returns: A function for the canonical-to-params-to-specific conversion.. """ - return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( + return cudnn_rnn_canonical_to_params( + rnn_mode=self._rnn_mode, num_layers=self._num_layers, num_units=self._num_units, input_size=self._input_size, weights=weights, biases=biases, - dropout=self._dropout, - seed=self._seed, - seed2=self._seed2, - rnn_mode=self._rnn_mode, input_mode=self._input_mode, - direction=self._direction) + direction=self._direction, + dropout=self._dropout, + seed=self._seed) class CudnnLSTM(_CudnnRNN): @@ -1036,9 +1492,16 @@ class _CudnnRNNNoInputC(_CudnnRNN): output: the output sequuence. output_h: the final state for h. """ - output, output_h, _ = super(_CudnnRNNNoInputC, self).__call__( - input_data, input_h, None, params, is_training=is_training) - return (output, output_h) + return _cudnn_rnn_no_input_c( + input_data, + input_h, + params, + is_training, + self._rnn_mode, + input_mode=self._input_mode, + direction=self._direction, + dropout=self._dropout, + seed=self._seed) class CudnnGRU(_CudnnRNNNoInputC): diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py index 5308ab64ace..1c0a5288f7e 100644 --- a/tensorflow/contrib/data/__init__.py +++ b/tensorflow/contrib/data/__init__.py @@ -22,6 +22,7 @@ @@read_batch_features @@rejection_resample +@@group_by_window """ from __future__ import absolute_import @@ -31,6 +32,7 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.contrib.data.python.ops.dataset_ops import Dataset from tensorflow.contrib.data.python.ops.dataset_ops import FixedLengthRecordDataset +from tensorflow.contrib.data.python.ops.dataset_ops import group_by_window from tensorflow.contrib.data.python.ops.dataset_ops import Iterator from tensorflow.contrib.data.python.ops.dataset_ops import read_batch_features from tensorflow.contrib.data.python.ops.dataset_ops import rejection_resample diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py index 71df1ee0a50..0111aae1035 100644 --- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py @@ -37,7 +37,9 @@ class GroupByWindowTest(test.TestCase): components = np.random.randint(100, size=(200,)).astype(np.int64) iterator = dataset_ops.Iterator.from_dataset( dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x) - .group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4)) + .apply( + dataset_ops.group_by_window, + args=(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))) init_op = iterator.initializer get_next = 
iterator.get_next() @@ -61,8 +63,9 @@ class GroupByWindowTest(test.TestCase): components = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64) iterator = dataset_ops.Iterator.from_dataset( - dataset_ops.Dataset.from_tensor_slices(components).repeat(-1) - .group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4), 4)) + dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply( + dataset_ops.group_by_window, + args=(lambda x: x % 3, lambda _, xs: xs.batch(4), 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -81,8 +84,9 @@ class GroupByWindowTest(test.TestCase): def testSmallGroups(self): components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64) iterator = dataset_ops.Iterator.from_dataset( - dataset_ops.Dataset.from_tensor_slices(components) - .group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4)) + dataset_ops.Dataset.from_tensor_slices(components).apply( + dataset_ops.group_by_window, + args=(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -108,8 +112,9 @@ class GroupByWindowTest(test.TestCase): iterator = dataset_ops.Iterator.from_dataset( dataset_ops.Dataset.from_tensor_slices(components) - .map(lambda x: (x, ops.convert_to_tensor([x * x]))) - .group_by_window(lambda x, _: x % 2, reduce_func, 32)) + .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply( + dataset_ops.group_by_window, + args=(lambda x, _: x % 2, reduce_func, 32))) init_op = iterator.initializer get_next = iterator.get_next() @@ -124,17 +129,20 @@ class GroupByWindowTest(test.TestCase): def reduce_func(key, window): # Apply two different kinds of padding to the input: tight # padding, and quantized (to a multiple of 10) padding. - return dataset_ops.Dataset.zip((window.padded_batch( - 4, - padded_shapes=tensor_shape.TensorShape([None])), window.padded_batch( + return dataset_ops.Dataset.zip(( + window.padded_batch( + 4, padded_shapes=tensor_shape.TensorShape([None])), + window.padded_batch( 4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),)) iterator = dataset_ops.Iterator.from_dataset( dataset_ops.Dataset.from_tensor_slices(components) .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x)) - .group_by_window( - lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64), - reduce_func, 4)) + .apply( + dataset_ops.group_by_window, + args= + (lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64), + reduce_func, 4))) init_op = iterator.initializer get_next = iterator.get_next() @@ -151,10 +159,9 @@ class GroupByWindowTest(test.TestCase): self.assertEqual(len(components), sum(counts)) -# NOTE(mrry): These tests are based on the tests in -# bucket_ops_test.py. Currently, different batch sizes for each key -# are not supported, although this would be possible to add to -# `Dataset.group_by_window()`. +# NOTE(mrry): These tests are based on the tests in bucket_ops_test.py. +# Currently, they use a constant batch size, though should be made to use a +# different batch size per key. 
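+#
+# For example, the windowed "group-by" used throughout this file now reads
+# (a minimal sketch; `components` is an illustrative NumPy array):
+#
+#   dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
+#       dataset_ops.group_by_window,
+#       args=(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))
+#
+# A per-key window size can be used instead of the constant `4` by passing
+# `None` for `window_size` and a `window_size_func` as a fourth element of
+# `args` (see `testDynamicWindowSize` below).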
class BucketTest(test.TestCase): def _dynamicPad(self, bucket, window, window_size): @@ -168,6 +175,7 @@ class BucketTest(test.TestCase): tensor_shape.TensorShape([3]))))) def testSingleBucket(self): + def _map_fn(v): return (v, array_ops.fill([v], v), array_ops.fill([3], string_ops.as_string(v))) @@ -175,9 +183,10 @@ class BucketTest(test.TestCase): input_dataset = ( dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn)) - bucketed_dataset = input_dataset.group_by_window( - lambda x, y, z: 0, lambda k, bucket: self._dynamicPad(k, bucket, 32), - 32) + bucketed_dataset = input_dataset.apply( + dataset_ops.group_by_window, + args=(lambda x, y, z: 0, + lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)) iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset) init_op = iterator.initializer @@ -201,6 +210,7 @@ class BucketTest(test.TestCase): self.assertAllEqual(expected_vec3_str, bucketed_values[2]) def testEvenOddBuckets(self): + def _map_fn(v): return (v, array_ops.fill([v], v), array_ops.fill([3], string_ops.as_string(v))) @@ -208,9 +218,10 @@ class BucketTest(test.TestCase): input_dataset = ( dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn)) - bucketed_dataset = input_dataset.group_by_window( - lambda x, y, z: math_ops.cast(x % 2, dtypes.int64), - lambda k, bucket: self._dynamicPad(k, bucket, 32), 32) + bucketed_dataset = input_dataset.apply( + dataset_ops.group_by_window, + args=(lambda x, y, z: math_ops.cast(x % 2, dtypes.int64), + lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)) iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset) init_op = iterator.initializer @@ -256,25 +267,31 @@ class BucketTest(test.TestCase): self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2]) def testEvenOddBucketsFilterOutAllOdd(self): + def _map_fn(v): - return {"x": v, - "y": array_ops.fill([v], v), - "z": array_ops.fill([3], string_ops.as_string(v))} + return { + "x": v, + "y": array_ops.fill([v], v), + "z": array_ops.fill([3], string_ops.as_string(v)) + } def _dynamic_pad_fn(bucket, window, _): return dataset_ops.Dataset.zip( (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch( - 32, {"x": tensor_shape.TensorShape([]), - "y": tensor_shape.TensorShape([None]), - "z": tensor_shape.TensorShape([3])}))) + 32, { + "x": tensor_shape.TensorShape([]), + "y": tensor_shape.TensorShape([None]), + "z": tensor_shape.TensorShape([3]) + }))) input_dataset = ( dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn) .filter(lambda d: math_ops.equal(d["x"] % 2, 0))) - bucketed_dataset = input_dataset.group_by_window( - lambda d: math_ops.cast(d["x"] % 2, dtypes.int64), - lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32) + bucketed_dataset = input_dataset.apply( + dataset_ops.group_by_window, + args=(lambda d: math_ops.cast(d["x"] % 2, dtypes.int64), + lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32)) iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset) init_op = iterator.initializer @@ -295,6 +312,40 @@ class BucketTest(test.TestCase): self.assertAllEqual( np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"]) + def testDynamicWindowSize(self): + components = np.arange(100).astype(np.int64) + + # Key fn: even/odd + # Reduce fn: batches of 5 + # Window size fn: even=5, odd=10 + + def window_size_func(key): + window_sizes = constant_op.constant([5, 10], dtype=dtypes.int64) + return window_sizes[key] + + dataset = dataset_ops.Dataset.from_tensor_slices(components).apply( + 
dataset_ops.group_by_window, + args=(lambda x: x % 2, lambda _, xs: xs.batch(20), None, + window_size_func)) + iterator = dataset_ops.Iterator.from_dataset(dataset) + init_op = iterator.initializer + get_next = iterator.get_next() + + with self.test_session() as sess: + sess.run(init_op) + with self.assertRaises(errors.OutOfRangeError): + batches = 0 + while True: + result = sess.run(get_next) + is_even = all(x % 2 == 0 for x in result) + is_odd = all(x % 2 == 1 for x in result) + self.assertTrue(is_even or is_odd) + expected_batch_size = 5 if is_even else 10 + self.assertEqual(expected_batch_size, result.shape[0]) + batches += 1 + + self.assertEqual(batches, 15) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py index 97b4ec44fc1..d05fbb7d285 100644 --- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import os import threading +from collections import namedtuple import numpy as np @@ -481,6 +482,40 @@ class MapDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) + def testMapNamedtuple(self, count=10): + # construct dataset of tuples + labels = dataset_ops.Dataset.range(count) + images = labels.map(lambda l: -l) + dataset_tuple = dataset_ops.Dataset.zip((labels, images)) + + # convert dataset of tuples to dataset of namedtuples + Example = namedtuple("Example", ["label", "image"]) + dataset_namedtuple = dataset_tuple.map(Example) + + def preprocess_tuple(label, image): + image = 2 * image + return label, image + + def preprocess_namedtuple(example): + return example._replace(image=2 * example.image) + + # preprocess both datasets + dataset_tuple = dataset_tuple.map(preprocess_tuple) + dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple) + + next_tuple = dataset_tuple.make_one_shot_iterator().get_next() + next_namedtuple = dataset_namedtuple.make_one_shot_iterator().get_next() + + # make sure both datasets contain the same data + with self.test_session() as sess: + for i in range(count): + tuple_, namedtuple_ = sess.run([next_tuple, next_namedtuple]) + self.assertEqual(tuple_, namedtuple_) + self.assertEqual(tuple_, (i, -2 * i)) + + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_namedtuple) + def testUseStepContainerInMap(self): row = np.arange(6) iterator = ( diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py index 46af2b19494..0ee9acfc97f 100644 --- a/tensorflow/contrib/data/python/ops/dataset_ops.py +++ b/tensorflow/contrib/data/python/ops/dataset_ops.py @@ -1199,28 +1199,9 @@ class Dataset(object): return DenseToSparseBatchDataset(self, batch_size, row_shape) def group_by_window(self, key_func, reduce_func, window_size): - """Performs a windowed "group-by" operation on this dataset. - - This method maps each consecutive element in this dataset to a key - using `key_func` and groups the elements by key. It then applies - `reduce_func` to at most `window_size` elements matching the same - key. All execpt the final window for each key will contain - `window_size` elements; the final window may be smaller. 
- - Args: - key_func: A function mapping a nested structure of tensors - (having shapes and types defined by `self.output_shapes` and - `self.output_types`) to a scalar `tf.int64` tensor. - reduce_func: A function mapping a key and a dataset of up to `batch_size` - consecutive elements matching that key to another dataset. - window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of - consecutive elements matching the same key to combine in a single - batch, which will be passed to `reduce_func`. - - Returns: - A `Dataset`. - """ - return GroupByWindowDataset(self, key_func, reduce_func, window_size) + """See group_by_window().""" + return self.apply( + group_by_window, args=(key_func, reduce_func, window_size)) def map(self, map_func, @@ -1370,6 +1351,43 @@ class Dataset(object): """ return FilterDataset(self, predicate) + def apply(self, fn, args=(), kwargs={}): # pylint: disable=dangerous-default-value + """Apply a function to this dataset. + + `apply` enables chaining of custom `Dataset` transformations. + + For example: + + ``` + dataset.map( + lambda x: x**2 + ).apply( + group_by_window, args=(key_func, reduce_func, window_size) + ).map( + lambda x: x**3 + ) + ``` + + Args: + fn: A function that takes a `Dataset`, `args`, and `kwargs`, and + returns a `Dataset`. + args: A `tuple` or `list` of arguments to be passed to `fn`. + kwargs: A `dict` of keyword arguments to be passed to `fn`. + + Returns: + The `Dataset` returned by `fn`. + """ + if not (isinstance(args, tuple) or isinstance(args, list)): + raise TypeError("args must be a tuple or list.") + if not isinstance(kwargs, dict): + raise TypeError("kwargs must be a dict.") + + dataset = fn(self, *args, **kwargs) + + if not isinstance(dataset, Dataset): + raise TypeError("fn must return a Dataset.") + return dataset + class TensorDataset(Dataset): """A `Dataset` with a single element, viz. a nested structure of tensors.""" @@ -1903,7 +1921,7 @@ class DenseToSparseBatchDataset(Dataset): def _should_unpack_args(args): """Returns `True` if `args` should be `*args` when passed to a callable.""" - return nest.is_sequence(args) and not isinstance(args, dict) + return type(args) is tuple # pylint: disable=unidiomatic-typecheck class _ResourceDataset(Dataset): @@ -1927,71 +1945,6 @@ class _ResourceDataset(Dataset): return self._output_types -class GroupByWindowDataset(Dataset): - """A `Dataset` that groups its input and performs a windowed reduction.""" - - def __init__(self, input_dataset, key_func, reduce_func, window_size): - """See `Dataset.group_by_window()` for details.""" - super(GroupByWindowDataset, self).__init__() - self._input_dataset = input_dataset - self._window_size = window_size - - @function.Defun(*nest.flatten(input_dataset.output_types)) - def tf_key_func(*args): - """A wrapper for Defun that facilitates shape inference.""" - # Pass in shape information from the input_dataset. 
- for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)): - arg.set_shape(shape) - nested_args = nest.pack_sequence_as(input_dataset.output_types, args) - if _should_unpack_args(nested_args): - ret = key_func(*nested_args) - else: - ret = key_func(nested_args) - ret = ops.convert_to_tensor(ret, dtype=dtypes.int64) - if ret.dtype != dtypes.int64: - raise ValueError("`key_func` must return a single tf.int64 tensor.") - return ret - - self._key_func = tf_key_func - self._key_func.add_to_graph(ops.get_default_graph()) - - @function.Defun(dtypes.int64, dtypes.resource) - def tf_reduce_func(key, window_dataset_resource): - """A wrapper for Defun that facilitates shape inference.""" - key.set_shape([]) - window_dataset = _ResourceDataset(window_dataset_resource, - input_dataset.output_types, - input_dataset.output_shapes) - output_dataset = reduce_func(key, window_dataset) - if not isinstance(output_dataset, Dataset): - raise TypeError("`reduce_func` must return a `Dataset` object.") - self._output_types = output_dataset.output_types - self._output_shapes = output_dataset.output_shapes - return output_dataset.make_dataset_resource() - - self._reduce_func = tf_reduce_func - self._reduce_func.add_to_graph(ops.get_default_graph()) - - def make_dataset_resource(self): - return gen_dataset_ops.group_by_window_dataset( - self._input_dataset.make_dataset_resource(), - self._key_func.captured_inputs, - self._reduce_func.captured_inputs, - self._window_size, - key_func=self._key_func, - reduce_func=self._reduce_func, - output_types=nest.flatten(self.output_types), - output_shapes=nest.flatten(self.output_shapes)) - - @property - def output_shapes(self): - return self._output_shapes - - @property - def output_types(self): - return self._output_types - - class MapDataset(Dataset): """A `Dataset` that maps a function over elements in its input.""" @@ -2151,7 +2104,7 @@ class InterleaveDataset(Dataset): nested_args = nest.pack_sequence_as(input_dataset.output_types, args) - if nest.is_sequence(nested_args): + if _should_unpack_args(nested_args): dataset = map_func(*nested_args) else: dataset = map_func(nested_args) @@ -2460,7 +2413,7 @@ def rejection_resample(dataset, shapes and types defined by `dataset.output_shapes` and `dataset.output_types`) to a scalar `tf.int32` tensor. Values should be in `[0, num_classes)`. - target_dist: A floating point type tensor, shaped `[num_classes]. + target_dist: A floating point type tensor, shaped `[num_classes]`. initial_dist: (Optional.) A floating point type tensor, shaped `[num_classes]`. If not provided, the true class distribution is estimated live in a streaming fashion. 
@@ -2660,3 +2613,149 @@ def _get_file_names(file_pattern, randomize_input):
   if not randomize_input:
     file_names = sorted(file_names)
   return file_names
+
+
+class GroupByWindowDataset(Dataset):
+  """A `Dataset` that groups its input and performs a windowed reduction."""
+
+  def __init__(self, input_dataset, key_func, reduce_func, window_size_func):
+    """See `group_by_window()` for details."""
+    super(GroupByWindowDataset, self).__init__()
+
+    self._input_dataset = input_dataset
+
+    self._make_key_func(key_func, input_dataset)
+    self._make_reduce_func(reduce_func, input_dataset)
+    self._make_window_size_func(window_size_func)
+
+  def _make_window_size_func(self, window_size_func):
+    """Make wrapping Defun for window_size_func."""
+
+    @function.Defun(dtypes.int64)
+    def tf_window_size_func(key):
+      key.set_shape([])
+      window_size = ops.convert_to_tensor(
+          window_size_func(key), dtype=dtypes.int64)
+      if window_size.dtype != dtypes.int64:
+        raise ValueError(
+            "`window_size_func` must return a single tf.int64 tensor.")
+      return window_size
+
+    self._window_size_func = tf_window_size_func
+    self._window_size_func.add_to_graph(ops.get_default_graph())
+
+  def _make_key_func(self, key_func, input_dataset):
+    """Make wrapping Defun for key_func."""
+
+    @function.Defun(*nest.flatten(input_dataset.output_types))
+    def tf_key_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the input_dataset.
+      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+        arg.set_shape(shape)
+      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
+      if _should_unpack_args(nested_args):
+        ret = key_func(*nested_args)
+      else:
+        ret = key_func(nested_args)
+      ret = ops.convert_to_tensor(ret, dtype=dtypes.int64)
+      if ret.dtype != dtypes.int64:
+        raise ValueError("`key_func` must return a single tf.int64 tensor.")
+      return ret
+
+    self._key_func = tf_key_func
+    self._key_func.add_to_graph(ops.get_default_graph())
+
+  def _make_reduce_func(self, reduce_func, input_dataset):
+    """Make wrapping Defun for reduce_func."""
+
+    @function.Defun(dtypes.int64, dtypes.resource)
+    def tf_reduce_func(key, window_dataset_resource):
+      """A wrapper for Defun that facilitates shape inference."""
+      key.set_shape([])
+      window_dataset = _ResourceDataset(window_dataset_resource,
+                                        input_dataset.output_types,
+                                        input_dataset.output_shapes)
+      output_dataset = reduce_func(key, window_dataset)
+      if not isinstance(output_dataset, Dataset):
+        raise TypeError("`reduce_func` must return a `Dataset` object.")
+      self._output_types = output_dataset.output_types
+      self._output_shapes = output_dataset.output_shapes
+      return output_dataset.make_dataset_resource()
+
+    self._reduce_func = tf_reduce_func
+    self._reduce_func.add_to_graph(ops.get_default_graph())
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.group_by_window_dataset(
+        self._input_dataset.make_dataset_resource(),
+        self._key_func.captured_inputs,
+        self._reduce_func.captured_inputs,
+        self._window_size_func.captured_inputs,
+        key_func=self._key_func,
+        reduce_func=self._reduce_func,
+        window_size_func=self._window_size_func,
+        output_types=nest.flatten(self.output_types),
+        output_shapes=nest.flatten(self.output_shapes))
+
+
+def group_by_window(dataset,
+                    key_func,
+                    reduce_func,
+                    window_size=None,
+                    window_size_func=None):
+  """Performs a windowed "group-by" operation on this dataset.
+
+  This method maps each consecutive element in this dataset to a key
+  using `key_func` and groups the elements by key. It then applies
+  `reduce_func` to at most `window_size_func(key)` elements matching the same
+  key. All except the final window for each key will contain
+  `window_size_func(key)` elements; the final window may be smaller.
+
+  You may provide either a constant `window_size` or a window size determined
+  by the key through `window_size_func`.
+
+  Args:
+    dataset: A `Dataset`.
+    key_func: A function mapping a nested structure of tensors
+      (having shapes and types defined by `self.output_shapes` and
+      `self.output_types`) to a scalar `tf.int64` tensor.
+    reduce_func: A function mapping a key and a dataset of up to
+      `window_size_func(key)` consecutive elements matching that key to
+      another dataset.
+    window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements matching the same key to combine in a single
+      batch, which will be passed to `reduce_func`. Mutually exclusive with
+      `window_size_func`.
+    window_size_func: A function mapping a key to a `tf.int64` scalar
+      `tf.Tensor`, representing the number of consecutive elements matching
+      the same key to combine in a single batch, which will be passed to
+      `reduce_func`. Mutually exclusive with `window_size`.
+
+  Returns:
+    A `Dataset`.
+
+  Raises:
+    ValueError: if neither or both of {`window_size`, `window_size_func`} are
+      passed.
+  """
+  if (window_size is not None and window_size_func or
+      not (window_size is not None or window_size_func)):
+    raise ValueError(
+        "Must pass exactly one of window_size or window_size_func.")
+
+  if window_size is not None:
+
+    def constant_window_func(unused_key):
+      return ops.convert_to_tensor(window_size, dtype=dtypes.int64)
+
+    window_size_func = constant_window_func
+
+  assert window_size_func is not None
+  return GroupByWindowDataset(dataset, key_func, reduce_func, window_size_func)
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index c78b064b4fd..c2b99d67c7f 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -341,7 +341,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "sample_stats_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/sample_stats_test.py"],
     additional_deps = [
         ":distributions_py",
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index da1cd72a6f1..699cf45a738 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -150,7 +150,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
        `N - 1` dimensions index into a batch of independent distributions and
        the last dimension represents a vector of probabilities for each class.
        Only one of `logits` or `probs` should be passed in.
-      dtype: The type of the event samples (default: int32).
+      dtype: The type of the event samples (default: float32).
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
@@ -388,7 +388,7 @@ class RelaxedOneHotCategorical(
      dimensions index into a batch of independent distributions and the last
      dimension represents a vector of probabilities for each class.
Only one of `logits` or `probs` should be passed in. - dtype: The type of the event samples (default: int32). + dtype: The type of the event samples (default: float32). validate_args: Unused in this distribution. allow_nan_stats: Python `bool`, default `True`. If `False`, raise an exception if a statistic (e.g. mean/mode/etc...) is undefined for any diff --git a/tensorflow/contrib/framework/python/ops/checkpoint_ops.py b/tensorflow/contrib/framework/python/ops/checkpoint_ops.py index 848e26ab966..26146790b65 100644 --- a/tensorflow/contrib/framework/python/ops/checkpoint_ops.py +++ b/tensorflow/contrib/framework/python/ops/checkpoint_ops.py @@ -17,440 +17,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import math - from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_checkpoint_ops from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops - -ops.NotDifferentiable("GenerateVocabRemapping") -ops.NotDifferentiable("LoadAndRemapMatrix") +from tensorflow.python.training import checkpoint_ops -def _load_and_remap_matrix(ckpt_path, - old_tensor_name, - new_row_vocab_offset, - num_rows_to_load, - new_col_vocab_size, - initializer, - old_row_vocab_file=None, - new_row_vocab_file=None, - old_col_vocab_file=None, - new_col_vocab_file=None, - num_row_oov_buckets=0, - num_col_oov_buckets=0, - max_rows_in_memory=-1): - """Loads a 2-D (matrix) `Tensor` from checkpoint. - - Generates 1D-remappings for rows and columns using the - `GenerateVocabRemapping` op, and initializes any anticipated values with the - provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a - matrix that loads existing values from the checkpoint, while filling out - "missing" values with the newly initialized values. See - contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped - functionality (LoadAndRemapMatrix). This wrapper can be used to perform only - row remapping or only col remapping. If only row remapping is desired, - {new,old}_col_vocab_file should be `None`, and vice versa for column - remapping. - - NOTE: This only supports div-partitioning the vocabulary on the 1st dimension - (row axis) via `new_row_vocab_offset`. - - Args: - ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) - from which the old matrix `Tensor` will be loaded. - old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. - new_row_vocab_offset: A 0-indexed integer representing what line to - start reading at in the new row vocabulary. Used for partitioned - variables. - num_rows_to_load: Number of rows to load for the new vocabulary (note: to - support variable partitioning and partial loading, this does not need to - be the same as the number of entries in `new_row_vocab_file`). - new_col_vocab_size: Number of columns to load - should be the same as the - number of entries in `new_col_vocab_file`, since we don't support - partitioning along the column axis. - initializer: Callable initializer function that accepts a 1-D tensor as the - arg to specify the shape of the returned tensor. Used to initialize - missing values. - old_row_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old row vocabulary file. Can be None, which represents no - remapping on the row axis. 
- new_row_vocab_file: A scalar `Tensor` of type `string` containing the path - to the new row vocabulary file. Can be None, which represents no remapping - on the row axis - in which case, `new_row_vocab_offset` and - `num_rows_to_load` work under the assumption that the new row vocab is the - same as the old row vocab. - old_col_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old column vocabulary file. Can be None, which represents no - remapping on the column axis. - new_col_vocab_file: A scalar `Tensor` of type `string` containing the path - to the new column vocabulary file. Can be None, which represents no - remapping on the column axis - in which case, `new_col_vocab_size` works - under the assumption that the new col vocab is the same as the old col - vocab. - num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows - to append. Must be >= 0. - num_col_oov_buckets: `int` specifying the number of out-of-vocabulary - columns to append. Must be >= 0. - max_rows_in_memory: `int` specifying the maximum number of rows to load from - the checkpoint at once. If less than or equal to 0, the entire matrix will - be loaded into memory. Setting this arg trades increased disk reads for - lower memory usage. - - Returns: - A Tensor of shape `[num_rows_to_load + num_row_oov_buckets, - new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the - specified tensor in the checkpoint, and any missing or OOV values - initialized with the given `initializer`. - - Raises: - ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0. - ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is - provided, while the other is not. Same for `old_col_vocab_file` and - `new_col_vocab_file`. - ValueError: If neither row vocabs or col vocabs are provided. - """ - if num_row_oov_buckets < 0: - raise ValueError("num_row_oov_buckets must be >= 0, but received %d" % - num_row_oov_buckets) - if num_col_oov_buckets < 0: - raise ValueError("num_col_oov_buckets must be >= 0, but received %d" % - num_col_oov_buckets) - - if bool(old_row_vocab_file) != bool(new_row_vocab_file): - raise ValueError( - "old_row_vocab_file and new_row_vocab_file must both be specified or " - "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'". - format(old_row_vocab_file, new_row_vocab_file)) - if bool(old_col_vocab_file) != bool(new_col_vocab_file): - raise ValueError( - "old_col_vocab_file and new_col_vocab_file must both be specified or " - "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'". - format(old_col_vocab_file, new_col_vocab_file)) - - remap_rows = new_row_vocab_file and old_row_vocab_file - remap_cols = new_col_vocab_file and old_col_vocab_file - if not (remap_rows or remap_cols): - raise ValueError( - "Must provide either row or column vocab files. If no remapping is " - "necessary, consider using `tf.contrib.framework.init_from_checkpoint` " - "instead.") - - num_rows_present = num_rows_to_load - if remap_rows: - row_remapping, num_rows_present = ( - gen_checkpoint_ops._generate_vocab_remapping( # pylint: disable=protected-access - new_vocab_file=new_row_vocab_file, - old_vocab_file=old_row_vocab_file, - new_vocab_offset=new_row_vocab_offset, - num_new_vocab=num_rows_to_load)) - else: - # Even when the rows are not being reordered, we still need to generate a - # remapping to account for initializing partitioned Variables (when - # new_row_vocab_offset is non-zero). 
- row_remapping = math_ops.range( - new_row_vocab_offset, - new_row_vocab_offset + num_rows_to_load, - dtype=dtypes.int64) - - col_remapping = [] - num_cols_present = new_col_vocab_size - if remap_cols: - col_remapping, num_cols_present = ( - gen_checkpoint_ops._generate_vocab_remapping( # pylint: disable=protected-access - new_vocab_file=new_col_vocab_file, - old_vocab_file=old_col_vocab_file, - new_vocab_offset=0, # Offset is unused for cols (no partitioning). - num_new_vocab=new_col_vocab_size)) - - init_vals = initializer([ - num_rows_to_load * new_col_vocab_size - - num_rows_present * num_cols_present, 1 - ]) - return_tensor = gen_checkpoint_ops._load_and_remap_matrix( # pylint: disable=protected-access - ckpt_path=ckpt_path, - old_tensor_name=old_tensor_name, - row_remapping=row_remapping, - col_remapping=col_remapping, - initializing_values=init_vals, - num_rows=num_rows_to_load, - num_cols=new_col_vocab_size, - max_rows_in_memory=max_rows_in_memory) - - # Add OOV row(s) and column(s). - if num_row_oov_buckets > 0: - init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size]) - init_row_oov_val = ops.convert_to_tensor(init_row_oov_val) - return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0) - if num_col_oov_buckets > 0: - # We need to add any row OOV to the new column shape. - init_col_oov_val = initializer( - [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets]) - init_col_oov_val = ops.convert_to_tensor(init_col_oov_val) - return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1) - - return return_tensor - - -def load_and_remap_matrix_initializer(ckpt_path, - old_tensor_name, - new_row_vocab_size, - new_col_vocab_size, - old_row_vocab_file=None, - new_row_vocab_file=None, - old_col_vocab_file=None, - new_col_vocab_file=None, - num_row_oov_buckets=0, - num_col_oov_buckets=0, - initializer=None, - max_rows_in_memory=-1): - r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor. - - The returned initializer loads a 2-D (matrix) `Tensor` with name - `old_tensor_name` from the checkpoint at `ckpt_path`. It will reorder the - rows/columns according to the specified vocab files and append additional - out-of-vocabulary rows/columns according to the number of OOV buckets. - - The format of the file at the `{old,new}_{row,col}_vocab_file` path should be - a text file, with each line containing a single entity within the vocabulary. - Let the function `line_of(f, "x")` return the 0-indexed line number of the - entity "x" in file f, and the function `entity_at(f, i)` return the entity at - line i of file f. Then, row i of the new output matrix will be taken from row - `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old - matrix. If any entity in `new_row_vocab_file` is not found in - `old_row_vocab_file`, that row is considered a "missing" row, and its values - will be initialized using the `initializer` arg. The same logic also applies - for the columns. 
- - For example, assuming that: - - * `old_row_vocab_file` contains "mercury\nvenus\nmars" - * `new_row_vocab_file` contains "venus\njupiter\nmercury" - * `old_col_vocab_file` contains "good\nbetter\nbest" - * `new_col_vocab_file` contains "good\nbest\nfantastic" - * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]` - * `w(i, j)` represents the value from row i, column j of the old matrix - - Then the new output matrix will look like: - - `[[w(1, 0), w(1, 2), 1], - [2, 3, 4], - [w(0, 0), w(0, 2), 5]]` - - If we further specify that: - - * `num_row_oov_buckets` == 2 - * `num_col_oov_buckets` == 1 - - Then the new output matrix will look like: - - `[[w(1, 0), w(1, 2), 1, 12], - [2, 3, 4, 13], - [w(0, 0), w(0, 2), 5, 14], - [6, 7, 8, 15], - [9, 10, 11, 16]]` - - If `{old,new}_row_vocab_file` are None, we assume that the old and new row - vocab files are the same, and no row remapping is done. If - `{old,new}_col_vocab_file` are None, we assume that the old and new column - vocab files are the same, and no column remapping is done. - - The returned initializer only supports div-partitioning along the row axis. It - does not support partitioning along the column axis or mod-partitioning. - - NOTE: When this is used to warm-start variables, client code should use - `tf.lookup.index_table_from_tensor()` like - contrib/layers/python/layers/feature_column.py does, as opposed to - `tf.feature_to_id()` - in order to ensure the underlying lookup tables are the - same. - - Args: - ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) - from which the old matrix `Tensor` will be loaded. - old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. - new_row_vocab_size: `int` specifying the number of entries in - `new_row_vocab_file`. If no row remapping is needed (no row vocab - provided), this should be equal to the number of rows to load from the old - matrix (which can theoretically be smaller than the number of rows in the - old matrix). - new_col_vocab_size: `int` specifying the number of entries in - `new_col_vocab_file`. If no column remapping is needed (no column vocab - provided), this should be equal to the number of columns in the old - matrix. - old_row_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old row vocabulary file. Can be None, which represents no - remapping on the row axis. - new_row_vocab_file: A scalar `Tensor` of type `string` containing the path - to the new row vocabulary file. Can be None, which represents no remapping - on the row axis. - old_col_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old column vocabulary file. Can be None, which represents no - remapping on the column axis. - new_col_vocab_file: A scalar `Tensor` of type `string` containing the path - to the new column vocabulary file. Can be None, which represents no - remapping on the column axis. - num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows - to append. Must be >= 0. - num_col_oov_buckets: `int` specifying the number of out-of-vocabulary - columns to append. Must be >= 0. - initializer: Initializer function to initialize missing values. Accepts a - 1-D tensor as the arg to specify the shape of the returned tensor. If - `None`, defaults to using `zeros_initializer()`. - max_rows_in_memory: `int` specifying the maximum number of rows to load from - the checkpoint at once. If less than or equal to 0, the entire matrix will - be loaded into memory. 
Setting this arg trades increased disk reads for - lower memory usage. - - Returns: - A variable initializer function that should be used to initialize a - (potentially partitioned) `Variable` whose complete shape is - `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size + - num_col_oov_buckets]`. - - Raises: - TypeError: If `initializer` is specified but not callable. - """ - if initializer is None: - # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from - # Glorot and Bengio, 2010. - initializer = init_ops.zeros_initializer() - - if not callable(initializer): - raise TypeError( - "initializer must be callable, instead of being {} of type {}.".format( - initializer, type(initializer))) - - def _initializer(shape, dtype=dtypes.float32, partition_info=None): - """Variable initializer. - - Args: - shape: Shape of `Tensor` to return. Should include OOV on both axes. - dtype: Must be float32. - partition_info: variable_scope._PartitionInfo. - - Returns: - `Tensor` of shape `shape`. - - Raises: - TypeError: If `dtype` is anything other than float32. - ValueError: For shape mismatch upon invocation. - """ - # Sanity checks. - if dtype != dtypes.float32: - raise TypeError( - "Currently, only float32 is supported. Received dtype: {}".format( - dtype)) - if len(shape) != 2: - raise ValueError("Expected 2-dim shape, but received: {}".format(shape)) - if shape[0] <= 0: - raise ValueError( - "Expected 1st dim of shape to be > 0, but received shape: {}".format( - shape)) - if shape[1] != (new_col_vocab_size + num_col_oov_buckets): - raise ValueError( - "Expected 2nd dim of shape to be new_col_vocab_size ({}) + " - "num_col_oov_buckets ({}) = {}, but received shape: {}".format( - new_col_vocab_size, num_col_oov_buckets, - new_col_vocab_size + num_col_oov_buckets, shape)) - - offset = 0 - if partition_info is not None: - offset = partition_info.single_offset(shape) - - if offset + shape[0] > new_row_vocab_size + num_row_oov_buckets: - raise ValueError( - "Trying to initialize {} additional rows after {} rows have already " - "been initialized, which would exceed expected total row count of " - "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format( - shape[0], offset, new_row_vocab_size, num_row_oov_buckets, - new_row_vocab_size + num_row_oov_buckets)) - - row_oov_buckets_to_use = min(shape[0], - max(0, offset + shape[0] - new_row_vocab_size)) - num_rows_to_load = shape[0] - row_oov_buckets_to_use - - return _load_and_remap_matrix( - ckpt_path=ckpt_path, - old_tensor_name=old_tensor_name, - new_row_vocab_offset=offset, - num_rows_to_load=num_rows_to_load, - new_col_vocab_size=new_col_vocab_size, - initializer=initializer, - old_row_vocab_file=old_row_vocab_file, - new_row_vocab_file=new_row_vocab_file, - old_col_vocab_file=old_col_vocab_file, - new_col_vocab_file=new_col_vocab_file, - num_row_oov_buckets=row_oov_buckets_to_use, - num_col_oov_buckets=num_col_oov_buckets, - max_rows_in_memory=max_rows_in_memory) - - return _initializer - - -def load_embedding_initializer(ckpt_path, - embedding_tensor_name, - new_vocab_size, - embedding_dim, - old_vocab_file, - new_vocab_file, - num_oov_buckets=0, - initializer=None, - max_rows_in_memory=-1): - """Returns a variable initializer for loading pre-trained embeddings. - - Wrapper around `load_and_remap_matrix_initializer()` specialized for loading - embedding weights and remapping according to the provided vocab files. See - docs for `load_and_remap_matrix_initializer()` for more details. 
- - NOTE: Only for use with div-partitioned variables / vocabularies. - - Args: - ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) - from which the old matrix `Tensor` will be loaded. - embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. - new_vocab_size: Number of entries in the new vocab. - embedding_dim: `int` specifying the dimension of the embedding vectors from - the checkpoint. Must match the number of columns in the old embedding - matrix. - old_vocab_file: A scalar `Tensor` of type `string` containing the - path to the old vocabulary file. - new_vocab_file: A scalar `Tensor` of type `string` containing the - path to the new vocabulary file. - num_oov_buckets: `int` specifying the number of out-of-vocabulary - buckets to use. Must be >= 0. - initializer: Initializer function that accepts a 1-D tensor as the arg to - specify the shape of the returned tensor. If `None`, defaults to using - `truncated_normal_initializer()`. - max_rows_in_memory: `int` specifying the maximum number of rows to load from - the checkpoint at once. If less than or equal to 0, the entire matrix will - be loaded into memory. Setting this arg trades increased disk reads for - lower memory usage. - - Returns: - A variable initializer function. - """ - if initializer is None: - # TODO(b/25671353): This should be kept in sync with the stddev used by - # feature_column.py's _EmbeddingColumn. - initializer = init_ops.truncated_normal_initializer( - stddev=1.0 / math.sqrt(embedding_dim)) - - return load_and_remap_matrix_initializer( - ckpt_path=ckpt_path, - old_tensor_name=embedding_tensor_name, - new_row_vocab_size=new_vocab_size, - new_col_vocab_size=embedding_dim, - old_row_vocab_file=old_vocab_file, - new_row_vocab_file=new_vocab_file, - old_col_vocab_file=None, - new_col_vocab_file=None, - num_row_oov_buckets=num_oov_buckets, - num_col_oov_buckets=0, - initializer=initializer, - max_rows_in_memory=max_rows_in_memory) +# pylint: disable=protected-access,line-too-long +load_and_remap_matrix_initializer = checkpoint_ops._load_and_remap_matrix_initializer +# pylint: enable=line-too-long +load_embedding_initializer = checkpoint_ops._load_embedding_initializer +# pylint: enable=protected-access def load_linear_multiclass_bias_initializer(ckpt_path, diff --git a/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py b/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py index a11d373244d..b7b9f5c59e1 100644 --- a/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py +++ b/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py @@ -21,7 +21,6 @@ import os import numpy as np from tensorflow.contrib import framework as contrib_framework -from tensorflow.contrib.framework.python.ops import checkpoint_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -38,250 +37,6 @@ FLAGS = flags.FLAGS _TESTDATA_PATH = 'contrib/framework/testdata' -class LoadAndRemapWrappersTest(test.TestCase): - """Tests for the functionality of the Python wrappers.""" - - def setUp(self): - self.bundle_file = os.path.join( - test.test_src_dir_path(_TESTDATA_PATH), 'bundle_checkpoint') - self.new_feature_vocab_file = os.path.join( - test.test_src_dir_path(_TESTDATA_PATH), 'bundle_checkpoint_vocab.txt') - self.old_feature_vocab_file = os.path.join( - test.test_src_dir_path(_TESTDATA_PATH), - 'bundle_checkpoint_vocab_with_oov.txt') - self.new_class_vocab_file = os.path.join( - 
test.test_src_dir_path(_TESTDATA_PATH), 'keyword_new.txt') - self.old_class_vocab_file = os.path.join( - test.test_src_dir_path(_TESTDATA_PATH), 'keyword.txt') - self.init_val = 42 - - def _init_val_initializer(shape, dtype=None, partition_info=None): - del dtype, partition_info # Unused by this unit-testing initializer. - return array_ops.tile( - constant_op.constant([[self.init_val]], dtype=dtypes.float32), shape) - - self.initializer = _init_val_initializer - - def test_load_and_remap_matrix(self): - """Tests the end-to-end loading / remapping of weights.""" - # _load_and_remap_matrix() is the generalized wrapper that takes in row and - # column vocabulary files, calls the relevant remappings, and returns the - # weight matrix. Take this example to be linear multi-class by providing - # both row and column vocabularies. - remapped_matrix = checkpoint_ops._load_and_remap_matrix( - new_row_vocab_file=self.new_feature_vocab_file, - old_row_vocab_file=self.old_feature_vocab_file, - num_rows_to_load=4, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - new_row_vocab_offset=1, - initializer=self.initializer, - num_row_oov_buckets=1, - num_col_oov_buckets=1) - - # [4 in vocab + 1 oov features, 4 in vocab + 1 oov classes]. The offset - # means we read - expected_remapped_matrix = np.concatenate( - [ - np.reshape([18, 34, 50, self.init_val, self.init_val], [5, 1]), - np.reshape([16, 32, 48, self.init_val, self.init_val], [5, 1]), - np.reshape([self.init_val] * 5, [5, 1]), - np.reshape([17, 33, 49, self.init_val, self.init_val], [5, 1]), - np.reshape([self.init_val] * 5, [5, 1]) - ], - axis=1) - - with self.test_session(): - self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval()) - - def test_load_and_remap_output_layer_weight_initializer_linear(self): - """Tests for the output layer initializer in the linear multi-class case.""" - loading_initializer = (contrib_framework.load_and_remap_matrix_initializer( - new_row_vocab_size=5, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - new_row_vocab_file=self.new_feature_vocab_file, - old_row_vocab_file=self.old_feature_vocab_file, - num_row_oov_buckets=1, - num_col_oov_buckets=1, - initializer=self.initializer)) - - expected_remapped_matrix = np.concatenate( - [ - np.reshape([2, 18, 34, 50, self.init_val, self.init_val], [6, 1]), - np.reshape([0, 16, 32, 48, self.init_val, self.init_val], [6, 1]), - np.reshape([self.init_val] * 6, [6, 1]), - np.reshape([1, 17, 33, 49, self.init_val, self.init_val], [6, 1]), - np.reshape([self.init_val] * 6, [6, 1]) - ], - axis=1) - - # The new weight matrix is of size - # [5 feature vocab + 1 feature OOV, 4 class vocab + 1 class OOV]. Use a - # partitioned variable to confirm that the offset logic works. 
- remapped_matrix = variable_scope.get_variable( - name='linear/obtained_weight_matrix', - shape=[6, 5], - initializer=loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_matrix, - remapped_matrix.as_tensor().eval()) - - def test_load_and_remap_output_layer_weight_initializer_dnn_output(self): - """Tests for the output layer initializer in the DNN output case.""" - loading_initializer = (contrib_framework.load_and_remap_matrix_initializer( - new_row_vocab_size=5, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - num_col_oov_buckets=1, - initializer=self.initializer)) - - expected_remapped_matrix = np.concatenate( - [ - np.reshape([2, 18, 34, 50, 66], [5, 1]), - np.reshape([0, 16, 32, 48, 64], [5, 1]), - np.reshape([self.init_val] * 5, [5, 1]), - np.reshape([1, 17, 33, 49, 65], [5, 1]), - np.reshape([self.init_val] * 5, [5, 1]) - ], - axis=1) - - # The new weight matrix is of size - # [5-sized input layer, 4 class vocab + 1 class OOV]. - remapped_matrix = variable_scope.get_variable( - name='dnn_output/obtained_weight_matrix', - shape=[5, 5], - initializer=loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_matrix, - remapped_matrix.as_tensor().eval()) - - def test_initializer_with_oov_only_partition(self): - """Tests for the output layer initializer where one partition is all OOV.""" - loading_initializer = (contrib_framework.load_and_remap_matrix_initializer( - new_row_vocab_size=5, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - new_row_vocab_file=self.new_feature_vocab_file, - old_row_vocab_file=self.old_feature_vocab_file, - num_row_oov_buckets=5, - num_col_oov_buckets=1, - initializer=self.initializer)) - - expected_remapped_matrix = np.concatenate( - [ - np.reshape([2, 18, 34, 50] + [self.init_val] * 6, [10, 1]), - np.reshape([0, 16, 32, 48] + [self.init_val] * 6, [10, 1]), - np.reshape([self.init_val] * 10, [10, 1]), - np.reshape([1, 17, 33, 49] + [self.init_val] * 6, [10, 1]), - np.reshape([self.init_val] * 10, [10, 1]), - ], - axis=1) - - # The new weight matrix is of size - # [5 feature vocab + 5 feature OOV, 4 class vocab + 1 class OOV]. The - # second partition has only OOV. 
- remapped_matrix = variable_scope.get_variable( - name='linear_all_oov/obtained_weight_matrix', - shape=[10, 5], - initializer=loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_matrix, - remapped_matrix.as_tensor().eval()) - - def test_load_and_remap_linear_multiclass_initializer_default_init(self): - """Tests where the zeros_initializer default is used for linear.""" - loading_initializer = (contrib_framework.load_and_remap_matrix_initializer( - new_row_vocab_size=5, - new_col_vocab_file=self.new_class_vocab_file, - old_col_vocab_file=self.old_class_vocab_file, - new_col_vocab_size=4, - old_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - new_row_vocab_file=self.new_feature_vocab_file, - old_row_vocab_file=self.old_feature_vocab_file, - num_row_oov_buckets=1, - num_col_oov_buckets=1)) - - expected_remapped_matrix = np.concatenate( - [ - np.reshape([2, 18, 34, 50, 0, 0], [6, 1]), - np.reshape([0, 16, 32, 48, 0, 0], [6, 1]), - np.reshape([0] * 6, [6, 1]), - np.reshape([1, 17, 33, 49, 0, 0], [6, 1]), - np.reshape([0] * 6, [6, 1]) - ], - axis=1) - - remapped_matrix = variable_scope.get_variable( - name='linear_init_fallback/obtained_weight_matrix', - shape=[6, 5], - initializer=loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_matrix, - remapped_matrix.as_tensor().eval()) - - def test_load_embedding_initializer(self): - """Tests for the load_embedding_initializer wrapper.""" - embedding_loading_initializer = ( - contrib_framework.load_embedding_initializer( - new_vocab_file=self.new_feature_vocab_file, - old_vocab_file=self.old_feature_vocab_file, - new_vocab_size=5, - embedding_dim=16, - embedding_tensor_name='some_scope/embeddings', - ckpt_path=[self.bundle_file], - num_oov_buckets=1, - initializer=self.initializer)) - - expected_remapped_embeddings = np.concatenate( - [ - np.reshape(range(64), [4, 16]), - np.reshape([self.init_val] * 32, [2, 16]), - ], - axis=0) - - # The new weight matrix is of size - # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the - # last vocab row (2nd last row) is newly initialized (wasn't found in - # previous vocab) and the actual last row is OOV and also newly initialized. - # Use a partitioned variable to confirm that the offset logic works. 
- remapped_embeddings = variable_scope.get_variable( - name='embedding/obtained_embedding_matrix', - shape=[6, 16], - initializer=embedding_loading_initializer, - partitioner=partitioned_variables.fixed_size_partitioner(2)) - - with self.test_session(): - variables.global_variables_initializer().run() - self.assertAllClose(expected_remapped_embeddings, - remapped_embeddings.as_tensor().eval()) - - class LoadMulticlassBiasTest(test.TestCase): """Tests for the load_linear_multiclass_bias_initializer functionality.""" diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD new file mode 100644 index 00000000000..b2de2823563 --- /dev/null +++ b/tensorflow/contrib/gan/BUILD @@ -0,0 +1,27 @@ +package(default_visibility = ["//tensorflow:__subpackages__"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +py_library( + name = "gan", + srcs = [ + "__init__.py", + ], + srcs_version = "PY2AND3", + deps = [ + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md new file mode 100644 index 00000000000..586e5ac331c --- /dev/null +++ b/tensorflow/contrib/gan/README.md @@ -0,0 +1,4 @@ +This directory contains the TFGAN project. + +This file will have more details as code is added. + diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py new file mode 100644 index 00000000000..a46b0e8d5de --- /dev/null +++ b/tensorflow/contrib/gan/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""TFGAN grouped API.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD index 645e364d191..bebcf079ba4 100644 --- a/tensorflow/contrib/gdr/BUILD +++ b/tensorflow/contrib/gdr/BUILD @@ -62,6 +62,7 @@ tf_cuda_library( }), deps = [ ":gdr_proto_cc", + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:gpu_runtime", "//tensorflow/core:lib", diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD index e631c243c3c..a27bec48010 100755 --- a/tensorflow/contrib/image/BUILD +++ b/tensorflow/contrib/image/BUILD @@ -121,12 +121,9 @@ tf_gen_op_wrapper_py( cc_library( name = "image_ops_cc", - srcs = [ - "ops/image_ops.cc", - ], + srcs = ["ops/image_ops.cc"], deps = [ ":image_ops_kernels", - "//tensorflow/core", "//tensorflow/core:framework", ], alwayslink = 1, diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD index a09045d7fda..26f0e415180 100644 --- a/tensorflow/contrib/keras/BUILD +++ b/tensorflow/contrib/keras/BUILD @@ -551,6 +551,7 @@ py_test( size = "small", srcs = ["python/keras/utils/io_utils_test.py"], srcs_version = "PY2AND3", + tags = ["notsan"], deps = [ ":keras", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/keras/python/keras/utils/io_utils_test.py b/tensorflow/contrib/keras/python/keras/utils/io_utils_test.py index baa9781e71f..f6820ee0394 100644 --- a/tensorflow/contrib/keras/python/keras/utils/io_utils_test.py +++ b/tensorflow/contrib/keras/python/keras/utils/io_utils_test.py @@ -57,43 +57,44 @@ class TestIOUtils(test.TestCase): h5_path = os.path.join(temp_dir, 'test.h5') create_dataset(h5_path) - # Instantiating HDF5Matrix for the training set, - # which is a slice of the first 150 elements - x_train = keras.utils.io_utils.HDF5Matrix( - h5_path, 'my_data', start=0, end=150) - y_train = keras.utils.io_utils.HDF5Matrix( - h5_path, 'my_labels', start=0, end=150) + with self.test_session(): + # Instantiating HDF5Matrix for the training set, + # which is a slice of the first 150 elements + x_train = keras.utils.io_utils.HDF5Matrix( + h5_path, 'my_data', start=0, end=150) + y_train = keras.utils.io_utils.HDF5Matrix( + h5_path, 'my_labels', start=0, end=150) - # Likewise for the test set - x_test = keras.utils.io_utils.HDF5Matrix( - h5_path, 'my_data', start=150, end=200) - y_test = keras.utils.io_utils.HDF5Matrix( - h5_path, 'my_labels', start=150, end=200) + # Likewise for the test set + x_test = keras.utils.io_utils.HDF5Matrix( + h5_path, 'my_data', start=150, end=200) + y_test = keras.utils.io_utils.HDF5Matrix( + h5_path, 'my_labels', start=150, end=200) - # HDF5Matrix behave more or less like Numpy matrices - # with regard to indexing - self.assertEqual(y_train.shape, (150, 1)) - # But they do not support negative indices, so don't try print(x_train[-1]) + # HDF5Matrix behave more or less like Numpy matrices + # with regard to indexing + self.assertEqual(y_train.shape, (150, 1)) + # But they don't support negative indices, so don't try print(x_train[-1]) - self.assertEqual(y_train.dtype, np.dtype('i')) - self.assertEqual(y_train.ndim, 2) - self.assertEqual(y_train.size, 150) + self.assertEqual(y_train.dtype, np.dtype('i')) + self.assertEqual(y_train.ndim, 2) + self.assertEqual(y_train.size, 150) - model = keras.models.Sequential() - 
model.add(keras.layers.Dense(64, input_shape=(10,), activation='relu')) - model.add(keras.layers.Dense(1, activation='sigmoid')) - model.compile(loss='binary_crossentropy', optimizer='sgd') + model = keras.models.Sequential() + model.add(keras.layers.Dense(64, input_shape=(10,), activation='relu')) + model.add(keras.layers.Dense(1, activation='sigmoid')) + model.compile(loss='binary_crossentropy', optimizer='sgd') - # Note: you have to use shuffle='batch' or False with HDF5Matrix - model.fit(x_train, y_train, batch_size=32, shuffle='batch', verbose=False) - # test that evalutation and prediction - # don't crash and return reasonable results - out_pred = model.predict(x_test, batch_size=32, verbose=False) - out_eval = model.evaluate(x_test, y_test, batch_size=32, verbose=False) + # Note: you have to use shuffle='batch' or False with HDF5Matrix + model.fit(x_train, y_train, batch_size=32, shuffle='batch', verbose=False) + # test that evalutation and prediction + # don't crash and return reasonable results + out_pred = model.predict(x_test, batch_size=32, verbose=False) + out_eval = model.evaluate(x_test, y_test, batch_size=32, verbose=False) - self.assertEqual(out_pred.shape, (50, 1)) - self.assertEqual(out_eval.shape, ()) - self.assertGreater(out_eval, 0) + self.assertEqual(out_pred.shape, (50, 1)) + self.assertEqual(out_eval.shape, ()) + self.assertGreater(out_eval, 0) if __name__ == '__main__': diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index 598d9aee02a..da16bf6ce64 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -2559,10 +2559,10 @@ def _create_sequence_feature_spec_for_parsing(sequence_feature_columns, feature_spec = create_feature_spec_for_parsing(sequence_feature_columns) sequence_feature_spec = {} for key, feature in feature_spec.items(): - if (isinstance(feature, parsing_ops.VarLenFeature) or - isinstance(feature, parsing_ops.FixedLenSequenceFeature)): + if isinstance(feature, parsing_ops.VarLenFeature): sequence_feature = feature - elif isinstance(feature, parsing_ops.FixedLenFeature): + elif (isinstance(feature, parsing_ops.FixedLenFeature) or + isinstance(feature, parsing_ops.FixedLenSequenceFeature)): default_is_set = feature.default_value is not None if default_is_set: logging.warning( diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py index 21ab9867102..ab65e47af88 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py @@ -912,8 +912,7 @@ class FeatureColumnTest(test.TestCase): parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_var_len_dense_column": parsing_ops.FixedLenSequenceFeature( - shape=[], dtype=dtypes.float32, allow_missing=True, - default_value=4.0), + shape=[], dtype=dtypes.float32, allow_missing=True), } self.assertDictEqual(expected_feature_spec, feature_spec) diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py index 7eb410b4c72..33db93b9704 100644 --- a/tensorflow/contrib/layers/python/layers/optimizers.py +++ b/tensorflow/contrib/layers/python/layers/optimizers.py @@ -156,9 +156,9 @@ def optimize_loss(loss, loss = ops.convert_to_tensor(loss) contrib_framework.assert_scalar(loss) if global_step is None: - global_step = 
contrib_framework.get_global_step() + global_step = train.get_global_step() else: - contrib_framework.assert_global_step(global_step) + train.assert_global_step(global_step) with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]): # Update ops take UPDATE_OPS collection if not provided. if update_ops is None: diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 6fbe204ec6d..db3be9a991b 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -36,6 +36,7 @@ py_library( "//tensorflow/contrib/rnn:rnn_py", "//tensorflow/contrib/session_bundle:exporter", "//tensorflow/contrib/session_bundle:gc", + "//tensorflow/contrib/tpu:tpu_estimator", "//tensorflow/contrib/training:training_py", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index c31d5d2d47d..225d8796785 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -24,7 +24,6 @@ import six from tensorflow.contrib import framework as framework_lib from tensorflow.contrib import layers as layers_lib -from tensorflow.contrib import lookup as lookup_lib from tensorflow.contrib.learn.python.learn.estimators import constants from tensorflow.contrib.learn.python.learn.estimators import model_fn from tensorflow.contrib.learn.python.learn.estimators import prediction_key @@ -35,6 +34,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import metrics as metrics_lib from tensorflow.python.ops import nn @@ -1070,9 +1070,8 @@ class _MultiClassHead(_SingleHead): labels_tensor = _to_labels_tensor(labels, self._label_name) _check_no_sparse_tensor(labels_tensor) if self._label_keys: - table = lookup_lib.string_to_index_table_from_tensor( - mapping=self._label_keys, - name="label_id_lookup") + table = lookup_ops.index_table_from_tensor(self._label_keys, + name="label_id_lookup") return { "labels": labels_tensor, "label_ids": table.lookup(labels_tensor), @@ -1106,9 +1105,8 @@ class _MultiClassHead(_SingleHead): class_ids = math_ops.argmax( logits, 1, name=prediction_key.PredictionKey.CLASSES) if self._label_keys: - table = lookup_lib.index_to_string_table_from_tensor( - mapping=self._label_keys, - name="class_string_lookup") + table = lookup_ops.index_to_string_table_from_tensor( + self._label_keys, name="class_string_lookup") classes = table.lookup(class_ids) else: classes = class_ids diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py index c35a493086e..627d4991f03 100644 --- a/tensorflow/contrib/learn/python/learn/experiment.py +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -33,6 +33,7 @@ from tensorflow.contrib.learn.python.learn import export_strategy from tensorflow.contrib.learn.python.learn import monitors from tensorflow.contrib.learn.python.learn import trainable from tensorflow.contrib.learn.python.learn.estimators import run_config +from tensorflow.contrib.tpu.python.tpu import tpu_estimator from tensorflow.python.estimator import estimator as core_estimator from tensorflow.python.framework import ops from 
tensorflow.python.platform import tf_logging as logging @@ -221,6 +222,14 @@ class Experiment(object): "`estimator` must implement `tf.contrib.learn.Trainable`" "or `tf.estimator.`Estimator`.") + if isinstance(estimator, tpu_estimator.TPUEstimator): + raise ValueError( + "`Experiment` class cannot work with `tf.contrib.tpu.TPUEstimator`. " + "Please call `TPUEstimator` train/evaluate directly. \n" + "Details: `Experiment` class is designed for between-graph " + "distributed training, while `TPUEstimator` is working in in-graph " + "distributed mode.") + super(Experiment, self).__init__() # Immutable fields. self._estimator = estimator diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py index fe40d27c445..2c68edbb34b 100644 --- a/tensorflow/contrib/learn/python/learn/experiment_test.py +++ b/tensorflow/contrib/learn/python/learn/experiment_test.py @@ -32,6 +32,8 @@ from tensorflow.contrib.learn.python.learn.estimators import dnn from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib from tensorflow.contrib.learn.python.learn.estimators import test_data from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils +from tensorflow.contrib.tpu.python.tpu import tpu_config +from tensorflow.contrib.tpu.python.tpu import tpu_estimator from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.estimator import estimator as core_estimator @@ -935,6 +937,20 @@ class ExperimentTest(test.TestCase): self.assertEqual(ex._maybe_export.call_count, 4) self.assertEqual(ex._call_evaluate.call_count, 4) + def test_fail_with_tpu_estimator(self): + def dummy_model_fn(features, labels): + del features, labels # unused + + with self.assertRaisesRegexp( + ValueError, + '`Experiment` class cannot work with `tf.contrib.tpu.TPUEstimator`'): + experiment.Experiment( + tpu_estimator.TPUEstimator(model_fn=dummy_model_fn, + config=tpu_config.RunConfig(), + train_batch_size=256), + train_input_fn='train_input', + eval_input_fn='eval_input') + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py index bafde464afb..4c50d40aaa9 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py @@ -28,7 +28,6 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.platform import tf_logging as logging @@ -44,7 +43,7 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None): x_is_dict, y_is_dict = isinstance( x_shape, dict), y_shape is not None and isinstance(y_shape, dict) if y_is_dict and n_classes is not None: - assert (isinstance(n_classes, dict)) + assert isinstance(n_classes, dict) if batch_size is None: batch_size = list(x_shape.values())[0][0] if x_is_dict else x_shape[0] @@ -322,10 +321,12 @@ class DataFeeder(object): self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items()) ]) if x_is_dict else check_array(x, x.dtype) - self._y = None if y is None else \ - dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())]) if y_is_dict else check_array(y, y.dtype) + self._y = None if y is None else ( + dict([(k, check_array(v, 
v.dtype)) for k, v in list(y.items())]) + if y_is_dict else check_array(y, y.dtype)) - # self.n_classes is not None means we're converting raw target indices to one-hot. + # self.n_classes is not None means we're converting raw target indices + # to one-hot. if n_classes is not None: if not y_is_dict: y_dtype = (np.int64 @@ -344,12 +345,15 @@ class DataFeeder(object): x_shape, y_shape, n_classes, batch_size) # Input dtype matches dtype of x. - self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._x.items())]) if x_is_dict \ - else _check_dtype(self._x.dtype) + self._input_dtype = ( + dict([(k, _check_dtype(v.dtype)) for k, v in list(self._x.items())]) + if x_is_dict else _check_dtype(self._x.dtype)) - # note: self._output_dtype = np.float32 when y is None - self._output_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())]) if y_is_dict \ - else _check_dtype(self._y.dtype) if y is not None else np.float32 + # self._output_dtype == np.float32 when y is None + self._output_dtype = ( + dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())]) + if y_is_dict else ( + _check_dtype(self._y.dtype) if y is not None else np.float32)) # self.n_classes is None means we're passing in raw target indices if n_classes is not None and y_is_dict: diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py index 1e68a3ef660..676e1f2b51c 100644 --- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py +++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Utilities supporting export to SavedModel. Some contents of this file are moved to tensorflow/python/estimator/export.py: @@ -39,6 +38,7 @@ import time from tensorflow.contrib.layers.python.layers import feature_column from tensorflow.contrib.learn.python.learn import export_strategy from tensorflow.contrib.learn.python.learn.estimators import constants +from tensorflow.contrib.learn.python.learn.estimators import metric_key from tensorflow.contrib.learn.python.learn.estimators import prediction_key from tensorflow.contrib.learn.python.learn.utils import gc from tensorflow.contrib.learn.python.learn.utils import input_fn_utils @@ -75,8 +75,8 @@ FEATURES_INPUT_ALTERNATIVE_KEY = 'features_input_alternative' _FALLBACK_DEFAULT_OUTPUT_ALTERNATIVE_KEY = 'default_output_alternative' -def build_standardized_signature_def( - input_tensors, output_tensors, problem_type): +def build_standardized_signature_def(input_tensors, output_tensors, + problem_type): """Build a SignatureDef using problem type and input and output Tensors. 
Note that this delegates the actual creation of the signatures to methods in @@ -116,8 +116,8 @@ def build_standardized_signature_def( (_, predictions), = output_tensors.items() return signature_def_utils.regression_signature_def(examples, predictions) else: - return signature_def_utils.predict_signature_def( - input_tensors, output_tensors) + return signature_def_utils.predict_signature_def(input_tensors, + output_tensors) def _get_classification_scores(output_tensors): @@ -139,17 +139,15 @@ def _is_classification_problem(problem_type, input_tensors, output_tensors): classes = _get_classification_classes(output_tensors) scores = _get_classification_scores(output_tensors) return ((problem_type == constants.ProblemType.CLASSIFICATION or - problem_type == constants.ProblemType.LOGISTIC_REGRESSION) - and len(input_tensors) == 1 - and (classes is not None or - scores is not None or - len(output_tensors) == 1)) + problem_type == constants.ProblemType.LOGISTIC_REGRESSION) and + len(input_tensors) == 1 and + (classes is not None or scores is not None or + len(output_tensors) == 1)) def _is_regression_problem(problem_type, input_tensors, output_tensors): - return (problem_type == constants.ProblemType.LINEAR_REGRESSION - and len(input_tensors) == 1 - and len(output_tensors) == 1) + return (problem_type == constants.ProblemType.LINEAR_REGRESSION and + len(input_tensors) == 1 and len(output_tensors) == 1) def get_input_alternatives(input_ops): @@ -177,9 +175,7 @@ def get_input_alternatives(input_ops): return input_alternatives, features -def get_output_alternatives( - model_fn_ops, - default_output_alternative_key=None): +def get_output_alternatives(model_fn_ops, default_output_alternative_key=None): """Obtain all output alternatives using the model_fn output and heuristics. 
Args: @@ -218,8 +214,10 @@ def get_output_alternatives( default_outputs = {prediction_key.PredictionKey.GENERIC: default_outputs} actual_default_output_alternative_key = ( _FALLBACK_DEFAULT_OUTPUT_ALTERNATIVE_KEY) - output_alternatives = {actual_default_output_alternative_key: - (default_problem_type, default_outputs)} + output_alternatives = { + actual_default_output_alternative_key: (default_problem_type, + default_outputs) + } return output_alternatives, actual_default_output_alternative_key if default_output_alternative_key: @@ -246,13 +244,12 @@ def build_all_signature_defs(input_alternatives, output_alternatives, actual_default_output_alternative_key): """Build `SignatureDef`s from all pairs of input and output alternatives.""" - signature_def_map = { - ('%s:%s' % (input_key, output_key or 'None')): - build_standardized_signature_def( - inputs, outputs, problem_type) - for input_key, inputs in input_alternatives.items() - for output_key, (problem_type, outputs) - in output_alternatives.items()} + signature_def_map = {('%s:%s' % (input_key, output_key or 'None')): + build_standardized_signature_def(inputs, outputs, + problem_type) + for input_key, inputs in input_alternatives.items() + for output_key, (problem_type, + outputs) in output_alternatives.items()} # Add the default SignatureDef default_inputs = input_alternatives.get(DEFAULT_INPUT_ALTERNATIVE_KEY) @@ -263,8 +260,8 @@ def build_all_signature_defs(input_alternatives, output_alternatives, (default_problem_type, default_outputs) = ( output_alternatives[actual_default_output_alternative_key]) signature_def_map[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = ( - build_standardized_signature_def( - default_inputs, default_outputs, default_problem_type)) + build_standardized_signature_def(default_inputs, default_outputs, + default_problem_type)) return signature_def_map @@ -308,9 +305,8 @@ def get_timestamped_export_dir(export_dir_base): return export_dir time.sleep(1) attempts += 1 - logging.warn( - 'Export directory {} already exists; retrying (attempt {}/{})'.format( - export_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)) + logging.warn('Export directory {} already exists; retrying (attempt {}/{})'. + format(export_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)) raise RuntimeError('Failed to obtain a unique export directory name after ' '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS)) @@ -330,8 +326,7 @@ def get_temp_export_dir(timestamped_export_dir): """ (dirname, basename) = os.path.split(timestamped_export_dir) temp_export_dir = os.path.join( - compat.as_bytes(dirname), - compat.as_bytes('temp-{}'.format(basename))) + compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename))) return temp_export_dir @@ -357,8 +352,8 @@ def get_most_recent_export(export_dir_base): A gc.Path, with is just a namedtuple of (path, export_version). 
""" select_filter = gc.largest_export_versions(1) - results = select_filter(gc.get_paths(export_dir_base, - parser=_export_version_parser)) + results = select_filter( + gc.get_paths(export_dir_base, parser=_export_version_parser)) return next(iter(results or []), None) @@ -378,8 +373,8 @@ def garbage_collect_exports(export_dir_base, exports_to_keep): keep_filter = gc.largest_export_versions(exports_to_keep) delete_filter = gc.negation(keep_filter) - for p in delete_filter(gc.get_paths(export_dir_base, - parser=_export_version_parser)): + for p in delete_filter( + gc.get_paths(export_dir_base, parser=_export_version_parser)): try: gfile.DeleteRecursively(p.path) except errors_impl.NotFoundError as e: @@ -416,10 +411,7 @@ def make_export_strategy(serving_input_fn, An ExportStrategy that can be passed to the Experiment constructor. """ - def export_fn(estimator, - export_dir_base, - checkpoint_path=None - ): + def export_fn(estimator, export_dir_base, checkpoint_path=None): """Exports the given Estimator as a SavedModel. Args: @@ -512,3 +504,128 @@ def make_parsing_export_strategy(feature_columns, assets_extra=assets_extra, as_text=as_text, exports_to_keep=exports_to_keep) + + +def _default_compare_fn(curr_best_eval_result, cand_eval_result): + """Compares two evaluation results and returns true if the 2nd one is better. + + Both evaluation results should have the values for MetricKey.LOSS, which are + used for comparison. + + Args: + curr_best_eval_result: current best eval metrics. + cand_eval_result: candidate eval metrics. + + Returns: + True if cand_eval_result is better. + + Raises: + ValueError: If input eval result is None or no loss is available. + """ + default_key = metric_key.MetricKey.LOSS + if not curr_best_eval_result or default_key not in curr_best_eval_result: + raise ValueError( + 'curr_best_eval_result cannot be empty or no loss is found in it.') + + if not cand_eval_result or default_key not in cand_eval_result: + raise ValueError( + 'cand_eval_result cannot be empty or no loss is found in it.') + + return curr_best_eval_result[default_key] > cand_eval_result[default_key] + + +class BestModelSelector(object): + """A helper that keeps track of export selection candidates.""" + + def __init__(self, compare_fn=None): + """Constructor of this class. + + Args: + compare_fn: a function that returns true if the candidate is better than + the current best model. + """ + self._best_eval_result = None + self._compare_fn = compare_fn or _default_compare_fn + + def update(self, checkpoint_path, eval_result): + """Records a given checkpoint and exports if this is the best model. + + Args: + checkpoint_path: the checkpoint path to export. + eval_result: a dictionary which is usually generated in evaluation runs. + By default, eval_results contains 'loss' field. + + Returns: + A string representing the path to the checkpoint to be exported. + A dictionary of the same type of eval_result. + + Raises: + ValueError: if checkpoint path is empty. + ValueError: if eval_results is None object. 
+ """ + if not checkpoint_path: + raise ValueError('Checkpoint path is empty.') + if eval_result is None: + raise ValueError('%s has empty evaluation results.', checkpoint_path) + + if (self._best_eval_result is None or + self._compare_fn(self._best_eval_result, eval_result)): + self._best_eval_result = eval_result + return checkpoint_path, eval_result + else: + return '', None + + +def make_best_model_export_strategy(serving_input_fn, + exports_to_keep=1, + compare_fn=None, + default_output_alternative_key=None): + """Creates an custom ExportStrategy for use with tf.contrib.learn.Experiment. + + Args: + serving_input_fn: a function that takes no arguments and returns an + `InputFnOps`. + exports_to_keep: an integer indicating how many historical best models need + to be preserved. + compare_fn: a function that select the 'best' candidate from a dictionary + of evaluation result keyed by corresponding checkpoint path. + default_output_alternative_key: the key for default serving signature for + multi-headed inference graphs. + + Returns: + An ExportStrategy that can be passed to the Experiment constructor. + """ + best_model_export_strategy = make_export_strategy( + serving_input_fn, + exports_to_keep=exports_to_keep, + default_output_alternative_key=default_output_alternative_key) + + best_model_selector = BestModelSelector(compare_fn) + + def export_fn(estimator, export_dir_base, checkpoint_path, eval_result=None): + """Exports the given Estimator as a SavedModel. + + Args: + estimator: the Estimator to export. + export_dir_base: A string containing a directory to write the exported + graph and checkpoints. + checkpoint_path: The checkpoint path to export. If None (the default), + the most recent checkpoint found within the model directory is chosen. + eval_result: placehold args matching the call signature of ExportStrategy. + + Returns: + The string path to the exported directory. 
+ """ + + export_checkpoint_path, export_eval_result = best_model_selector.update( + checkpoint_path, eval_result) + + if export_checkpoint_path and export_eval_result is not None: + checkpoint_base = os.path.basename(export_checkpoint_path) + export_dir = os.path.join(export_dir_base, checkpoint_base) + return best_model_export_strategy.export( + estimator, export_dir, export_checkpoint_path, export_eval_result) + else: + return '' + + return export_strategy.ExportStrategy('best_model', export_fn) diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py index 9e778ab72ad..66bca9c0f53 100644 --- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py +++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py @@ -24,6 +24,7 @@ import time from tensorflow.contrib.layers.python.layers import feature_column as fc from tensorflow.contrib.learn.python.learn import export_strategy as export_strategy_lib from tensorflow.contrib.learn.python.learn.estimators import constants +from tensorflow.contrib.learn.python.learn.estimators import estimator as core_estimator from tensorflow.contrib.learn.python.learn.estimators import model_fn from tensorflow.contrib.learn.python.learn.utils import input_fn_utils from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils @@ -40,18 +41,43 @@ from tensorflow.python.saved_model import signature_def_utils from tensorflow.python.util import compat +class TestEstimator(core_estimator.Estimator): + + def __init__(self, *args, **kwargs): + super(TestEstimator, self).__init__(*args, **kwargs) + self.last_exported_checkpoint = "" + self.last_exported_dir = "" + + # @Override + def export_savedmodel(self, + export_dir, + serving_input_fn, + default_output_alternative_key=None, + assets_extra=None, + as_text=False, + checkpoint_path=None): + + if not os.path.exists(export_dir): + os.makedirs(export_dir) + + open(os.path.join(export_dir, "placeholder.txt"), "a").close() + + self.last_exported_checkpoint = checkpoint_path + self.last_exported_dir = export_dir + + return export_dir + + class SavedModelExportUtilsTest(test.TestCase): def test_build_standardized_signature_def_regression(self): input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "output-1": - array_ops.placeholder( - dtypes.float32, 1, name="output-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="output-tensor-1") } problem_type = constants.ProblemType.LINEAR_REGRESSION actual_signature_def = ( @@ -61,10 +87,9 @@ class SavedModelExportUtilsTest(test.TestCase): shape = tensor_shape_pb2.TensorShapeProto( dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype = types_pb2.DataType.Value("DT_FLOAT") - expected_signature_def.inputs[ - signature_constants.REGRESS_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.REGRESS_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.REGRESS_OUTPUTS].CopyFrom( meta_graph_pb2.TensorInfo( @@ -77,13 +102,11 @@ class SavedModelExportUtilsTest(test.TestCase): """Tests classification with one output tensor.""" input_tensors = { "input-1": - 
array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "output-1": - array_ops.placeholder( - dtypes.string, 1, name="output-tensor-1") + array_ops.placeholder(dtypes.string, 1, name="output-tensor-1") } problem_type = constants.ProblemType.CLASSIFICATION actual_signature_def = ( @@ -94,14 +117,14 @@ class SavedModelExportUtilsTest(test.TestCase): dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") dtype_string = types_pb2.DataType.Value("DT_STRING") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-1:0", dtype=dtype_string, + name="output-tensor-1:0", + dtype=dtype_string, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -112,8 +135,7 @@ class SavedModelExportUtilsTest(test.TestCase): """Tests multiple output tensors that include classes and probabilities.""" input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "classes": @@ -136,19 +158,20 @@ class SavedModelExportUtilsTest(test.TestCase): dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") dtype_string = types_pb2.DataType.Value("DT_STRING") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-classes:0", dtype=dtype_string, + name="output-tensor-classes:0", + dtype=dtype_string, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-proba:0", dtype=dtype_float, + name="output-tensor-proba:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -159,8 +182,7 @@ class SavedModelExportUtilsTest(test.TestCase): """Tests multiple output tensors that include classes and scores.""" input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "classes": @@ -182,19 +204,20 @@ class SavedModelExportUtilsTest(test.TestCase): dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") dtype_string = types_pb2.DataType.Value("DT_STRING") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", 
dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-classes:0", dtype=dtype_string, + name="output-tensor-classes:0", + dtype=dtype_string, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-scores:0", dtype=dtype_float, + name="output-tensor-scores:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -205,8 +228,7 @@ class SavedModelExportUtilsTest(test.TestCase): """Tests classification without classes tensor.""" input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "probabilities": @@ -224,14 +246,14 @@ class SavedModelExportUtilsTest(test.TestCase): shape = tensor_shape_pb2.TensorShapeProto( dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-proba:0", dtype=dtype_float, + name="output-tensor-proba:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -246,8 +268,7 @@ class SavedModelExportUtilsTest(test.TestCase): """ input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "classes": @@ -268,14 +289,14 @@ class SavedModelExportUtilsTest(test.TestCase): shape = tensor_shape_pb2.TensorShapeProto( dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_float = types_pb2.DataType.Value("DT_FLOAT") - expected_signature_def.inputs[ - signature_constants.CLASSIFY_INPUTS].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs[signature_constants.CLASSIFY_INPUTS].CopyFrom( + meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs[ signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-scores:0", dtype=dtype_float, + name="output-tensor-scores:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -290,8 +311,7 @@ class SavedModelExportUtilsTest(test.TestCase): """ input_tensors = { "input-1": - array_ops.placeholder( - dtypes.float32, 1, name="input-tensor-1") + array_ops.placeholder(dtypes.float32, 1, name="input-tensor-1") } output_tensors = { "classes": @@ -310,17 +330,18 @@ class SavedModelExportUtilsTest(test.TestCase): dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]) dtype_int64 = types_pb2.DataType.Value("DT_INT64") dtype_float = types_pb2.DataType.Value("DT_FLOAT") - expected_signature_def.inputs[ - "input-1"].CopyFrom( - meta_graph_pb2.TensorInfo( - name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) + expected_signature_def.inputs["input-1"].CopyFrom( + 
meta_graph_pb2.TensorInfo( + name="input-tensor-1:0", dtype=dtype_float, tensor_shape=shape)) expected_signature_def.outputs["classes"].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-classes:0", dtype=dtype_int64, + name="output-tensor-classes:0", + dtype=dtype_int64, tensor_shape=shape)) expected_signature_def.outputs["logits"].CopyFrom( meta_graph_pb2.TensorInfo( - name="output-tensor-logits:0", dtype=dtype_float, + name="output-tensor-logits:0", + dtype=dtype_float, tensor_shape=shape)) expected_signature_def.method_name = ( @@ -379,8 +400,9 @@ class SavedModelExportUtilsTest(test.TestCase): def test_get_output_alternatives_single_no_default(self): prediction_tensor = constant_op.constant(["bogus"]) provided_output_alternatives = { - "head-1": (constants.ProblemType.LINEAR_REGRESSION, - {"output": prediction_tensor}), + "head-1": (constants.ProblemType.LINEAR_REGRESSION, { + "output": prediction_tensor + }), } model_fn_ops = model_fn.ModelFnOps( model_fn.ModeKeys.INFER, @@ -390,10 +412,11 @@ class SavedModelExportUtilsTest(test.TestCase): output_alternatives, _ = saved_model_export_utils.get_output_alternatives( model_fn_ops) - self.assertEqual({"head-1": - (constants.ProblemType.LINEAR_REGRESSION, - {"output": prediction_tensor})}, - output_alternatives) + self.assertEqual({ + "head-1": (constants.ProblemType.LINEAR_REGRESSION, { + "output": prediction_tensor + }) + }, output_alternatives) def test_get_output_alternatives_multi_no_default(self): provided_output_alternatives = { @@ -424,10 +447,11 @@ class SavedModelExportUtilsTest(test.TestCase): output_alternatives, _ = saved_model_export_utils.get_output_alternatives( model_fn_ops) - self.assertEqual( - {"default_output_alternative": (constants.ProblemType.UNSPECIFIED, { - "some_output": prediction_tensor})}, - output_alternatives) + self.assertEqual({ + "default_output_alternative": (constants.ProblemType.UNSPECIFIED, { + "some_output": prediction_tensor + }) + }, output_alternatives) def test_get_output_alternatives_empty_provided_with_default(self): prediction_tensor = constant_op.constant(["bogus"]) @@ -452,10 +476,11 @@ class SavedModelExportUtilsTest(test.TestCase): output_alternatives, _ = saved_model_export_utils.get_output_alternatives( model_fn_ops) - self.assertEqual( - {"default_output_alternative": (constants.ProblemType.UNSPECIFIED, { - "some_output": prediction_tensor})}, - output_alternatives) + self.assertEqual({ + "default_output_alternative": (constants.ProblemType.UNSPECIFIED, { + "some_output": prediction_tensor + }) + }, output_alternatives) def test_get_output_alternatives_implicit_single(self): prediction_tensor = constant_op.constant(["bogus"]) @@ -506,14 +531,14 @@ class SavedModelExportUtilsTest(test.TestCase): expected_signature_defs = { "serving_default": - signature_def_utils.regression_signature_def(input_example, - output_1), + signature_def_utils.regression_signature_def( + input_example, output_1), "default_input_alternative:head-1": - signature_def_utils.regression_signature_def(input_example, - output_1), + signature_def_utils.regression_signature_def( + input_example, output_1), "default_input_alternative:head-2": - signature_def_utils.classification_signature_def(input_example, - output_2, None), + signature_def_utils.classification_signature_def( + input_example, output_2, None), "default_input_alternative:head-3": signature_def_utils.predict_signature_def({ "default input": input_example @@ -624,17 +649,20 @@ class SavedModelExportUtilsTest(test.TestCase): (most_recent_export_dir, 
most_recent_export_version) = ( saved_model_export_utils.get_most_recent_export(export_dir_base)) - self.assertEqual(compat.as_bytes(export_dir_4), - compat.as_bytes(most_recent_export_dir)) - self.assertEqual(compat.as_bytes(export_dir_4), - os.path.join(compat.as_bytes(export_dir_base), - compat.as_bytes( - str(most_recent_export_version)))) + self.assertEqual( + compat.as_bytes(export_dir_4), compat.as_bytes(most_recent_export_dir)) + self.assertEqual( + compat.as_bytes(export_dir_4), + os.path.join( + compat.as_bytes(export_dir_base), + compat.as_bytes(str(most_recent_export_version)))) def test_make_export_strategy(self): """Only tests that an ExportStrategy instance is created.""" + def _serving_input_fn(): return array_ops.constant([1]), None + export_strategy = saved_model_export_utils.make_export_strategy( serving_input_fn=_serving_input_fn, default_output_alternative_key="default", @@ -655,14 +683,61 @@ class SavedModelExportUtilsTest(test.TestCase): real_valued_col1 = fc.real_valued_column("real_valued_column1") bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) - feature_columns = [sparse_col, embedding_col, real_valued_col1, - bucketized_col1] + feature_columns = [ + sparse_col, embedding_col, real_valued_col1, bucketized_col1 + ] export_strategy = saved_model_export_utils.make_parsing_export_strategy( feature_columns=feature_columns) self.assertTrue( isinstance(export_strategy, export_strategy_lib.ExportStrategy)) + def test_make_best_model_export_strategy(self): + export_dir_base = tempfile.mkdtemp() + "export/" + gfile.MkDir(export_dir_base) + + test_estimator = TestEstimator() + export_strategy = saved_model_export_utils.make_best_model_export_strategy( + serving_input_fn=None, exports_to_keep=3, compare_fn=None) + + self.assertNotEqual("", + export_strategy.export(test_estimator, export_dir_base, + "fake_ckpt_0", {"loss": 100})) + self.assertNotEqual("", test_estimator.last_exported_dir) + self.assertNotEqual("", test_estimator.last_exported_checkpoint) + + self.assertEqual("", + export_strategy.export(test_estimator, export_dir_base, + "fake_ckpt_1", {"loss": 101})) + self.assertEqual(test_estimator.last_exported_dir, + os.path.join(export_dir_base, "fake_ckpt_0")) + + self.assertNotEqual("", + export_strategy.export(test_estimator, export_dir_base, + "fake_ckpt_2", {"loss": 10})) + self.assertEqual(test_estimator.last_exported_dir, + os.path.join(export_dir_base, "fake_ckpt_2")) + + self.assertEqual("", + export_strategy.export(test_estimator, export_dir_base, + "fake_ckpt_3", {"loss": 20})) + self.assertEqual(test_estimator.last_exported_dir, + os.path.join(export_dir_base, "fake_ckpt_2")) + + def test_make_best_model_export_strategy_exceptions(self): + export_dir_base = tempfile.mkdtemp() + "export/" + + test_estimator = TestEstimator() + export_strategy = saved_model_export_utils.make_best_model_export_strategy( + serving_input_fn=None, exports_to_keep=3, compare_fn=None) + + with self.assertRaises(ValueError): + export_strategy.export(test_estimator, export_dir_base, "", {"loss": 200}) + + with self.assertRaises(ValueError): + export_strategy.export(test_estimator, export_dir_base, "fake_ckpt_1", + None) + def _create_test_export_dir(export_dir_base): export_dir = saved_model_export_utils.get_timestamped_export_dir( diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index 00cde08bff1..9b959b43a9d 100644 --- 
a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -1496,6 +1496,15 @@ class StreamingAUCTest(test.TestCase): for _ in range(10): self.assertAlmostEqual(initial_auc, auc.eval(), 5) + def testPredictionsOutOfRange(self): + with self.test_session() as sess: + predictions = constant_op.constant( + [1, -1, 1, -1], shape=(1, 4), dtype=dtypes_lib.float32) + labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4)) + _, update_op = metrics.streaming_auc(predictions, labels) + sess.run(variables.local_variables_initializer()) + self.assertRaises(errors_impl.InvalidArgumentError, update_op.eval) + def testAllCorrect(self): self.allCorrectAsExpected('ROC') diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD index 338181e4cac..d6508362b8b 100644 --- a/tensorflow/contrib/nccl/BUILD +++ b/tensorflow/contrib/nccl/BUILD @@ -48,6 +48,8 @@ tf_cuda_cc_test( # Disabled on jenkins until errors finding nvmlShutdown are found. tags = [ "manual", + "no_oss", + "noguitar", # note: is run manually there "notap", ], deps = if_cuda( @@ -112,25 +114,26 @@ tf_custom_op_py_library( ], ) -# http://b/62064807 -# cuda_py_test( -# name = "nccl_ops_test", -# size = "small", -# srcs = ["python/ops/nccl_ops_test.py"], -# additional_deps = [ -# ":nccl_py", -# "//tensorflow/python:array_ops", -# "//tensorflow/python:client_testlib", -# "//tensorflow/python:framework_for_generated_wrappers", -# "//tensorflow/python:framework_test_lib", -# "//tensorflow/python:platform_test", -# ], -# # Disabled on jenkins until errors finding nvmlShutdown are found. -# tags = [ -# "manual", -# "notap", -# ], -# ) +cuda_py_test( + name = "nccl_ops_test", + size = "small", + srcs = ["python/ops/nccl_ops_test.py"], + additional_deps = [ + ":nccl_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], + # Disabled on jenkins until errors finding nvmlShutdown are found. + tags = [ + "manual", + "no_oss", + "noguitar", # note: is run manually there + "notap", + ], +) filegroup( name = "all_files", diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py index ae658e73227..1621e9f28e3 100644 --- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py +++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py @@ -43,7 +43,8 @@ class AllReduceTest(test.TestCase): self._testSingleAllReduce(sess, dtype, nccl.all_max, np.maximum) def _testSingleAllReduce(self, sess, np_type, nccl_fn, numpy_accumulation_fn): - for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: + for devices in [['/device:GPU:1', '/device:GPU:2', '/device:GPU:0'], + ['/device:GPU:1', '/device:GPU:0']]: shape = (3, 4) np_ans = None tensors = [] @@ -84,7 +85,8 @@ class BroadcastTest(test.TestCase): # Create session inside outer loop to test use of # same communicator across multiple sessions. 
with self.test_session(use_gpu=True) as sess: - for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: + for devices in [['/device:GPU:1', '/device:GPU:0', '/device:GPU:2'], + ['/device:GPU:1', '/device:GPU:0']]: shape = (3, 4) sender = np.random.randint(0, len(devices) - 1) with ops.device(devices[sender]): @@ -115,7 +117,8 @@ class CombinedTest(test.TestCase): # Create session inside outer loop to test use of # same communicator across multiple sessions. with self.test_session(use_gpu=True) as sess: - for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]: + for devices in [['/device:GPU:1', '/device:GPU:2', '/device:GPU:0'], + ['/device:GPU:0', '/device:GPU:1']]: shape = (3, 4) # all-reduce diff --git a/tensorflow/contrib/receptive_field/BUILD b/tensorflow/contrib/receptive_field/BUILD new file mode 100644 index 00000000000..ed2f3af08cb --- /dev/null +++ b/tensorflow/contrib/receptive_field/BUILD @@ -0,0 +1,71 @@ +# Description: +# Contains modules to compute receptive field parameters for CNN models. + +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load("//tensorflow:tensorflow.bzl", "py_test") + +# Transitive dependencies of this target will be included in the pip package. +py_library( + name = "receptive_field_pip", + deps = [ + ":graph_compute_order_py", + ":receptive_field_py", + ], +) + +py_library( + name = "graph_compute_order_py", + srcs = [ + "__init__.py", + "python/util/graph_compute_order.py", + ], + srcs_version = "PY2AND3", +) + +py_library( + name = "receptive_field_py", + srcs = [ + "__init__.py", + "python/util/receptive_field.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":graph_compute_order_py", + "//tensorflow/contrib/util:util_py", + "//tensorflow/python:platform", + ], +) + +py_test( + name = "receptive_field_test", + srcs = ["python/util/receptive_field_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":receptive_field_py", + "//tensorflow/contrib/framework:framework_py", + "//tensorflow/contrib/slim", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:dtypes", + "//tensorflow/python:nn", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/contrib/receptive_field/README.md b/tensorflow/contrib/receptive_field/README.md new file mode 100644 index 00000000000..b150b903b23 --- /dev/null +++ b/tensorflow/contrib/receptive_field/README.md @@ -0,0 +1,165 @@ +# Receptive field computation for convnets + +This library enables you to easily compute the receptive field parameters of +your favorite convnet. You can use it to understand how big of an input image +region your output features depend on. Better yet, using the parameters computed +by the library, you can easily find the exact image region which is used to +compute each convnet feature. + +## Basic usage + +The main function to be called is `compute_receptive_field_from_graph_def`, +which will return the receptive field, effective stride and effective padding +for both horizontal and vertical directions. 
+ +For example, if your model is constructed using the function +`my_model_construction()`, you can use the library as follows: + +```python +import tensorflow as tf +from tensorflow.contrib import receptive_field + +# Construct graph. +g = tf.Graph() +with g.as_default(): + images = tf.placeholder(tf.float32, shape=(1, None, None, 3), name='input_image') + my_model_construction(images) + +# Compute receptive field parameters. +rf_x, rf_y, eff_stride_x, eff_stride_y, eff_pad_x, eff_pad_y = \ + receptive_field.compute_receptive_field_from_graph_def( \ + g.as_graph_def(), 'input_image', 'my_output_endpoint') +``` + +Here's a simple example of computing the receptive field parameters for +Inception-Resnet-v2. To get this to work, be sure to checkout +[tensorflow/models](https://github.com/tensorflow/models), so that the Inception +models are available to you. This can be done in three simple commands: + +```sh +git clone https://github.com/tensorflow/models +cd models/slim +sudo python setup.py install_lib +``` + +You can then compute the receptive field parameters for Inception-Resnet-v2 as: + +```python +from nets import inception +import tensorflow as tf +from tensorflow.contrib import receptive_field + +# Construct graph. +g = tf.Graph() +with g.as_default(): + images = tf.placeholder(tf.float32, shape=(1, None, None, 3), name='input_image') + inception.inception_resnet_v2_base(images) + +# Compute receptive field parameters. +rf_x, rf_y, eff_stride_x, eff_stride_y, eff_pad_x, eff_pad_y = \ + receptive_field.compute_receptive_field_from_graph_def( \ + g.as_graph_def(), 'input_image', 'InceptionResnetV2/Conv2d_7b_1x1/Relu') +``` + +This will give you `rf_x = rf_y = 3039`, `eff_stride_x = eff_stride_y = 32`, and +`eff_pad_x = eff_pad_y = 1482`. This means that each feature that is output at +the node `'InceptionResnetV2/Conv2d_7b_1x1/Relu'` is computed from a region +which is of size `3039x3039`. Further, by using the expressions + +```python +center_x = -eff_pad_x + feature_x*eff_stride_x + (rf_x - 1)/2 +center_y = -eff_pad_y + feature_y*eff_stride_y + (rf_y - 1)/2 +``` + +one can compute the center of the region in the input image that is used to +compute the output feature at position `[feature_x, feature_y]`. For example, +the feature at position `[0, 2]` at the output of the layer +`'InceptionResnetV2/Conv2d_7b_1x1/Relu'` is centered in the original image in +the position `[37, 101]`. + +TODO: include link to derivations and definitions of different parameters. + +## Receptive field benchmark + +As you might expect, it is straightforward to run this library on the popular +convnets, and gather their receptive fields. We provide a python script which +does exactly that, available under `python/util/examples/rf_benchmark.py`. + +To get this to work, be sure to checkout +[tensorflow/models](https://github.com/tensorflow/models) (see the 3-command +instructions for this above). Then, simply: + +```sh +cd python/util/examples +python rf_benchmark.py --csv_path /tmp/rf_benchmark_results.csv +``` + +The script will write to stdout the receptive field parameters for many variants +of several popular convnets: AlexNet, VGG, ResNet, Inception, Mobilenet. They +are also written to the file `/tmp/rf_benchmark_results.csv`. + +TODO: include here a plot for receptive field sizes of different convnets. + +TODO: include table/link to pre-computed RF parameters. 
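+
+If you want to post-process the benchmark results programmatically, the CSV
+file can be read back with the standard `csv` module. A minimal sketch
+(assuming the benchmark was run with `--csv_path /tmp/rf_benchmark_results.csv`
+as above; the column names are the ones written by `rf_benchmark.py`):
+
+```python
+import csv
+
+# Print the receptive field size of each end point in the benchmark output.
+with open('/tmp/rf_benchmark_results.csv') as f:
+  for row in csv.DictReader(f):
+    print('%s %s: RF = %s x %s' % (row['CNN'], row['end_point'],
+                                   row['RF size hor'], row['RF size ver']))
+```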
+ +## Compute RF parameters from a graph pbtxt + +We also provide a utility to compute the receptive field parameters directly +from a graph protobuf file. + +Have a `graph.pbtxt` file and want to compute its receptive field parameters? We +got you covered. The only prerequisite is to install +[google/protobuf](https://github.com/google/protobuf), which you probably +already have if you're using tensorflow (otherwise, follow installation +instructions [here](https://github.com/google/protobuf/tree/master/python)). + +This should work: + +```sh +cd python/util/examples +python compute_rf.py \ + --graph_path /path/to/graph.pbtxt \ + --output_path /path/to/output/rf_info.txt \ + --input_node my_input_node \ + --output_node my_output_node +``` + +Don't know how to generate a graph protobuf file? Take a look at the +`write_inception_resnet_v2_graph.py` script, which shows how to save it for the +Inception-Resnet-v2 model: + +```sh +cd python/util/examples +python write_inception_resnet_v2_graph.py --graph_dir /tmp --graph_filename graph.pbtxt +``` + +This will write the Inception-Resnet-v2 graph protobuf to `/tmp/graph.pbtxt`. + +For completeness, here's how you would use this file to get the receptive field +parameters of the Inception-Resnet-v2 model: + +```sh +cd python/util/examples +python compute_rf.py \ + --graph_path /tmp/graph.pbtxt \ + --output_path /tmp/rf_info.txt \ + --input_node input_image \ + --output_node InceptionResnetV2/Conv2d_7b_1x1/Relu +``` + +This will write the receptive field parameters of the model to +`/tmp/rf_info.txt`, which will look like: + +```sh +Receptive field size (horizontal) = 3039 +Receptive field size (vertical) = 3039 +Effective stride (horizontal) = 32 +Effective stride (vertical) = 32 +Effective padding (horizontal) = 1482 +Effective padding (vertical) = 1482 +``` + +## Authors + +André Araujo (github id: andrefaraujo) and Mark Sandler (github id: +marksandler) diff --git a/tensorflow/contrib/receptive_field/__init__.py b/tensorflow/contrib/receptive_field/__init__.py new file mode 100644 index 00000000000..10745a6a53d --- /dev/null +++ b/tensorflow/contrib/receptive_field/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Module to compute receptive field parameters for CNN tensorflow models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=unused-import +from tensorflow.contrib.receptive_field.python.util.graph_compute_order import get_compute_order +from tensorflow.contrib.receptive_field.python.util.receptive_field import compute_receptive_field_from_graph_def +# pylint: enable=unused-import diff --git a/tensorflow/contrib/receptive_field/python/__init__.py b/tensorflow/contrib/receptive_field/python/__init__.py new file mode 100644 index 00000000000..217047f92d3 --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Module to compute receptive field parameters for CNN tensorflow models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py new file mode 100644 index 00000000000..1cf978b90a3 --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py @@ -0,0 +1,94 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Computes Receptive Field (RF) information given a graph protobuf. + +For an example of usage, see accompanying file compute_rf.sh +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys + +from google.protobuf import text_format + +from tensorflow.contrib import receptive_field +from tensorflow.core.framework import graph_pb2 +from tensorflow.python.platform import app +from tensorflow.python.platform import gfile +from tensorflow.python.platform import tf_logging as logging + +cmd_args = None + + +def _load_graphdef(path): + """Helper function to load GraphDef from file. + + Args: + path: Path to pbtxt file. + + Returns: + graph_def: A GraphDef object. 
+ """ + graph_def = graph_pb2.GraphDef() + pbstr = gfile.Open(path).read() + text_format.Parse(pbstr, graph_def) + return graph_def + + +def main(unused_argv): + + graph_def = _load_graphdef(cmd_args.graph_path) + + (receptive_field_x, receptive_field_y, effective_stride_x, effective_stride_y, + effective_padding_x, effective_padding_y + ) = receptive_field.compute_receptive_field_from_graph_def( + graph_def, cmd_args.input_node, cmd_args.output_node) + + logging.info('Receptive field size (horizontal) = %s', receptive_field_x) + logging.info('Receptive field size (vertical) = %s', receptive_field_y) + logging.info('Effective stride (horizontal) = %s', effective_stride_x) + logging.info('Effective stride (vertical) = %s', effective_stride_y) + logging.info('Effective padding (horizontal) = %s', effective_padding_x) + logging.info('Effective padding (vertical) = %s', effective_padding_y) + + f = gfile.GFile('%s' % cmd_args.output_path, 'w') + f.write('Receptive field size (horizontal) = %s\n' % receptive_field_x) + f.write('Receptive field size (vertical) = %s\n' % receptive_field_y) + f.write('Effective stride (horizontal) = %s\n' % effective_stride_x) + f.write('Effective stride (vertical) = %s\n' % effective_stride_y) + f.write('Effective padding (horizontal) = %s\n' % effective_padding_x) + f.write('Effective padding (vertical) = %s\n' % effective_padding_y) + f.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--graph_path', type=str, default='', help='Graph path (pbtxt format).') + parser.add_argument( + '--output_path', + type=str, + default='', + help='Path to output text file where RF information will be written to.') + parser.add_argument( + '--input_node', type=str, default='', help='Name of input node.') + parser.add_argument( + '--output_node', type=str, default='', help='Name of output node.') + cmd_args, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py new file mode 100644 index 00000000000..94228dfa61b --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py @@ -0,0 +1,460 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Computes Receptive Field (RF) information for different models. + +The receptive field (and related parameters) for the different models are +printed to stdout, and may also optionally be written to a CSV file. 
+ +For an example of usage, see rf_benchmark.sh +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import csv +import sys + +from nets import alexnet +from nets import inception +from nets import mobilenet_v1 +from nets import resnet_v1 +from nets import resnet_v2 +from nets import vgg +from tensorflow.contrib import framework +from tensorflow.contrib import receptive_field +from tensorflow.contrib import slim +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import app + +cmd_args = None + +# Input node name for all architectures. +_INPUT_NODE = 'input_image' + +# Variants of different network architectures. + +# - resnet: different versions and sizes. +_SUPPORTED_RESNET_VARIANTS = [ + 'resnet_v1_50', 'resnet_v1_101', 'resnet_v1_152', 'resnet_v1_200', + 'resnet_v2_50', 'resnet_v2_101', 'resnet_v2_152', 'resnet_v2_200' +] + +# - inception_resnet_v2: default, and version with SAME padding. +_SUPPORTED_INCEPTIONRESNETV2_VARIANTS = [ + 'inception_resnet_v2', 'inception_resnet_v2-same' +] + +# - inception_v2: default, and version with no separable conv. +_SUPPORTED_INCEPTIONV2_VARIANTS = [ + 'inception_v2', 'inception_v2-no-separable-conv' +] + +# - inception_v3: default version. +_SUPPORTED_INCEPTIONV3_VARIANTS = ['inception_v3'] + +# - inception_v4: default version. +_SUPPORTED_INCEPTIONV4_VARIANTS = ['inception_v4'] + +# - alexnet_v2: default version. +_SUPPORTED_ALEXNETV2_VARIANTS = ['alexnet_v2'] + +# - vgg: vgg_a (with 11 layers) and vgg_16 (version D). +_SUPPORTED_VGG_VARIANTS = ['vgg_a', 'vgg_16'] + +# - mobilenet_v1: 100% and 75%. +_SUPPORTED_MOBILENETV1_VARIANTS = ['mobilenet_v1', 'mobilenet_v1_075'] + + +def _construct_model(model_type='resnet_v1_50'): + """Constructs model for the desired type of CNN. + + Args: + model_type: Type of model to be used. + + Returns: + end_points: A dictionary from components of the network to the corresponding + activations. + + Raises: + ValueError: If the model_type is not supported. + """ + # Placeholder input. + images = array_ops.placeholder( + dtypes.float32, shape=(1, None, None, 3), name=_INPUT_NODE) + + # Construct model. 
+ if model_type == 'inception_resnet_v2': + _, end_points = inception.inception_resnet_v2_base(images) + elif model_type == 'inception_resnet_v2-same': + _, end_points = inception.inception_resnet_v2_base( + images, align_feature_maps=True) + elif model_type == 'inception_v2': + _, end_points = inception.inception_v2_base(images) + elif model_type == 'inception_v2-no-separable-conv': + _, end_points = inception.inception_v2_base( + images, use_separable_conv=False) + elif model_type == 'inception_v3': + _, end_points = inception.inception_v3_base(images) + elif model_type == 'inception_v4': + _, end_points = inception.inception_v4_base(images) + elif model_type == 'alexnet_v2': + _, end_points = alexnet.alexnet_v2(images) + elif model_type == 'vgg_a': + _, end_points = vgg.vgg_a(images) + elif model_type == 'vgg_16': + _, end_points = vgg.vgg_16(images) + elif model_type == 'mobilenet_v1': + _, end_points = mobilenet_v1.mobilenet_v1_base(images) + elif model_type == 'mobilenet_v1_075': + _, end_points = mobilenet_v1.mobilenet_v1_base( + images, depth_multiplier=0.75) + elif model_type == 'resnet_v1_50': + _, end_points = resnet_v1.resnet_v1_50( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v1_101': + _, end_points = resnet_v1.resnet_v1_101( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v1_152': + _, end_points = resnet_v1.resnet_v1_152( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v1_200': + _, end_points = resnet_v1.resnet_v1_200( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v2_50': + _, end_points = resnet_v2.resnet_v2_50( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v2_101': + _, end_points = resnet_v2.resnet_v2_101( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v2_152': + _, end_points = resnet_v2.resnet_v2_152( + images, num_classes=None, is_training=False, global_pool=False) + elif model_type == 'resnet_v2_200': + _, end_points = resnet_v2.resnet_v2_200( + images, num_classes=None, is_training=False, global_pool=False) + else: + raise ValueError('Unsupported model_type %s.' % model_type) + + return end_points + + +def _get_desired_end_point_keys(model_type='resnet_v1_50'): + """Gets list of desired end point keys for a type of CNN. + + Args: + model_type: Type of model to be used. + + Returns: + desired_end_point_types: A list containing the desired end-points. + + Raises: + ValueError: If the model_type is not supported. 
+ """ + if model_type in _SUPPORTED_RESNET_VARIANTS: + blocks = ['block1', 'block2', 'block3', 'block4'] + desired_end_point_keys = ['%s/%s' % (model_type, i) for i in blocks] + elif model_type in _SUPPORTED_INCEPTIONRESNETV2_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'MaxPool_3a_3x3', + 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3', 'Mixed_5b', + 'Mixed_6a', 'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1' + ] + elif model_type in _SUPPORTED_INCEPTIONV2_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3', + 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', 'Mixed_4a', 'Mixed_4b', + 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c' + ] + elif model_type in _SUPPORTED_INCEPTIONV3_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'MaxPool_3a_3x3', + 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3', 'Mixed_5b', + 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', + 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c' + ] + elif model_type in _SUPPORTED_INCEPTIONV4_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'Mixed_3a', + 'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_5e', + 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e', 'Mixed_6f', + 'Mixed_6g', 'Mixed_6h', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c', 'Mixed_7d' + ] + elif model_type in _SUPPORTED_ALEXNETV2_VARIANTS: + ep = ['conv1', 'pool1', 'conv2', 'conv3', 'conv4', 'conv5', 'pool5'] + desired_end_point_keys = ['%s/%s' % (model_type, i) for i in ep] + elif model_type in _SUPPORTED_VGG_VARIANTS: + ep = [ + 'conv1/conv1_1', 'pool1', 'conv2/conv2_1', 'pool2', 'conv3/conv3_1', + 'conv3/conv3_2', 'pool3', 'conv4/conv4_1', 'conv4/conv4_2', 'pool4', + 'conv5/conv5_1', 'conv5/conv5_2', 'pool5' + ] + desired_end_point_keys = ['%s/%s' % (model_type, i) for i in ep] + elif model_type in _SUPPORTED_MOBILENETV1_VARIANTS: + desired_end_point_keys = [ + 'Conv2d_0', 'Conv2d_1_pointwise', 'Conv2d_2_pointwise', + 'Conv2d_3_pointwise', 'Conv2d_4_pointwise', 'Conv2d_5_pointwise', + 'Conv2d_6_pointwise', 'Conv2d_7_pointwise', 'Conv2d_8_pointwise', + 'Conv2d_9_pointwise', 'Conv2d_10_pointwise', 'Conv2d_11_pointwise', + 'Conv2d_12_pointwise', 'Conv2d_13_pointwise' + ] + else: + raise ValueError('Unsupported model_type %s.' % model_type) + + return desired_end_point_keys + + +def _model_graph_def(model_type='resnet_v1_50', arg_sc=None): + """Constructs a model graph, returning GraphDef and end-points. + + Args: + model_type: Type of model to be used. + arg_sc: Optional arg scope to use in constructing the graph. + + Returns: + graph_def: GraphDef of constructed graph. + end_points: A dictionary from components of the network to the corresponding + activations. + """ + if arg_sc is None: + arg_sc = {} + g = ops.Graph() + with g.as_default(): + with framework.arg_scope(arg_sc): + end_points = _construct_model(model_type) + + return g.as_graph_def(), end_points + + +def _model_rf(graphdef, + end_points, + desired_end_point_keys, + model_type='resnet_v1_50', + csv_writer=None): + """Computes receptive field information for a given CNN model. + + The information will be printed to stdout. If the RF parameters are the same + for the horizontal and vertical directions, it will be printed only once. + Otherwise, they are printed once for the horizontal and once for the vertical + directions. 
+ + Args: + graphdef: GraphDef of given model. + end_points: A dictionary from components of the model to the corresponding + activations. + desired_end_point_keys: List of desired end points for which receptive field + information will be computed. + model_type: Type of model to be used, used only for printing purposes. + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for desired_end_point_key in desired_end_point_keys: + print('- %s:' % desired_end_point_key) + output_node_with_colon = end_points[desired_end_point_key].name + pos = output_node_with_colon.rfind(':') + output_node = output_node_with_colon[:pos] + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y + ) = receptive_field.compute_receptive_field_from_graph_def( + graphdef, _INPUT_NODE, output_node) + # If values are the same in horizontal/vertical directions, just report one + # of them. Otherwise, report both. + if (receptive_field_x == receptive_field_y) and ( + effective_stride_x == effective_stride_y) and ( + effective_padding_x == effective_padding_y): + print('Receptive field size = %5s, effective stride = %5s, effective ' + 'padding = %5s' % (str(receptive_field_x), str(effective_stride_x), + str(effective_padding_x))) + else: + print('Receptive field size: horizontal = %5s, vertical = %5s. ' + 'Effective stride: horizontal = %5s, vertical = %5s. Effective ' + 'padding: horizontal = %5s, vertical = %5s' % + (str(receptive_field_x), str(receptive_field_y), + str(effective_stride_x), str(effective_stride_y), + str(effective_padding_x), str(effective_padding_y))) + if csv_writer is not None: + csv_writer.writerow({ + 'CNN': model_type, + 'end_point': desired_end_point_key, + 'RF size hor': str(receptive_field_x), + 'RF size ver': str(receptive_field_y), + 'effective stride hor': str(effective_stride_x), + 'effective stride ver': str(effective_stride_y), + 'effective padding hor': str(effective_padding_x), + 'effective padding ver': str(effective_padding_y) + }) + + +def _process_model_rf(model_type='resnet_v1_50', csv_writer=None, arg_sc=None): + """Contructs model graph and desired end-points, and compute RF. + + The computed RF parameters are printed to stdout by the _model_rf function. + + Args: + model_type: Type of model to be used. + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + arg_sc: Optional arg scope to use in constructing the graph. + + """ + print('********************%s' % model_type) + graphdef, end_points = _model_graph_def(model_type, arg_sc) + desired_end_point_keys = _get_desired_end_point_keys(model_type) + _model_rf(graphdef, end_points, desired_end_point_keys, model_type, + csv_writer) + + +def _resnet_rf(csv_writer=None): + """Computes RF and associated parameters for resnet models. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_RESNET_VARIANTS: + arg_sc = resnet_v1.resnet_arg_scope() + _process_model_rf(model_type, csv_writer, arg_sc) + + +def _inception_resnet_v2_rf(csv_writer=None): + """Computes RF and associated parameters for the inception_resnet_v2 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. 
+ """ + for model_type in _SUPPORTED_INCEPTIONRESNETV2_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _inception_v2_rf(csv_writer=None): + """Computes RF and associated parameters for the inception_v2 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_INCEPTIONV2_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _inception_v3_rf(csv_writer=None): + """Computes RF and associated parameters for the inception_v3 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_INCEPTIONV3_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _inception_v4_rf(csv_writer=None): + """Computes RF and associated parameters for the inception_v4 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_INCEPTIONV4_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _alexnet_v2_rf(csv_writer=None): + """Computes RF and associated parameters for the alexnet_v2 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_ALEXNETV2_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _vgg_rf(csv_writer=None): + """Computes RF and associated parameters for the vgg model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_VGG_VARIANTS: + _process_model_rf(model_type, csv_writer) + + +def _mobilenet_v1_rf(csv_writer=None): + """Computes RF and associated parameters for the mobilenet_v1 model. + + The computed values are written to stdout. + + Args: + csv_writer: A CSV writer for RF parameters, which is used if it is not None. + """ + for model_type in _SUPPORTED_MOBILENETV1_VARIANTS: + with slim.arg_scope( + [slim.batch_norm, slim.dropout], is_training=False) as arg_sc: + _process_model_rf(model_type, csv_writer, arg_sc) + + +def main(unused_argv): + # Configure CSV file which will be written, if desired. + if cmd_args.csv_path: + csv_file = open(cmd_args.csv_path, 'w') + field_names = [ + 'CNN', 'end_point', 'RF size hor', 'RF size ver', + 'effective stride hor', 'effective stride ver', 'effective padding hor', + 'effective padding ver' + ] + rf_writer = csv.DictWriter(csv_file, fieldnames=field_names) + rf_writer.writeheader() + else: + rf_writer = None + + # Compute RF parameters for each network architecture. + _alexnet_v2_rf(rf_writer) + _vgg_rf(rf_writer) + _inception_v2_rf(rf_writer) + _inception_v3_rf(rf_writer) + _inception_v4_rf(rf_writer) + _inception_resnet_v2_rf(rf_writer) + _mobilenet_v1_rf(rf_writer) + _resnet_rf(rf_writer) + + # Close CSV file, if it was opened. 
+ if cmd_args.csv_path: + csv_file.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--csv_path', + type=str, + default='', + help="""\ + Path to CSV file that will be written with RF parameters.If empty, no + file will be written.\ + """) + cmd_args, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py b/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py new file mode 100644 index 00000000000..793ae163d80 --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py @@ -0,0 +1,61 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Simple script to write Inception-ResNet-v2 model to graph file. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys + +from nets import inception +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import graph_io +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import app + +cmd_args = None + + +def main(unused_argv): + # Model definition. + g = ops.Graph() + with g.as_default(): + images = array_ops.placeholder( + dtypes.float32, shape=(1, None, None, 3), name='input_image') + inception.inception_resnet_v2_base(images) + + graph_io.write_graph(g.as_graph_def(), cmd_args.graph_dir, + cmd_args.graph_filename) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--graph_dir', + type=str, + default='/tmp', + help='Directory where graph will be saved.') + parser.add_argument( + '--graph_filename', + type=str, + default='graph.pbtxt', + help='Filename of graph that will be saved.') + cmd_args, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py new file mode 100644 index 00000000000..8af4be16d6c --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py @@ -0,0 +1,88 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Library to compute order of computations in a graph. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + + +class GraphDefHelper(object): + """Helper class to collect node names and definitions. + + Example: + b = GraphDefHelper(graph_def) + # Prints node that produces given output. + print b.output_of['conv/foo/bar'] + """ + + def __init__(self, gd): + self.output_of = {} + for each in gd.node: + self.output_of[each.name] = each + + +# pylint: disable=invalid-name +_NodeEntry = collections.namedtuple('NodeEntry', field_names=['order', 'node']) + + +def _get_computed_nodes(g, output, seen): + """Traverses the graph in topological order. + + Args: + g: GraphDefHelper object. + output: current node. + seen: map of nodes we've already traversed. + Returns: + order in topological sort for 'output'. + """ + if output in seen: + return seen[output].order + node_def = g.output_of.get(output, None) + if node_def is None: + seen[output] = _NodeEntry(0, None) + return 0 + + r = 0 + for each in node_def.input: + # Parses name of input node. + if each.startswith('^'): + each = each[1:] + each = each.split(':')[0] + # Recursively computes ordering. + new_v = _get_computed_nodes(g, each, seen) + r = max(r, new_v + 1) + + seen[output] = _NodeEntry(r, node_def) + + return seen[output].order + + +def get_compute_order(graph_def): + """Computes order of computation for a given graph. + + Args: + graph_def: GraphDef object. + Returns: + map: name -> {order, node} + """ + helper = GraphDefHelper(graph_def) + seen = collections.defaultdict(_NodeEntry) + for each in graph_def.node: + _get_computed_nodes(helper, each.name, seen) + return seen diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field.py b/tensorflow/contrib/receptive_field/python/util/receptive_field.py new file mode 100644 index 00000000000..db190a1a416 --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/receptive_field.py @@ -0,0 +1,485 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions to compute receptive field of a fully-convolutional network. 
+ +Please refer to the following g3doc for detailed explanation on how this +computation is performed, and why it is important: +g3doc/photos/vision/features/delf/g3doc/rf_computation.md +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +from tensorflow.contrib.receptive_field.python.util import graph_compute_order +from tensorflow.contrib.util import make_ndarray +from tensorflow.python.platform import tf_logging as logging + +# White-listed layer operations, which do not affect the receptive field +# computation. +_UNCHANGED_RF_LAYER_OPS = [ + "Softplus", "Relu", "BiasAdd", "Mul", "Add", "Const", "Identity", + "VariableV2", "Sub", "Rsqrt", "ConcatV2" +] + +# Different ways in which padding modes may be spelled. +_VALID_PADDING = ["VALID", b"VALID"] +_SAME_PADDING = ["SAME", b"SAME"] + + +def _stride_size(node): + """Computes stride size given a TF node. + + Args: + node: Tensorflow node (NodeDef proto). + + Returns: + stride_x: Stride size for horizontal direction (integer). + stride_y: Stride size for vertical direction (integer). + """ + strides_attr = node.attr["strides"] + logging.vlog(4, "strides_attr = %s", strides_attr) + stride_y = strides_attr.list.i[1] + stride_x = strides_attr.list.i[2] + return stride_x, stride_y + + +def _conv_kernel_size(node, name_to_order_node): + """Computes kernel size given a TF convolution or pooling node. + + Args: + node: Tensorflow node (NodeDef proto). + name_to_order_node: Map from name to {order, node}. Output of + graph_compute_order.get_compute_order(). + + Returns: + kernel_size_x: Kernel size for horizontal direction (integer). + kernel_size_y: Kernel size for vertical direction (integer). + + Raises: + ValueError: If the weight layer node is invalid. + """ + weights_layer_read_name = node.input[1] + if not weights_layer_read_name.endswith("/read"): + raise ValueError( + "Weight layer's name input to conv layer does not end with '/read'") + weights_layer_param_name = weights_layer_read_name[:-5] + weights_node = name_to_order_node[weights_layer_param_name].node + if weights_node.op != "VariableV2": + raise ValueError("Weight layer is not of type VariableV2") + shape = weights_node.attr["shape"] + logging.vlog(4, "weight shape = %s", shape) + kernel_size_y = shape.shape.dim[0].size + kernel_size_x = shape.shape.dim[1].size + return kernel_size_x, kernel_size_y + + +def _padding_size_conv_pool(node, kernel_size, stride): + """Computes padding size given a TF convolution or pooling node. + + Args: + node: Tensorflow node (NodeDef proto). + kernel_size: Kernel size of node (integer). + stride: Stride size of node (integer). + + Returns: + padding: Padding size (integer). + + Raises: + ValueError: If padding is invalid. + """ + # In this case, we need to carefully consider the different TF padding modes. + # The padding depends on kernel size, and may depend on input size. If it + # depends on input size, we raise an exception. 
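+  # Cases handled below: 'VALID' -> 0. 'SAME' with kernel size 1 -> 0; with
+  # stride 1, or with stride 2 and an even kernel size -> floor((kernel - 1) / 2).
+  # For any other 'SAME' combination the padding depends on the input size, so
+  # it is returned as None (and a warning is logged) rather than a fixed value.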
+ padding_attr = node.attr["padding"] + logging.vlog(4, "padding_attr = %s", padding_attr) + if padding_attr.s in _VALID_PADDING: + padding = 0 + elif padding_attr.s in _SAME_PADDING: + if kernel_size == 1: + padding = 0 + elif stride == 1: + padding = int(math.floor((float(kernel_size) - 1) / 2)) + elif stride == 2 and kernel_size % 2 == 0: + padding = int(math.floor((float(kernel_size) - 1) / 2)) + else: + padding = None + logging.warning( + "Padding depends on input size, which means that the effective " + "padding may be different depending on the input image " + "dimensionality. In this case, alignment check will be skipped.") + else: + raise ValueError("Invalid padding operation %s" % padding_attr.s) + return padding + + +def _pool_kernel_size(node): + """Computes kernel size given a TF pooling node. + + Args: + node: Tensorflow node (NodeDef proto). + + Returns: + kernel_size_x: Kernel size for horizontal direction (integer). + kernel_size_y: Kernel size for vertical direction (integer). + + Raises: + ValueError: If pooling is invalid. + """ + ksize = node.attr["ksize"] + kernel_size_y = ksize.list.i[1] + kernel_size_x = ksize.list.i[2] + if ksize.list.i[0] != 1: + raise ValueError("pool ksize for first dim is not 1") + if ksize.list.i[3] != 1: + raise ValueError("pool ksize for last dim is not 1") + return kernel_size_x, kernel_size_y + + +def _padding_size_pad_layer(node, name_to_order_node): + """Computes padding size given a TF padding node. + + Args: + node: Tensorflow node (NodeDef proto). + name_to_order_node: Map from name to {order, node}. Output of + graph_compute_order.get_compute_order(). + + Returns: + padding_x: Padding size for horizontal direction (integer). + padding_y: Padding size for vertical direction (integer). + + Raises: + ValueError: If padding layer is invalid. + """ + paddings_layer_name = node.input[1] + if not paddings_layer_name.endswith("/paddings"): + raise ValueError("Padding layer name does not end with '/paddings'") + paddings_node = name_to_order_node[paddings_layer_name].node + if paddings_node.op != "Const": + raise ValueError("Padding op is not Const") + value = paddings_node.attr["value"] + t = make_ndarray(value.tensor) + padding_y = t[1][0] + padding_x = t[2][0] + if t[0][0] != 0: + raise ValueError("padding is not zero for first tensor dim") + if t[3][0] != 0: + raise ValueError("padding is not zero for last tensor dim") + return padding_x, padding_y + + +def _get_layer_params(node, name_to_order_node): + """Gets layer parameters relevant for RF computation. + + Currently, only these nodes are supported: + - Conv2D + - DepthwiseConv2dNative + - Pad + - MaxPool + - AvgPool + - all nodes listed in _UNCHANGED_RF_LAYER_OPS + + Args: + node: Tensorflow node (NodeDef proto). + name_to_order_node: Map from name to {order, node}. Output of + graph_compute_order.get_compute_order(). + + Returns: + kernel_size_x: Kernel size for horizontal direction (integer). + kernel_size_y: Kernel size for vertical direction (integer). + stride_x: Stride size for horizontal direction (integer). + stride_y: Stride size for vertical direction (integer). + padding_x: Padding size for horizontal direction (integer). + padding_y: Padding size for vertical direction (integer). + + Raises: + ValueError: If layer op is unknown. 
+ """ + logging.vlog(3, "node.op = %s", node.op) + logging.vlog(4, "node = %s", node) + if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative": + stride_x, stride_y = _stride_size(node) + kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_order_node) + # Compute the padding for this node separately for each direction. + padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x) + padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y) + elif node.op == "Pad": + # Kernel and stride are simply 1 in this case. + kernel_size_x = 1 + kernel_size_y = 1 + stride_x = 1 + stride_y = 1 + padding_x, padding_y = _padding_size_pad_layer(node, name_to_order_node) + elif node.op == "MaxPool" or node.op == "AvgPool": + stride_x, stride_y = _stride_size(node) + kernel_size_x, kernel_size_y = _pool_kernel_size(node) + # Compute the padding for this node separately for each direction. + padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x) + padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y) + elif node.op in _UNCHANGED_RF_LAYER_OPS: + # These nodes do not modify the RF parameters. + kernel_size_x = 1 + kernel_size_y = 1 + stride_x = 1 + stride_y = 1 + padding_x = 0 + padding_y = 0 + else: + raise ValueError("Unknown layer op: %s" % node.op) + return kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y + + +def _reverse_sort_by_order(name_to_order_node): + """Sorts map of name_to_order_node nodes in reverse order. + + The output is such that the nodes in name_to_order_node are sorted in + descending order of the "order" field. + + Args: + name_to_order_node: Map from name to {order, node}. Output of + graph_compute_order.get_compute_order(). + + Returns: + sorted_name_to_order_node: Sorted version of the input, in descending order. + """ + return sorted(name_to_order_node.items(), key=lambda x: -x[1].order) + + +def _get_rf_size_node_input(stride, kernel_size, rf_size_output): + """Computes RF size at the input of a given layer. + + Args: + stride: Stride of given layer (integer). + kernel_size: Kernel size of given layer (integer). + rf_size_output: RF size at output of given layer (integer). + + Returns: + rf_size_input: RF size at input of given layer (integer). + """ + return stride * rf_size_output + kernel_size - stride + + +def _get_effective_stride_node_input(stride, effective_stride_output): + """Computes effective stride at the input of a given layer. + + Args: + stride: Stride of given layer (integer). + effective_stride_output: Effective stride at output of given layer + (integer). + + Returns: + effective_stride_input: Effective stride at input of given layer + (integer). + """ + return stride * effective_stride_output + + +def _get_effective_padding_node_input(stride, padding, + effective_padding_output): + """Computes effective padding at the input of a given layer. + + Args: + stride: Stride of given layer (integer). + padding: Padding of given layer (integer). + effective_padding_output: Effective padding at output of given layer + (integer). + + Returns: + effective_padding_input: Effective padding at input of given layer + (integer). + """ + return stride * effective_padding_output + padding + + +def compute_receptive_field_from_graph_def(graph_def, input_node, output_node): + """Computes receptive field (RF) parameters from a GraphDef object. + + Args: + graph_def: GraphDef object. + input_node: Name of the input node from graph. + output_node: Name of the output node from graph. 
+ + Returns: + rf_size_x: Receptive field size of network in the horizontal direction, with + respect to specified input and output. + rf_size_y: Receptive field size of network in the vertical direction, with + respect to specified input and output. + effective_stride_x: Effective stride of network in the horizontal direction, + with respect to specified input and output. + effective_stride_y: Effective stride of network in the vertical direction, + with respect to specified input and output. + effective_padding_x: Effective padding of network in the horizontal + direction, with respect to specified input and output. + effective_padding_y: Effective padding of network in the vertical + direction, with respect to specified input and output. + + Raises: + ValueError: If network is not aligned or if either input or output nodes + cannot be found. For network criterion alignment, see + photos/vision/features/delf/g3doc/rf_computation.md + """ + # Computes order of computation for a given graph. + name_to_order_node = graph_compute_order.get_compute_order( + graph_def=graph_def) + + # Sort in reverse topological order. + order = _reverse_sort_by_order(name_to_order_node) + + # Dictionaries to keep track of receptive field, effective stride and + # effective padding of different nodes. + rf_sizes_x = {} + rf_sizes_y = {} + effective_strides_x = {} + effective_strides_y = {} + effective_paddings_x = {} + effective_paddings_y = {} + + # Initialize dicts for output_node. + rf_sizes_x[output_node] = 1 + rf_sizes_y[output_node] = 1 + effective_strides_x[output_node] = 1 + effective_strides_y[output_node] = 1 + effective_paddings_x[output_node] = 0 + effective_paddings_y[output_node] = 0 + + # Flag to denote if we found output node yet. If we have not, we skip nodes + # until the output node is found. + found_output_node = False + + # Flag to denote if padding is undefined. This happens when SAME padding mode + # is used in conjunction with stride and kernel sizes which make it such that + # the padding to be applied would depend on the input size. In this case, + # alignment checks are skipped, and the effective padding is None. + undefined_padding = False + + for _, (o, node) in order: + if node: + logging.vlog(3, "%10d %-100s %-20s" % (o, node.name[:90], node.op)) + else: + continue + + # When we find input node, we can stop. + if node.name == input_node: + break + + # Loop until we find the output node. All nodes before finding the output + # one are irrelevant, so they can be skipped. + if not found_output_node: + if node.name == output_node: + found_output_node = True + + if found_output_node: + if node.name not in rf_sizes_x: + assert node.name not in rf_sizes_y, ("Node %s is in rf_sizes_y, but " + "not in rf_sizes_x" % node.name) + # In this case, node is not relevant since it's not part of the + # computation we're interested in. + logging.vlog(3, "Irrelevant node %s, skipping it...", node.name) + continue + + # Get params for this layer. + kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y = ( + _get_layer_params(node, name_to_order_node)) + logging.vlog(3, "kernel_size_x = %s, kernel_size_y = %s, " + "stride_x = %s, stride_y = %s, " + "padding_x = %s, padding_y = %s" % + (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, + padding_y)) + if padding_x is None or padding_y is None: + undefined_padding = True + + # Get parameters at input of this layer which may or may not be propagated + # to the input layers. 
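+      # The helpers below implement the standard per-layer recurrences, where
+      # s, k and p are this layer's stride, kernel size and padding:
+      #   rf_size_input         = s * rf_size_output + (k - s)
+      #   effective_stride_in   = s * effective_stride_out
+      #   effective_padding_in  = s * effective_padding_out + p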
+ rf_size_input_x = _get_rf_size_node_input(stride_x, kernel_size_x, + rf_sizes_x[node.name]) + rf_size_input_y = _get_rf_size_node_input(stride_y, kernel_size_y, + rf_sizes_y[node.name]) + effective_stride_input_x = _get_effective_stride_node_input( + stride_x, effective_strides_x[node.name]) + effective_stride_input_y = _get_effective_stride_node_input( + stride_y, effective_strides_y[node.name]) + if not undefined_padding: + effective_padding_input_x = _get_effective_padding_node_input( + stride_x, padding_x, effective_paddings_x[node.name]) + effective_padding_input_y = _get_effective_padding_node_input( + stride_y, padding_y, effective_paddings_y[node.name]) + else: + effective_padding_input_x = None + effective_padding_input_y = None + + # Loop over this node's inputs and potentially propagate information down. + for inp_name in node.input: + logging.vlog(4, "inp_name = %s", inp_name) + inp_node = name_to_order_node[inp_name].node + logging.vlog(4, "inp_node = \n%s", inp_node) + if inp_node.name in rf_sizes_x: + assert inp_node.name in rf_sizes_y, ( + "Node %s is in rf_sizes_x, but " + "not in rf_sizes_y" % inp_node.name) + # This node was already discovered through a previous path, so we need + # to make sure that graph is aligned. This alignment check is skipped + # if the padding is not defined, since in this case alignment cannot + # be checked. + if not undefined_padding: + if effective_strides_x[inp_node.name] != effective_stride_input_x: + raise ValueError( + "Graph is not aligned since effective stride from different " + "paths is different in horizontal direction") + if effective_strides_y[inp_node.name] != effective_stride_input_y: + raise ValueError( + "Graph is not aligned since effective stride from different " + "paths is different in vertical direction") + if (rf_sizes_x[inp_node.name] - 1 + ) / 2 - effective_paddings_x[inp_node.name] != ( + rf_size_input_x - 1) / 2 - effective_padding_input_x: + raise ValueError( + "Graph is not aligned since center shift from different " + "paths is different in horizontal direction") + if (rf_sizes_y[inp_node.name] - 1 + ) / 2 - effective_paddings_y[inp_node.name] != ( + rf_size_input_y - 1) / 2 - effective_padding_input_y: + raise ValueError( + "Graph is not aligned since center shift from different " + "paths is different in vertical direction") + # Keep track of path with largest RF, for both directions. + if rf_sizes_x[inp_node.name] < rf_size_input_x: + rf_sizes_x[inp_node.name] = rf_size_input_x + effective_strides_x[inp_node.name] = effective_stride_input_x + effective_paddings_x[inp_node.name] = effective_padding_input_x + if rf_sizes_y[inp_node.name] < rf_size_input_y: + rf_sizes_y[inp_node.name] = rf_size_input_y + effective_strides_y[inp_node.name] = effective_stride_input_y + effective_paddings_y[inp_node.name] = effective_padding_input_y + else: + assert inp_node.name not in rf_sizes_y, ( + "Node %s is in rf_sizes_y, but " + "not in rf_sizes_x" % inp_node.name) + # In this case, it is the first time we encounter this node. So we + # propagate the RF parameters. 
+ rf_sizes_x[inp_node.name] = rf_size_input_x + rf_sizes_y[inp_node.name] = rf_size_input_y + effective_strides_x[inp_node.name] = effective_stride_input_x + effective_strides_y[inp_node.name] = effective_stride_input_y + effective_paddings_x[inp_node.name] = effective_padding_input_x + effective_paddings_y[inp_node.name] = effective_padding_input_y + + if not found_output_node: + raise ValueError("Output node was not found") + if input_node not in rf_sizes_x: + raise ValueError("Input node was not found") + return (rf_sizes_x[input_node], rf_sizes_y[input_node], + effective_strides_x[input_node], effective_strides_y[input_node], + effective_paddings_x[input_node], effective_paddings_y[input_node]) diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py b/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py new file mode 100644 index 00000000000..2771389250b --- /dev/null +++ b/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py @@ -0,0 +1,225 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for receptive_fields module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib import slim +from tensorflow.contrib.receptive_field.python.util import receptive_field +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn +from tensorflow.python.platform import test + + +def create_test_network_1(): + """Aligned network for test. + + The graph corresponds to the example from the second figure in + go/cnn-rf-computation#arbitrary-computation-graphs + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. + x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Left branch. + l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID') + # Right branch. + l2_pad = array_ops.pad(x, [[0, 0], [1, 0], [1, 0], [0, 0]]) + l2 = slim.conv2d(l2_pad, 1, [3, 3], stride=2, scope='L2', padding='VALID') + l3 = slim.conv2d(l2, 1, [1, 1], stride=2, scope='L3', padding='VALID') + # Addition. + nn.relu(l1 + l3, name='output') + return g + + +def create_test_network_2(): + """Aligned network for test. + + The graph corresponds to a variation to the example from the second figure in + go/cnn-rf-computation#arbitrary-computation-graphs. Layers 2 and 3 are changed + to max-pooling operations. Since the functionality is the same as convolution, + the network is aligned and the receptive field size is the same as from the + network created using create_test_network_1(). + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. 
+ x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Left branch. + l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID') + # Right branch. + l2_pad = array_ops.pad(x, [[0, 0], [1, 0], [1, 0], [0, 0]]) + l2 = slim.max_pool2d(l2_pad, [3, 3], stride=2, scope='L2', padding='VALID') + l3 = slim.max_pool2d(l2, [1, 1], stride=2, scope='L3', padding='VALID') + # Addition. + nn.relu(l1 + l3, name='output') + return g + + +def create_test_network_3(): + """Misaligned network for test. + + The graph corresponds to the example from the first figure in + go/cnn-rf-computation#arbitrary-computation-graphs + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. + x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Left branch. + l1_pad = array_ops.pad(x, [[0, 0], [2, 1], [2, 1], [0, 0]]) + l1 = slim.conv2d(l1_pad, 1, [5, 5], stride=2, scope='L1', padding='VALID') + # Right branch. + l2 = slim.conv2d(x, 1, [3, 3], stride=1, scope='L2', padding='VALID') + l3 = slim.conv2d(l2, 1, [3, 3], stride=1, scope='L3', padding='VALID') + # Addition. + nn.relu(l1 + l3, name='output') + return g + + +def create_test_network_4(): + """Misaligned network for test. + + The graph corresponds to a variation from the example from the second figure + in go/cnn-rf-computation#arbitrary-computation-graphs. Layer 2 uses 'SAME' + padding, which makes its padding dependent on the input image dimensionality. + In this case, the effective padding will be undetermined, and the utility is + not able to check the network alignment. + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. + x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Left branch. + l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID') + # Right branch. + l2 = slim.conv2d(x, 1, [3, 3], stride=2, scope='L2', padding='SAME') + l3 = slim.conv2d(l2, 1, [1, 1], stride=2, scope='L3', padding='VALID') + # Addition. + nn.relu(l1 + l3, name='output') + return g + + +def create_test_network_5(): + """Single-path network for testing non-square kernels. + + The graph is similar to the right branch of the graph from + create_test_network_1(), except that the kernel sizes are changed to be + non-square. + + Returns: + g: Tensorflow graph object (Graph proto). + """ + g = ops.Graph() + with g.as_default(): + # An 8x8 test image. + x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image') + # Two convolutional layers, where the first one has non-square kernel. + l1 = slim.conv2d(x, 1, [3, 5], stride=2, scope='L1', padding='VALID') + l2 = slim.conv2d(l1, 1, [3, 1], stride=2, scope='L2', padding='VALID') + # ReLU. 
+ nn.relu(l2, name='output') + return g + + +class RfUtilsTest(test.TestCase): + + def testComputeRFFromGraphDefAligned(self): + graph_def = create_test_network_1().as_graph_def() + input_node = 'input_image' + output_node = 'output' + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y) = ( + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node)) + self.assertEqual(receptive_field_x, 3) + self.assertEqual(receptive_field_y, 3) + self.assertEqual(effective_stride_x, 4) + self.assertEqual(effective_stride_y, 4) + self.assertEqual(effective_padding_x, 1) + self.assertEqual(effective_padding_y, 1) + + def testComputeRFFromGraphDefAligned2(self): + graph_def = create_test_network_2().as_graph_def() + input_node = 'input_image' + output_node = 'output' + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y) = ( + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node)) + self.assertEqual(receptive_field_x, 3) + self.assertEqual(receptive_field_y, 3) + self.assertEqual(effective_stride_x, 4) + self.assertEqual(effective_stride_y, 4) + self.assertEqual(effective_padding_x, 1) + self.assertEqual(effective_padding_y, 1) + + def testComputeRFFromGraphDefUnaligned(self): + graph_def = create_test_network_3().as_graph_def() + input_node = 'input_image' + output_node = 'output' + with self.assertRaises(ValueError): + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node) + + def testComputeRFFromGraphDefUnaligned2(self): + graph_def = create_test_network_4().as_graph_def() + input_node = 'input_image' + output_node = 'output' + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y) = ( + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node)) + self.assertEqual(receptive_field_x, 3) + self.assertEqual(receptive_field_y, 3) + self.assertEqual(effective_stride_x, 4) + self.assertEqual(effective_stride_y, 4) + self.assertEqual(effective_padding_x, None) + self.assertEqual(effective_padding_y, None) + + def testComputeRFFromGraphDefNonSquareRF(self): + graph_def = create_test_network_5().as_graph_def() + input_node = 'input_image' + output_node = 'output' + (receptive_field_x, receptive_field_y, effective_stride_x, + effective_stride_y, effective_padding_x, effective_padding_y) = ( + receptive_field.compute_receptive_field_from_graph_def( + graph_def, input_node, output_node)) + self.assertEqual(receptive_field_x, 5) + self.assertEqual(receptive_field_y, 7) + self.assertEqual(effective_stride_x, 4) + self.assertEqual(effective_stride_y, 4) + self.assertEqual(effective_padding_x, 0) + self.assertEqual(effective_padding_y, 0) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD index 598e6513aeb..865fb72a55b 100644 --- a/tensorflow/contrib/stateless/BUILD +++ b/tensorflow/contrib/stateless/BUILD @@ -21,6 +21,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":stateless_random_ops", + "//tensorflow/python:framework", "//tensorflow/python:util", ], ) diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py index 82e5d36ce44..ca937546f50 100644 --- a/tensorflow/contrib/stateless/__init__.py +++ b/tensorflow/contrib/stateless/__init__.py 
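The non-square receptive field asserted in testComputeRFFromGraphDefNonSquareRF above follows directly from composing the two layers of create_test_network_5. A hand computation for reference (illustrative only, not library code):

```
# L1: [3, 5] kernel, stride 2; L2: [3, 1] kernel, stride 2; both 'VALID'.
k1_h, k1_w, s1 = 3, 5, 2
k2_h, k2_w, s2 = 3, 1, 2
rf_h = k1_h + (k2_h - 1) * s1   # 3 + 2*2 = 7 -> receptive_field_y
rf_w = k1_w + (k2_w - 1) * s1   # 5 + 0*2 = 5 -> receptive_field_x
stride = s1 * s2                # 4 -> effective_stride_x and effective_stride_y
```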
@@ -34,5 +34,11 @@ from __future__ import print_function # pylint: disable=wildcard-import from tensorflow.contrib.stateless.gen_stateless_random_ops import * +from tensorflow.python.framework import ops from tensorflow.python.util.all_util import remove_undocumented + +ops.NotDifferentiable("StatelessRandomNormal") +ops.NotDifferentiable("StatelessRandomUniform") +ops.NotDifferentiable("StatelessTruncatedNormal") + remove_undocumented(__name__) diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD new file mode 100644 index 00000000000..bc305022642 --- /dev/null +++ b/tensorflow/contrib/summary/BUILD @@ -0,0 +1,59 @@ +licenses(["notice"]) # Apache 2.0 + +exports_files([ + "LICENSE", +]) + +load( + "//tensorflow:tensorflow.bzl", + "py_test", + "tf_gen_op_wrapper_py", +) + +tf_gen_op_wrapper_py( + name = "gen_summary_ops", + out = "gen_summary_ops.py", + deps = ["//tensorflow/core:summary_ops_op_lib"], +) + +py_test( + name = "summary_ops_test", + srcs = ["summary_ops_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":summary_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform", + "//tensorflow/python:training", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:test", + ], +) + +py_library( + name = "summary_ops", + srcs = ["summary_ops.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + ":gen_summary_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:summary_op_util", + "//tensorflow/python:training", + "//tensorflow/python/eager:context", + ], +) + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py new file mode 100644 index 00000000000..05e627adf1c --- /dev/null +++ b/tensorflow/contrib/summary/summary_ops.py @@ -0,0 +1,159 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Operations to emit summaries.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.summary import gen_summary_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import summary_op_util +from tensorflow.python.training import training_util + + +# Name for a collection which is expected to have at most a single boolean +# Tensor. If this tensor is True the summary ops will record summaries. 
+_SHOULD_RECORD_SUMMARIES_NAME = "ShouldRecordSummaries" + + +def should_record_summaries(): + """Returns boolean Tensor which is true if summaries should be recorded.""" + should_record_collection = ops.get_collection(_SHOULD_RECORD_SUMMARIES_NAME) + if not should_record_collection: + return constant_op.constant(False) + if len(should_record_collection) != 1: + raise ValueError( + "More than one tensor specified for whether summaries " + "should be recorded: %s" % should_record_collection) + return should_record_collection[0] + + +# TODO(apassos) consider how to handle local step here. +def record_summaries_every_n_global_steps(n): + """Sets the should_record_summaries Tensor to true if global_step % n == 0.""" + collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME) + collection_ref[:] = [training_util.get_global_step() % n == 0] + + +def always_record_summaries(): + """Sets the should_record_summaries Tensor to always true.""" + collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME) + collection_ref[:] = [constant_op.constant(True)] + + +def never_record_summaries(): + """Sets the should_record_summaries Tensor to always false.""" + collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME) + collection_ref[:] = [constant_op.constant(False)] + + +def create_summary_file_writer(logdir, + max_queue=None, + flush_secs=None, + filename_suffix=None): + """Creates a summary file writer in the current context.""" + if max_queue is None: + max_queue = constant_op.constant(10) + if flush_secs is None: + flush_secs = constant_op.constant(120) + if filename_suffix is None: + filename_suffix = constant_op.constant("") + resource = gen_summary_ops.summary_writer() + gen_summary_ops.create_summary_file_writer(resource, logdir, max_queue, + flush_secs, filename_suffix) + context.context().summary_writer_resource = resource + + +def _nothing(): + """Convenient else branch for when summaries do not record.""" + return + + +def generic(name, tensor, metadata, family=None): + """Writes a tensor summary if possible.""" + + def record(): + with summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + gen_summary_ops.write_summary(context.context().summary_writer_resource, + training_util.get_global_step(), tensor, + tag, metadata, name=scope) + return control_flow_ops.cond(should_record_summaries(), record, _nothing) + + +def scalar(name, tensor, family=None): + """Writes a scalar summary if possible.""" + + def record(): + with summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + gen_summary_ops.write_scalar_summary( + context.context().summary_writer_resource, + training_util.get_global_step(), tag, tensor, name=scope) + + return control_flow_ops.cond(should_record_summaries(), record, _nothing) + + +def histogram(name, tensor, family=None): + """Writes a histogram summary if possible.""" + + def record(): + with summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + gen_summary_ops.write_histogram_summary( + context.context().summary_writer_resource, + training_util.get_global_step(), tag, tensor, name=scope) + + return control_flow_ops.cond(should_record_summaries(), record, _nothing) + + +def image(name, tensor, bad_color=None, max_images=3, family=None): + """Writes an image summary if possible.""" + + def record(): + if bad_color is None: + bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8) + with summary_op_util.summary_scope( + name, family, 
values=[tensor]) as (tag, scope): + gen_summary_ops.write_image_summary( + context.context().summary_writer_resource, + training_util.get_global_step(), tag, tensor, bad_color_, max_images, + name=scope) + + return control_flow_ops.cond(should_record_summaries(), record, _nothing) + + +def audio(name, tensor, sample_rate, max_outputs, family=None): + """Writes an audio summary if possible.""" + + def record(): + with summary_op_util.summary_scope( + name, family, values=[tensor]) as (tag, scope): + gen_summary_ops.write_audio_summary( + context.context().summary_writer_resource, + training_util.get_global_step(), + tag, + tensor, + sample_rate=sample_rate, + max_outputs=max_outputs, + name=scope) + + return control_flow_ops.cond(should_record_summaries(), record, _nothing) diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py new file mode 100644 index 00000000000..56c1a16f7f0 --- /dev/null +++ b/tensorflow/contrib/summary/summary_ops_test.py @@ -0,0 +1,52 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tempfile + +from tensorflow.contrib.summary import summary_ops +from tensorflow.python.eager import test +from tensorflow.python.framework import test_util +from tensorflow.python.platform import gfile +from tensorflow.python.training import training_util + + +class TargetTest(test_util.TensorFlowTestCase): + + def testShouldRecordSummary(self): + self.assertFalse(summary_ops.should_record_summaries().numpy()) + summary_ops.always_record_summaries() + self.assertTrue(summary_ops.should_record_summaries().numpy()) + + def testSummaryOps(self): + training_util.get_or_create_global_step() + logdir = tempfile.mkdtemp() + summary_ops.create_summary_file_writer(logdir, max_queue=0) + summary_ops.always_record_summaries() + summary_ops.generic('tensor', 1, '') + summary_ops.scalar('scalar', 2.0) + summary_ops.histogram('histogram', [1.0]) + summary_ops.image('image', [[[[1.0]]]]) + summary_ops.audio('audio', [[1.0]], 1.0, 1) + # The working condition of the ops is tested in the C++ test so we just + # test here that we're calling them correctly. 
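Besides the always-record path exercised in this test, the module above also supports gating writes on the global step. A minimal sketch of that pattern (assuming eager execution and using only the functions defined above; not an official recipe):

```
import tempfile
from tensorflow.contrib.summary import summary_ops
from tensorflow.python.training import training_util

training_util.get_or_create_global_step()
summary_ops.create_summary_file_writer(tempfile.mkdtemp(), max_queue=0)
summary_ops.record_summaries_every_n_global_steps(100)
# scalar() wraps the write in cond(should_record_summaries(), ...), so the
# write op only runs on steps where global_step % 100 == 0.
summary_ops.scalar('loss', 0.5)
```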
+ self.assertTrue(gfile.Exists(logdir)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 7d1325e0466..c952288704a 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -39,7 +39,6 @@ py_library( deps = [ ":tpu_lib", ":tpu_py", - "//tensorflow/contrib/learn", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_for_generated_wrappers", diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD index 9157c79a907..a567d1bbb08 100644 --- a/tensorflow/contrib/tpu/profiler/BUILD +++ b/tensorflow/contrib/tpu/profiler/BUILD @@ -14,19 +14,30 @@ tf_proto_library_cc( visibility = ["//visibility:public"], ) -cc_binary( - name = "capture_tpu_profile", - srcs = ["capture_tpu_profile.cc"], - visibility = ["//tensorflow/contrib/tpu/profiler:__subpackages__"], +cc_library( + name = "dump_tpu_profile", + srcs = ["dump_tpu_profile.cc"], + hdrs = ["dump_tpu_profile.h"], deps = [ ":op_profile_proto_cc", ":tpu_profiler_proto_cc", ":trace_events_proto_cc", ":trace_events_to_json", "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + ], +) + +cc_binary( + name = "capture_tpu_profile", + srcs = ["capture_tpu_profile.cc"], + visibility = ["//tensorflow/contrib/tpu/profiler:__subpackages__"], + deps = [ + ":dump_tpu_profile", + ":tpu_profiler_proto_cc", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", "//tensorflow/core/distributed_runtime/rpc:grpc_util", "//tensorflow/core/platform/cloud:gcs_file_system", "@grpc//:grpc++_unsecure", diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc index a0dc15249f7..5b51a72ece8 100644 --- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc @@ -24,22 +24,12 @@ limitations under the License. 
#include #include -#include "tensorflow/contrib/tpu/profiler/op_profile.pb.h" +#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h" #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h" -#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h" -#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" -#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/io/compression.h" -#include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" -#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/util/command_line_flags.h" -#include "tensorflow/core/util/events_writer.h" namespace tensorflow { namespace tpu { @@ -47,16 +37,6 @@ namespace { using ::tensorflow::TPUProfiler; -using ::grpc::ClientContext; -using ::tensorflow::io::JoinPath; -using ::tensorflow::protobuf::util::JsonOptions; -using ::tensorflow::protobuf::util::MessageToJsonString; - -constexpr char kProfilePluginDirectory[] = "plugins/profile/"; -constexpr char kJsonOpProfileFileName[] = "op_profile.json"; -constexpr char kProtoTraceFileName[] = "trace"; -constexpr char kJsonTraceFileName[] = "trace.json.gz"; -constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph."; constexpr uint64 kMaxEvents = 1000000; string GetCurrentTimeStampAsString() { @@ -66,65 +46,13 @@ string GetCurrentTimeStampAsString() { return s; } -Status WriteGzippedDataToFile(const string& filename, const string& data) { - std::unique_ptr file; - TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile(filename, &file)); - io::ZlibCompressionOptions options = io::ZlibCompressionOptions::GZIP(); - io::ZlibOutputBuffer buffer(file.get(), options.input_buffer_size, - options.output_buffer_size, options); - TF_RETURN_IF_ERROR(buffer.Init()); - TF_RETURN_IF_ERROR(buffer.Append(data)); - TF_RETURN_IF_ERROR(buffer.Close()); - TF_RETURN_IF_ERROR(file->Close()); - return Status::OK(); -} - -// Dumps profile data to /plugins/profile//. -inline string CreateProfileRunDirectory(const string& logdir, - const string& run) { - string run_dir = JoinPath(logdir, kProfilePluginDirectory, run); - TF_CHECK_OK(Env::Default()->RecursivelyCreateDir(run_dir)); - return run_dir; -} - -void DumpTraceToLogDirectory(StringPiece run_dir, const string& encoded_trace) { - string proto_path = JoinPath(run_dir, kProtoTraceFileName); - TF_CHECK_OK(WriteStringToFile(Env::Default(), proto_path, encoded_trace)); - LOG(INFO) << "Dumped raw-proto trace data to " << proto_path; - - string json_path = JoinPath(run_dir, kJsonTraceFileName); - Trace trace; - trace.ParseFromString(encoded_trace); - std::cout << "Trace contains " << trace.trace_events_size() << " events." - << std::endl; - TF_CHECK_OK(WriteGzippedDataToFile(json_path, TraceEventsToJson(trace))); - std::cout << "Dumped JSON trace data to " << json_path << std::endl; -} - -void DumpOpProfileToLogDirectory(StringPiece run_dir, - const tpu::op_profile::Profile& profile) { - string path = JoinPath(run_dir, kJsonOpProfileFileName); - string json; - JsonOptions options; - options.always_print_primitive_fields = true; - auto status = MessageToJsonString(profile, &json, options); - if (!status.ok()) { - std::cerr << "Failed to convert op profile to json. Skipping... 
" - << status.error_message() << std::endl; - return; - } - TF_CHECK_OK(WriteStringToFile(Env::Default(), path, json)); - std::cout << "Dumped json op profile data to " << path << std::endl; -} - ProfileResponse Profile(const string& service_addr, int duration_ms) { ProfileRequest request; request.set_duration_ms(duration_ms); request.set_max_events(kMaxEvents); std::cout << "Limiting the number of trace events to " << kMaxEvents << std::endl; - ProfileResponse response; - ClientContext context; + ::grpc::ClientContext context; ::grpc::ChannelArguments channel_args; // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available. channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, @@ -132,20 +60,11 @@ ProfileResponse Profile(const string& service_addr, int duration_ms) { std::unique_ptr stub = TPUProfiler::NewStub(::grpc::CreateCustomChannel( service_addr, ::grpc::InsecureChannelCredentials(), channel_args)); + ProfileResponse response; TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response))); return response; } -void DumpGraph(StringPiece logdir, StringPiece run, const string& graph_def) { - // The graph plugin expects the graph in //. - string run_dir = JoinPath(logdir, strings::StrCat(kGraphRunPrefix, run)); - TF_CHECK_OK(Env::Default()->RecursivelyCreateDir(run_dir)); - EventsWriter event_writer(JoinPath(run_dir, "events")); - Event event; - event.set_graph_def(graph_def); - event_writer.WriteEvent(event); -} - } // namespace } // namespace tpu } // namespace tensorflow @@ -176,35 +95,8 @@ int main(int argc, char** argv) { tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms); // Use the current timestamp as the run name. tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString(); - tensorflow::string run_dir = - tensorflow::tpu::CreateProfileRunDirectory(FLAGS_logdir, run); - // Ignore computation_graph for now. - if (response.encoded_trace().empty()) { - std::cout << "No trace event is collected during the " << duration_ms - << "ms interval." << std::endl; - } else { - LOG(INFO) << "Converting trace events to TraceViewer JSON."; - tensorflow::tpu::DumpTraceToLogDirectory(run_dir, response.encoded_trace()); - } - int num_graphs = response.computation_graph_size(); - if (num_graphs > 0) { - // The server might generates multiple graphs for one program; we simply - // pick the first one. - if (num_graphs > 1) { - std::cout << num_graphs - << " TPU program variants observed over the profiling period. " - << "One computation graph will be chosen arbitrarily." - << std::endl; - } - tensorflow::tpu::DumpGraph( - FLAGS_logdir, run, response.computation_graph(0).SerializeAsString()); - } - if (response.has_op_profile() && - (response.op_profile().has_by_program_structure() || - response.op_profile().has_by_category())) { - tensorflow::tpu::DumpOpProfileToLogDirectory(run_dir, - response.op_profile()); - } + TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile( + FLAGS_logdir, run, response, &std::cout)); // Print this at the end so that it's not buried in irrelevant LOG messages. std::cout << "NOTE: using the trace duration " << duration_ms << "ms." << std::endl diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc new file mode 100644 index 00000000000..7541a5291d1 --- /dev/null +++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc @@ -0,0 +1,164 @@ +/* Copyright 2017 The TensorFlow Authors All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h" + +#include +#include +#include + +#include "tensorflow/contrib/tpu/profiler/op_profile.pb.h" +#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h" +#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/compression.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/util/event.pb.h" +#include "tensorflow/core/util/events_writer.h" + +namespace tensorflow { +namespace tpu { +namespace { + +using ::tensorflow::io::JoinPath; +using ::tensorflow::protobuf::util::JsonOptions; +using ::tensorflow::protobuf::util::MessageToJsonString; + +constexpr char kProfilePluginDirectory[] = "plugins/profile/"; +constexpr char kJsonOpProfileFileName[] = "op_profile.json"; +constexpr char kProtoTraceFileName[] = "trace"; +constexpr char kJsonTraceFileName[] = "trace.json.gz"; +constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph."; + +Status WriteGzippedDataToFile(const string& filename, const string& data) { + std::unique_ptr file; + TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile(filename, &file)); + io::ZlibCompressionOptions options = io::ZlibCompressionOptions::GZIP(); + io::ZlibOutputBuffer buffer(file.get(), options.input_buffer_size, + options.output_buffer_size, options); + TF_RETURN_IF_ERROR(buffer.Init()); + TF_RETURN_IF_ERROR(buffer.Append(data)); + TF_RETURN_IF_ERROR(buffer.Close()); + TF_RETURN_IF_ERROR(file->Close()); + return Status::OK(); +} + +Status DumpTraceToLogDirectory(StringPiece run_dir, const string& encoded_trace, + std::ostream* os) { + string proto_path = JoinPath(run_dir, kProtoTraceFileName); + TF_RETURN_IF_ERROR( + WriteStringToFile(Env::Default(), proto_path, encoded_trace)); + LOG(INFO) << "Dumped raw-proto trace data to " << proto_path; + + string json_path = JoinPath(run_dir, kJsonTraceFileName); + Trace trace; + trace.ParseFromString(encoded_trace); + *os << "Trace contains " << trace.trace_events_size() << " events." 
+ << std::endl; + TF_RETURN_IF_ERROR( + WriteGzippedDataToFile(json_path, TraceEventsToJson(trace))); + *os << "Dumped JSON trace data to " << json_path << std::endl; + return Status::OK(); +} + +Status DumpOpProfileToLogDirectory(StringPiece run_dir, + const tpu::op_profile::Profile& profile, + std::ostream* os) { + string path = JoinPath(run_dir, kJsonOpProfileFileName); + string json; + JsonOptions options; + options.always_print_primitive_fields = true; + auto status = MessageToJsonString(profile, &json, options); + if (!status.ok()) { + return errors::Internal( + "Failed to convert op profile to json. Skipping... ", + string(status.error_message())); + } + TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json)); + *os << "Dumped json op profile data to " << path << std::endl; + return Status::OK(); +} + +Status DumpGraphEvents(const string& logdir, const string& run, + const ProfileResponse& response, std::ostream* os) { + int num_graphs = response.computation_graph_size(); + if (response.computation_graph_size() == 0) return Status::OK(); + // The server might generates multiple graphs for one program; we simply + // pick the first one. + if (num_graphs > 1) { + *os << num_graphs + << " TPU program variants observed over the profiling period. " + << "One computation graph will be chosen arbitrarily." << std::endl; + } + // The graph plugin expects the graph in //. + string run_dir = JoinPath(logdir, strings::StrCat(kGraphRunPrefix, run)); + TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(run_dir)); + EventsWriter event_writer(JoinPath(run_dir, "events")); + Event event; + // Add the computation graph. + event.set_graph_def(response.computation_graph(0).SerializeAsString()); + event_writer.WriteEvent(event); + *os << "Wrote a HLO graph to " << event_writer.FileName() << std::endl; + + if (response.has_hlo_metadata()) { + tensorflow::TaggedRunMetadata tagged_run_metadata; + tagged_run_metadata.set_tag(run); + tagged_run_metadata.set_run_metadata( + response.hlo_metadata().SerializeAsString()); + tensorflow::Event meta_event; + *meta_event.mutable_tagged_run_metadata() = tagged_run_metadata; + event_writer.WriteEvent(meta_event); + *os << "Wrote HLO ops run metadata to " << event_writer.FileName() + << std::endl; + } + return Status::OK(); +} + +} // namespace + +Status WriteTensorboardTPUProfile(const string& logdir, const string& run, + const ProfileResponse& response, + std::ostream* os) { + // Dumps profile data to /plugins/profile//. + string profile_run_dir = JoinPath(logdir, kProfilePluginDirectory, run); + TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir)); + // Ignore computation_graph for now. + if (response.encoded_trace().empty()) { + *os << "No trace event is collected." 
<< std::endl; + } else { + LOG(INFO) << "Converting trace events to TraceViewer JSON."; + TF_RETURN_IF_ERROR( + DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os)); + } + if (response.has_op_profile() && + (response.op_profile().has_by_program_structure() || + response.op_profile().has_by_category())) { + TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir, + response.op_profile(), os)); + } + + TF_RETURN_IF_ERROR(DumpGraphEvents(logdir, run, response, os)); + + return Status::OK(); +} + +} // namespace tpu +} // namespace tensorflow diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h new file mode 100644 index 00000000000..65b92aa4186 --- /dev/null +++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_ +#define THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_ + +#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tpu { + +// Dumps all profiling tool data in a TPU profile to a TensorBoard log directory +// with the given run name. This writes user-facing log messages to `os`. +// The following tools are supported: +// - Trace viewer +// - Op profile +// - HLO computation graph +Status WriteTensorboardTPUProfile(const string& logdir, const string& run, + const ProfileResponse& response, + std::ostream* os); + +} // namespace tpu +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_ diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto index d0a27f1a3d5..88e86eca3b6 100644 --- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto +++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto @@ -2,6 +2,7 @@ syntax = "proto3"; package tensorflow; import "tensorflow/core/framework/graph.proto"; +import "tensorflow/core/protobuf/config.proto"; import "tensorflow/contrib/tpu/profiler/op_profile.proto"; // The TPUProfiler service retrieves performance information about @@ -31,6 +32,10 @@ message ProfileResponse { // Graphs of programs executed on TPUs during the profiling period. repeated GraphDef computation_graph = 2; + // Performance profile that can be used to annotate HLO operations in the + // computation graph. + RunMetadata hlo_metadata = 5; + // Encoded Trace proto message that contains metadata about the trace captured // during the profiling period. Describes the devices and resources that // 'trace_events' refers to. @@ -40,4 +45,5 @@ message ProfileResponse { // If the trace covers multiple programs, the longest-running one is analyzed. // See op_profile.proto for the detailed semantics of the returned profile. 
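For reference, a successful WriteTensorboardTPUProfile call with run name <run> (see dump_tpu_profile.cc above) leaves the collected data laid out as follows, per the path constants in that file; the events file name below is abbreviated, since EventsWriter appends its own suffix:

```
<logdir>/plugins/profile/<run>/trace            raw Trace proto
<logdir>/plugins/profile/<run>/trace.json.gz    gzipped TraceViewer JSON
<logdir>/plugins/profile/<run>/op_profile.json  op profile JSON
<logdir>/tpu_profiler.hlo_graph.<run>/events*   HLO GraphDef and run-metadata events
```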
tpu.op_profile.Profile op_profile = 4; + // next-field: 6 } diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 7c883ec9266..6748a765623 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -102,10 +102,12 @@ def _increase_eval_step_op(iterations_per_loop): use_locking=True) -def _tpu_job(run_config): +def _tpu_job(run_config, mode): # The tpu job is determined by the run_config. Right now, this method is # required as tpu_config is not part of the RunConfig. - return None if run_config.master in ['', 'local'] else 'tpu_worker' + master = (run_config.evaluation_master if mode == model_fn_lib.ModeKeys.EVAL + else run_config.master) + return None if master in ['', 'local'] else 'tpu_worker' def _is_running_on_cpu(use_tpu, mode, eval_batch_size): @@ -265,9 +267,9 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): dequeue. """ - def __init__(self, run_config, enqueue_fn, dequeue_ops=None): + def __init__(self, run_config, mode, enqueue_fn, dequeue_ops=None): self._iterations = run_config.tpu_config.iterations_per_loop - self._tpu_job = _tpu_job(run_config) + self._tpu_job = _tpu_job(run_config, mode) self._enqueue_fn = enqueue_fn self._dequeue_ops = dequeue_ops @@ -899,7 +901,7 @@ class _EvalMetrics(object): """ num_shards = run_config.tpu_config.num_shards - job = _tpu_job(run_config) + job = _tpu_job(run_config, model_fn_lib.ModeKeys.EVAL) job_device = '' if job is None else ('/job:%s' % job) # For each i, dequeue_ops[i] is a list containing the tensors from all @@ -978,18 +980,20 @@ class TPUEstimator(estimator_lib.Estimator): Example (MNIST): ``` + # The metric Fn which runs on CPU. + def metric_fn(labels, logits): + predictions = tf.argmax(logits, 1) + return { + 'accuracy': tf.metrics.precision( + labels=labels, predictions=predictions), + } + + # Your model Fn which runs on TPU. def model_fn(features, labels, mode, config, params): ... logits = ... if mode = tf.estimator.ModeKeys.EVAL: - def metric_fn(labels, logits): - predictions = tf.argmax(logits, 1) - return { - 'precision': tf.metrics.precision( - labels=labels, predictions=predictions), - } - return tpu_estimator.TPUEstimatorSpec( mode=mode, loss=loss, @@ -1162,7 +1166,7 @@ class TPUEstimator(estimator_lib.Estimator): with ops.device('/device:CPU:0'): return input_fn(**kwargs) - job = _tpu_job(config) + job = _tpu_job(config, mode) def placement_function(index): if job is None: return '/replica:0/task:0/device:CPU:0' @@ -1190,13 +1194,14 @@ class TPUEstimator(estimator_lib.Estimator): # TODO(b/64607814): Ensure batch_axis works with nested structures. def _create_infeed_enqueue_ops_and_dequeue_fn(inputs_holder, run_config, - batch_axis): + batch_axis, mode): """Utility to convert input_fn to enqueue and dequeue fns for TPU. Args: inputs_holder: An `_InputsHolder` holding features and labels. run_config: A `RunConfig` instance. batch_axis: A python list of batch dimensions. 
+ mode: ModeKeys Returns: A tuple of (dequeue_fn, enqueue_fn) @@ -1239,7 +1244,7 @@ def _create_infeed_enqueue_ops_and_dequeue_fn(inputs_holder, run_config, return infeed_queue.generate_enqueue_ops( sharded_inputs, tpu_ordinal_function=tpu_ordinal_function) else: - job = _tpu_job(run_config) + job = _tpu_job(run_config, mode) def placement_function(index): if job is None: return '/replica:0/task:0/device:CPU:0' @@ -1271,12 +1276,12 @@ def _augment_model_fn(model_fn, train_batch_size, eval_batch_size, use_tpu, num_shards=config.tpu_config.num_shards) dequeue_fn, enqueue_fn = _create_infeed_enqueue_ops_and_dequeue_fn( - inputs, config, batch_axis) + inputs, config, batch_axis, mode) if mode == model_fn_lib.ModeKeys.TRAIN: loss = _train_on_tpu_system(model_fn_wrapper, dequeue_fn) hooks = [ - TPUInfeedOutfeedSessionHook(config, enqueue_fn), + TPUInfeedOutfeedSessionHook(config, mode, enqueue_fn), training.LoggingTensorHook( {'loss': array_ops.identity(loss), 'step': training.get_global_step()}, @@ -1318,7 +1323,7 @@ def _augment_model_fn(model_fn, train_batch_size, eval_batch_size, use_tpu, eval_metric_ops.to_metric_metric_ops_for_tpu( config, dummy_update_op)) hooks = [ - TPUInfeedOutfeedSessionHook(config, enqueue_fn, eval_update_ops), + TPUInfeedOutfeedSessionHook(config, mode, enqueue_fn, eval_update_ops), ] return model_fn_lib.EstimatorSpec( diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index de9eb057e48..9db2ed830f4 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -122,6 +122,7 @@ load( "tf_additional_gpu_tracer_cuda_deps", "tf_pyclif_proto_library", "tf_jspb_proto_library", + "tf_nano_proto_library", ) load( "//tensorflow/core:platform/default/build_config_root.bzl", @@ -212,6 +213,15 @@ tf_jspb_proto_library( deps = [":protos_all_cc"], ) +tf_nano_proto_library( + name = "protos_all_nano_proto", + field_style = "accessors", + generate_equals = 1, + generate_intdefs = 1, + visibility = ["//visibility:public"], + deps = [":protos_all_cc"], +) + exports_files([ "framework/types.proto", ]) @@ -556,6 +566,7 @@ tf_gen_op_libs( "state_ops", "stateless_random_ops", "string_ops", + "summary_ops", "training_ops", ], ) @@ -766,6 +777,7 @@ cc_library( "//tensorflow/core/kernels:state", "//tensorflow/core/kernels:stateless_random_ops", "//tensorflow/core/kernels:string", + "//tensorflow/core/kernels:summary_kernels", "//tensorflow/core/kernels:training_ops", "//tensorflow/core/kernels:word2vec_kernels", ] + tf_additional_cloud_kernel_deps() + if_not_windows([ @@ -2144,8 +2156,6 @@ tf_cc_tests( "platform/port_test.cc", "platform/profile_utils/cpu_utils_test.cc", "platform/subprocess_test.cc", - "platform/vmodule_benchmark_test.cc", - "platform/vmodule_test.cc", ], deps = [ ":lib", @@ -3079,7 +3089,6 @@ cc_test( srcs = ["example/example_parser_configuration_test.cc"], data = [":example_parser_configuration_testdata"], deps = [ - ":core", ":core_cpu", ":core_cpu_internal", ":direct_session_internal", diff --git a/tensorflow/core/example/feature_util.cc b/tensorflow/core/example/feature_util.cc index 6f3cc6c6c5d..f0593ede82f 100644 --- a/tensorflow/core/example/feature_util.cc +++ b/tensorflow/core/example/feature_util.cc @@ -18,77 +18,129 @@ limitations under the License. 
namespace tensorflow { namespace internal { - -::tensorflow::Feature& ExampleFeature(const string& name, - ::tensorflow::Example* example) { - ::tensorflow::Features* features = example->mutable_features(); - return (*features->mutable_feature())[name]; +Feature& ExampleFeature(const string& name, Example* example) { + return *GetFeature(name, example); } -} // namespace internal +} // namespace internal template <> -bool ExampleHasFeature(const string& name, - const Example& example) { - auto it = example.features().feature().find(name); - return (it != example.features().feature().end()) && +bool HasFeature<>(const string& key, const Features& features) { + return (features.feature().find(key) != features.feature().end()); +} + +template <> +bool HasFeature(const string& key, const Features& features) { + auto it = features.feature().find(key); + return (it != features.feature().end()) && (it->second.kind_case() == Feature::KindCase::kInt64List); } template <> -bool ExampleHasFeature(const string& name, const Example& example) { - auto it = example.features().feature().find(name); - return (it != example.features().feature().end()) && +bool HasFeature(const string& key, const Features& features) { + auto it = features.feature().find(key); + return (it != features.feature().end()) && (it->second.kind_case() == Feature::KindCase::kFloatList); } template <> -bool ExampleHasFeature(const string& name, const Example& example) { - auto it = example.features().feature().find(name); - return (it != example.features().feature().end()) && +bool HasFeature(const string& key, const Features& features) { + auto it = features.feature().find(key); + return (it != features.feature().end()) && (it->second.kind_case() == Feature::KindCase::kBytesList); } +bool HasFeatureList(const string& key, + const SequenceExample& sequence_example) { + auto& feature_list = sequence_example.feature_lists().feature_list(); + return (feature_list.find(key) != feature_list.end()); +} + template <> const protobuf::RepeatedField& GetFeatureValues( - const string& name, const Example& example) { - return example.features().feature().at(name).int64_list().value(); + const Feature& feature) { + return feature.int64_list().value(); } template <> protobuf::RepeatedField* GetFeatureValues( - const string& name, Example* example) { - return internal::ExampleFeature(name, example) - .mutable_int64_list() - ->mutable_value(); + Feature* feature) { + return feature->mutable_int64_list()->mutable_value(); } template <> const protobuf::RepeatedField& GetFeatureValues( - const string& name, const Example& example) { - return example.features().feature().at(name).float_list().value(); + const Feature& feature) { + return feature.float_list().value(); } template <> -protobuf::RepeatedField* GetFeatureValues(const string& name, - Example* example) { - return internal::ExampleFeature(name, example) - .mutable_float_list() - ->mutable_value(); +protobuf::RepeatedField* GetFeatureValues(Feature* feature) { + return feature->mutable_float_list()->mutable_value(); } template <> const protobuf::RepeatedPtrField& GetFeatureValues( - const string& name, const Example& example) { - return example.features().feature().at(name).bytes_list().value(); + const Feature& feature) { + return feature.bytes_list().value(); } template <> -protobuf::RepeatedPtrField* GetFeatureValues(const string& name, - Example* example) { - return internal::ExampleFeature(name, example) - .mutable_bytes_list() - ->mutable_value(); +protobuf::RepeatedPtrField* 
GetFeatureValues(Feature* feature) { + return feature->mutable_bytes_list()->mutable_value(); } +const protobuf::RepeatedPtrField& GetFeatureList( + const string& key, const SequenceExample& sequence_example) { + return sequence_example.feature_lists().feature_list().at(key).feature(); +} + +protobuf::RepeatedPtrField* GetFeatureList( + const string& feature_list_key, SequenceExample* sequence_example) { + return (*sequence_example->mutable_feature_lists() + ->mutable_feature_list())[feature_list_key] + .mutable_feature(); +} + +template <> +Features* GetFeatures(Features* proto) { + return proto; +} + +template <> +Features* GetFeatures(Example* proto) { + return proto->mutable_features(); +} + +template <> +const Features& GetFeatures(const Features& proto) { + return proto; +} + +template <> +const Features& GetFeatures(const Example& proto) { + return proto.features(); +} + +template <> +const protobuf::RepeatedField& GetFeatureValues( + const Feature& feature); + +template <> +protobuf::RepeatedField* GetFeatureValues( + Feature* feature); + +template <> +const protobuf::RepeatedField& GetFeatureValues( + const Feature& feature); + +template <> +protobuf::RepeatedField* GetFeatureValues(Feature* feature); + +template <> +const protobuf::RepeatedPtrField& GetFeatureValues( + const Feature& feature); + +template <> +protobuf::RepeatedPtrField* GetFeatureValues(Feature* feature); } // namespace tensorflow diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h index 4004411cb17..a87c2c9a57c 100644 --- a/tensorflow/core/example/feature_util.h +++ b/tensorflow/core/example/feature_util.h @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// A set of lightweight wrappers which simplify access to Example features. +// A set of lightweight wrappers which simplify access to Feature protos. // // TensorFlow Example proto uses associative maps on top of oneof fields. +// SequenceExample proto uses associative map of FeatureList. // So accessing feature values is not very convenient. // // For example, to read a first value of integer feature "tag": @@ -42,9 +43,59 @@ limitations under the License. // (RepeatedPtrField for byte list). So refer to its documentation of // RepeatedField for full list of supported methods. // -// NOTE: It is also important to mention that due to the nature of oneof proto -// fields setting a feature of one type automatically clears all values stored -// as another type with the same feature name. +// NOTE: Due to the nature of oneof proto fields setting a feature of one type +// automatically clears all values stored as another type with the same feature +// key. +// +// This library also has tools to work with SequenceExample protos. 
+// +// To get a value from SequenceExample.context: +// int id = GetFeatureValues("tag", se.context()).Get(0); +// To add a value to the context: +// GetFeatureValues("tag", se.mutable_context())->Add(42); +// +// To add values to feature_lists: +// AppendFeatureValues({4.0}, +// GetFeatureList("movie_ratings", &se)->Add()); +// AppendFeatureValues({5.0, 3.0}, +// GetFeatureList("movie_ratings", &se)->Add()); +// This will create a feature list keyed as "images" with two features: +// feature_lists { +// feature_list { +// key: "images" +// value { +// feature { float_list { value: [4.0] } } +// feature { float_list { value: [5.0, 3.0] } } +// } +// } } +// +// Functions exposed by this library: +// HasFeature<[FeatureType]>(key, proto) -> bool +// Returns true if a feature with the specified key, and optionally +// FeatureType, belongs to the Features or Example proto. +// HasFeatureList(key, sequence_example) -> bool +// Returns true if SequenceExample has a feature_list with the key. +// GetFeatureValues(key, proto) -> RepeatedField +// Returns values for the specified key and the FeatureType. +// Supported types for the proto: Example, Features. +// GetFeatureList(key, sequence_example) -> RepeatedPtrField +// Returns Feature protos associated with a key. +// AppendFeatureValues(begin, end, feature) +// AppendFeatureValues(container or initializer_list, feature) +// Copies values into a Feature. +// AppendFeatureValues(begin, end, key, proto) +// AppendFeatureValues(container or initializer_list, key, proto) +// Copies values into Features and Example protos with the specified key. +// +// Auxiliary functions, it is unlikely you'll need to use them directly: +// GetFeatures(proto) -> Features +// A convenience function to get Features proto. +// Supported types for the proto: Example, Features. +// GetFeature(key, proto) -> Feature* +// Returns a Feature proto for the specified key, creates a new if +// necessary. Supported types for the proto: Example, Features. +// GetFeatureValues(feature) -> RepeatedField +// Returns values of the feature for the FeatureType. #ifndef TENSORFLOW_EXAMPLE_FEATURE_H_ #define TENSORFLOW_EXAMPLE_FEATURE_H_ @@ -62,10 +113,11 @@ namespace tensorflow { namespace internal { +// DEPRECATED: Use GetFeature instead. +// TODO(gorban): Update all clients in a followup CL. // Returns a reference to a feature corresponding to the name. // Note: it will create a new Feature if it is missing in the example. -::tensorflow::Feature& ExampleFeature(const string& name, - ::tensorflow::Example* example); +Feature& ExampleFeature(const string& name, Example* example); // Specializations of RepeatedFieldTrait define a type of RepeatedField // corresponding to a selected feature type. @@ -127,89 +179,135 @@ struct FeatureTrait< } // namespace internal -// Returns true if feature with the specified name belongs to the example proto. -// Doesn't check feature type. Note that specialized versions return false if -// the feature has a wrong type. -template -bool ExampleHasFeature(const string& name, const Example& example) { - return example.features().feature().find(name) != - example.features().feature().end(); -} +// Returns true if sequence_example has a feature_list with the specified key. +bool HasFeatureList(const string& key, const SequenceExample& sequence_example); + +// A family of template functions to return mutable Features proto from a +// container proto. Supported ProtoTypes: Example, Features. 
+template +Features* GetFeatures(ProtoType* proto); + +template +const Features& GetFeatures(const ProtoType& proto); // Base declaration of a family of template functions to return a read only -// repeated field corresponding to a feature with the specified name. +// repeated field of feature values. template const typename internal::RepeatedFieldTrait::Type& -GetFeatureValues(const string& name, const Example& example); +GetFeatureValues(const Feature& feature); -// Base declaration of a family of template functions to return a mutable -// repeated field corresponding to a feature with the specified name. +// Returns a read only repeated field corresponding to a feature with the +// specified name and FeatureType. Supported ProtoTypes: Example, Features. +template +const typename internal::RepeatedFieldTrait::Type& +GetFeatureValues(const string& key, const ProtoType& proto) { + return GetFeatureValues(GetFeatures(proto).feature().at(key)); +} + +// Returns a mutable repeated field of a feature values. template typename internal::RepeatedFieldTrait::Type* GetFeatureValues( - const string& name, Example* example); + Feature* feature); + +// Returns a mutable repeated field corresponding to a feature with the +// specified name and FeatureType. Supported ProtoTypes: Example, Features. +template +typename internal::RepeatedFieldTrait::Type* GetFeatureValues( + const string& key, ProtoType* proto) { + ::tensorflow::Feature& feature = + (*GetFeatures(proto)->mutable_feature())[key]; + return GetFeatureValues(&feature); +} + +// Returns a Feature proto for the specified key, creates a new if necessary. +// Supported types for the proto: Example, Features. +template +Feature* GetFeature(const string& key, ProtoType* proto) { + return &(*GetFeatures(proto)->mutable_feature())[key]; +} + +// Returns a repeated field with features corresponding to a feature_list key. +const protobuf::RepeatedPtrField& GetFeatureList( + const string& key, const SequenceExample& sequence_example); + +// Returns a mutable repeated field with features corresponding to a +// feature_list key. It will create a new FeatureList if necessary. +protobuf::RepeatedPtrField* GetFeatureList( + const string& feature_list_key, SequenceExample* sequence_example); -// Copies elements from the range, defined by [first, last) into a feature. template void AppendFeatureValues(IteratorType first, IteratorType last, - const string& name, Example* example) { + Feature* feature) { using FeatureType = typename internal::FeatureTrait< typename std::iterator_traits::value_type>::Type; - std::copy(first, last, protobuf::RepeatedFieldBackInserter( - GetFeatureValues(name, example))); + std::copy(first, last, + protobuf::RepeatedFieldBackInserter( + GetFeatureValues(feature))); +} + +template +void AppendFeatureValues(std::initializer_list container, + Feature* feature) { + AppendFeatureValues(container.begin(), container.end(), feature); +} + +template +void AppendFeatureValues(const ContainerType& container, Feature* feature) { + using IteratorType = typename ContainerType::const_iterator; + AppendFeatureValues(container.begin(), container.end(), + feature); +} + +// Copies elements from the range, defined by [first, last) into the feature +// obtainable from the (proto, key) combination. +template +void AppendFeatureValues(IteratorType first, IteratorType last, + const string& key, ProtoType* proto) { + AppendFeatureValues(first, last, GetFeature(key, GetFeatures(proto))); } // Copies all elements from the container into a feature. 
-template -void AppendFeatureValues(const ContainerType& container, const string& name, - Example* example) { +template +void AppendFeatureValues(const ContainerType& container, const string& key, + ProtoType* proto) { using IteratorType = typename ContainerType::const_iterator; - AppendFeatureValues(container.begin(), container.end(), name, - example); + AppendFeatureValues(container.begin(), container.end(), key, + proto); } -// Copies all elements from the initializer list into a feature. -template +// Copies all elements from the initializer list into a Feature contained by +// Features or Example proto. +template void AppendFeatureValues(std::initializer_list container, - const string& name, Example* example) { + const string& key, ProtoType* proto) { using IteratorType = typename std::initializer_list::const_iterator; - AppendFeatureValues(container.begin(), container.end(), name, - example); + AppendFeatureValues(container.begin(), container.end(), key, + proto); } -template <> -bool ExampleHasFeature(const string& name, - const Example& example); +// Returns true if a feature with the specified key belongs to the Features. +// The template parameter pack accepts zero or one template argument - which +// is FeatureType. If the FeatureType not specified (zero template arguments) +// the function will not check the feature type. Otherwise it will return false +// if the feature has a wrong type. +template +bool HasFeature(const string& key, const Features& features); -template <> -bool ExampleHasFeature(const string& name, const Example& example); +// Returns true if a feature with the specified key belongs to the Example. +// Doesn't check feature type if used without FeatureType, otherwise the +// specialized versions return false if the feature has a wrong type. +template +bool HasFeature(const string& key, const Example& example) { + return HasFeature(key, GetFeatures(example)); +}; -template <> -bool ExampleHasFeature(const string& name, const Example& example); - -template <> -const protobuf::RepeatedField& GetFeatureValues( - const string& name, const Example& example); - -template <> -protobuf::RepeatedField* GetFeatureValues( - const string& name, Example* example); - -template <> -const protobuf::RepeatedField& GetFeatureValues( - const string& name, const Example& example); - -template <> -protobuf::RepeatedField* GetFeatureValues(const string& name, - Example* example); - -template <> -const protobuf::RepeatedPtrField& GetFeatureValues( - const string& name, const Example& example); - -template <> -protobuf::RepeatedPtrField* GetFeatureValues(const string& name, - Example* example); +// DEPRECATED: use HasFeature instead. +// TODO(gorban): update all clients in a followup CL. +template +bool ExampleHasFeature(const string& key, const Example& example) { + return HasFeature(key, example); +} } // namespace tensorflow #endif // TENSORFLOW_EXAMPLE_FEATURE_H_ diff --git a/tensorflow/core/example/feature_util_test.cc b/tensorflow/core/example/feature_util_test.cc index eb7b90af1b2..cd32dee306d 100644 --- a/tensorflow/core/example/feature_util_test.cc +++ b/tensorflow/core/example/feature_util_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ - #include "tensorflow/core/example/feature_util.h" #include @@ -38,6 +37,16 @@ TEST(GetFeatureValuesInt64Test, ReadsASingleValue) { EXPECT_EQ(42, tag.Get(0)); } +TEST(GetFeatureValuesInt64Test, ReadsASingleValueFromFeature) { + Feature feature; + feature.mutable_int64_list()->add_value(42); + + auto values = GetFeatureValues(feature); + + ASSERT_EQ(1, values.size()); + EXPECT_EQ(42, values.Get(0)); +} + TEST(GetFeatureValuesInt64Test, WritesASingleValue) { Example example; @@ -48,25 +57,33 @@ TEST(GetFeatureValuesInt64Test, WritesASingleValue) { EXPECT_EQ(42, example.features().feature().at("tag").int64_list().value(0)); } +TEST(GetFeatureValuesInt64Test, WritesASingleValueToFeature) { + Feature feature; + + GetFeatureValues(&feature)->Add(42); + + ASSERT_EQ(1, feature.int64_list().value_size()); + EXPECT_EQ(42, feature.int64_list().value(0)); +} + TEST(GetFeatureValuesInt64Test, CheckUntypedFieldExistence) { Example example; - - EXPECT_FALSE(ExampleHasFeature("tag", example)); + ASSERT_FALSE(HasFeature("tag", example)); GetFeatureValues("tag", &example)->Add(0); - EXPECT_TRUE(ExampleHasFeature("tag", example)); + EXPECT_TRUE(HasFeature("tag", example)); } TEST(GetFeatureValuesInt64Test, CheckTypedFieldExistence) { Example example; GetFeatureValues("tag", &example)->Add(3.14); - ASSERT_FALSE(ExampleHasFeature("tag", example)); + ASSERT_FALSE(HasFeature("tag", example)); GetFeatureValues("tag", &example)->Add(42); - EXPECT_TRUE(ExampleHasFeature("tag", example)); + EXPECT_TRUE(HasFeature("tag", example)); auto tag_ro = GetFeatureValues("tag", example); ASSERT_EQ(1, tag_ro.size()); EXPECT_EQ(42, tag_ro.Get(0)); @@ -87,6 +104,16 @@ TEST(GetFeatureValuesInt64Test, CopyIterableToAField) { EXPECT_EQ(3, tag_ro.Get(2)); } +TEST(GetFeatureValuesFloatTest, ReadsASingleValueFromFeature) { + Feature feature; + feature.mutable_float_list()->add_value(3.14); + + auto values = GetFeatureValues(feature); + + ASSERT_EQ(1, values.size()); + EXPECT_NEAR(3.14, values.Get(0), kTolerance); +} + TEST(GetFeatureValuesFloatTest, ReadsASingleValue) { Example example; (*example.mutable_features()->mutable_feature())["tag"] @@ -99,6 +126,15 @@ TEST(GetFeatureValuesFloatTest, ReadsASingleValue) { EXPECT_NEAR(3.14, tag.Get(0), kTolerance); } +TEST(GetFeatureValuesFloatTest, WritesASingleValueToFeature) { + Feature feature; + + GetFeatureValues(&feature)->Add(3.14); + + ASSERT_EQ(1, feature.float_list().value_size()); + EXPECT_NEAR(3.14, feature.float_list().value(0), kTolerance); +} + TEST(GetFeatureValuesFloatTest, WritesASingleValue) { Example example; @@ -114,6 +150,20 @@ TEST(GetFeatureValuesFloatTest, WritesASingleValue) { TEST(GetFeatureValuesFloatTest, CheckTypedFieldExistence) { Example example; + GetFeatureValues("tag", &example)->Add(42); + ASSERT_FALSE(HasFeature("tag", example)); + + GetFeatureValues("tag", &example)->Add(3.14); + + EXPECT_TRUE(HasFeature("tag", example)); + auto tag_ro = GetFeatureValues("tag", example); + ASSERT_EQ(1, tag_ro.size()); + EXPECT_NEAR(3.14, tag_ro.Get(0), kTolerance); +} + +TEST(GetFeatureValuesFloatTest, CheckTypedFieldExistenceForDeprecatedMethod) { + Example example; + GetFeatureValues("tag", &example)->Add(42); ASSERT_FALSE(ExampleHasFeature("tag", example)); @@ -125,6 +175,16 @@ TEST(GetFeatureValuesFloatTest, CheckTypedFieldExistence) { EXPECT_NEAR(3.14, tag_ro.Get(0), kTolerance); } +TEST(GetFeatureValuesStringTest, ReadsASingleValueFromFeature) { + Feature feature; + 
feature.mutable_bytes_list()->add_value("FOO"); + + auto values = GetFeatureValues(feature); + + ASSERT_EQ(1, values.size()); + EXPECT_EQ("FOO", values.Get(0)); +} + TEST(GetFeatureValuesStringTest, ReadsASingleValue) { Example example; (*example.mutable_features()->mutable_feature())["tag"] @@ -137,6 +197,15 @@ TEST(GetFeatureValuesStringTest, ReadsASingleValue) { EXPECT_EQ("FOO", tag.Get(0)); } +TEST(GetFeatureValuesStringTest, WritesASingleValueToFeature) { + Feature feature; + + *GetFeatureValues(&feature)->Add() = "FOO"; + + ASSERT_EQ(1, feature.bytes_list().value_size()); + EXPECT_EQ("FOO", feature.bytes_list().value(0)); +} + TEST(GetFeatureValuesStringTest, WritesASingleValue) { Example example; @@ -148,15 +217,15 @@ TEST(GetFeatureValuesStringTest, WritesASingleValue) { example.features().feature().at("tag").bytes_list().value(0)); } -TEST(GetFeatureValuesBytesTest, CheckTypedFieldExistence) { +TEST(GetFeatureValuesStringTest, CheckTypedFieldExistence) { Example example; GetFeatureValues("tag", &example)->Add(42); - ASSERT_FALSE(ExampleHasFeature("tag", example)); + ASSERT_FALSE(HasFeature("tag", example)); *GetFeatureValues("tag", &example)->Add() = "FOO"; - EXPECT_TRUE(ExampleHasFeature("tag", example)); + EXPECT_TRUE(HasFeature("tag", example)); auto tag_ro = GetFeatureValues("tag", example); ASSERT_EQ(1, tag_ro.size()); EXPECT_EQ("FOO", tag_ro.Get(0)); @@ -228,5 +297,146 @@ TEST(AppendFeatureValuesTest, StringVariablesUsingInitializerList) { EXPECT_EQ("BAZ", tag_ro.Get(2)); } +TEST(SequenceExampleTest, ReadsASingleValueFromContext) { + SequenceExample se; + (*se.mutable_context()->mutable_feature())["tag"] + .mutable_int64_list() + ->add_value(42); + + auto values = GetFeatureValues("tag", se.context()); + + ASSERT_EQ(1, values.size()); + EXPECT_EQ(42, values.Get(0)); +} + +TEST(SequenceExampleTest, WritesASingleValueToContext) { + SequenceExample se; + + GetFeatureValues("tag", se.mutable_context())->Add(42); + + ASSERT_EQ(1, se.context().feature().at("tag").int64_list().value_size()); + EXPECT_EQ(42, se.context().feature().at("tag").int64_list().value(0)); +} + +TEST(SequenceExampleTest, AppendFeatureValuesToContextSingleArg) { + SequenceExample se; + + AppendFeatureValues({1.1, 2.2, 3.3}, "tag", se.mutable_context()); + + auto tag_ro = GetFeatureValues("tag", se.context()); + ASSERT_EQ(3, tag_ro.size()); + EXPECT_NEAR(1.1, tag_ro.Get(0), kTolerance); + EXPECT_NEAR(2.2, tag_ro.Get(1), kTolerance); + EXPECT_NEAR(3.3, tag_ro.Get(2), kTolerance); +} + +TEST(SequenceExampleTest, CheckTypedFieldExistence) { + SequenceExample se; + + GetFeatureValues("tag", se.mutable_context())->Add(3.14); + ASSERT_FALSE(HasFeature("tag", se.context())); + + GetFeatureValues("tag", se.mutable_context())->Add(42); + + EXPECT_TRUE(HasFeature("tag", se.context())); + auto tag_ro = GetFeatureValues("tag", se.context()); + ASSERT_EQ(1, tag_ro.size()); + EXPECT_EQ(42, tag_ro.Get(0)); +} + +TEST(SequenceExampleTest, ReturnsExistingFeatureLists) { + SequenceExample se; + (*se.mutable_feature_lists()->mutable_feature_list())["tag"] + .mutable_feature() + ->Add(); + + auto feature = GetFeatureList("tag", se); + + ASSERT_EQ(1, feature.size()); +} + +TEST(SequenceExampleTest, CreatesNewFeatureLists) { + SequenceExample se; + + GetFeatureList("tag", &se)->Add(); + + EXPECT_EQ(1, se.feature_lists().feature_list().at("tag").feature_size()); +} + +TEST(SequenceExampleTest, CheckFeatureListExistence) { + SequenceExample se; + ASSERT_FALSE(HasFeatureList("tag", se)); + + GetFeatureList("tag", &se)->Add(); + + 
ASSERT_TRUE(HasFeatureList("tag", se)); +} + +TEST(SequenceExampleTest, AppendFeatureValuesWithInitializerList) { + SequenceExample se; + + AppendFeatureValues({1, 2, 3}, "ids", se.mutable_context()); + AppendFeatureValues({"cam1-0", "cam2-0"}, + GetFeatureList("images", &se)->Add()); + AppendFeatureValues({"cam1-1", "cam2-2"}, + GetFeatureList("images", &se)->Add()); + + EXPECT_EQ(se.DebugString(), + "context {\n" + " feature {\n" + " key: \"ids\"\n" + " value {\n" + " int64_list {\n" + " value: 1\n" + " value: 2\n" + " value: 3\n" + " }\n" + " }\n" + " }\n" + "}\n" + "feature_lists {\n" + " feature_list {\n" + " key: \"images\"\n" + " value {\n" + " feature {\n" + " bytes_list {\n" + " value: \"cam1-0\"\n" + " value: \"cam2-0\"\n" + " }\n" + " }\n" + " feature {\n" + " bytes_list {\n" + " value: \"cam1-1\"\n" + " value: \"cam2-2\"\n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n"); +} + +TEST(SequenceExampleTest, AppendFeatureValuesWithVectors) { + SequenceExample se; + + std::vector readings{1.0, 2.5, 5.0}; + AppendFeatureValues(readings, GetFeatureList("movie_ratings", &se)->Add()); + + EXPECT_EQ(se.DebugString(), + "feature_lists {\n" + " feature_list {\n" + " key: \"movie_ratings\"\n" + " value {\n" + " feature {\n" + " float_list {\n" + " value: 1\n" + " value: 2.5\n" + " value: 5\n" + " }\n" + " }\n" + " }\n" + " }\n" + "}\n"); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc index e7092f549b2..f5dadf76daf 100644 --- a/tensorflow/core/framework/allocator.cc +++ b/tensorflow/core/framework/allocator.cc @@ -117,16 +117,6 @@ class CPUAllocator : public Allocator { TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator); }; -namespace { -Allocator* MakeCpuAllocator() { - Allocator* allocator = new CPUAllocator; - if (cpu_allocator_collect_full_stats || LogMemory::IsEnabled()) { - allocator = new TrackingAllocator(allocator, true); - } - return allocator; -} -} // namespace - Allocator* cpu_allocator() { static Allocator* cpu_alloc = AllocatorRegistry::Global()->GetAllocator(); if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) { diff --git a/tensorflow/core/framework/cancellation.cc b/tensorflow/core/framework/cancellation.cc index 1cbed62939f..9da4828bbad 100644 --- a/tensorflow/core/framework/cancellation.cc +++ b/tensorflow/core/framework/cancellation.cc @@ -23,7 +23,9 @@ namespace tensorflow { const CancellationToken CancellationManager::kInvalidToken = -1; CancellationManager::CancellationManager() - : is_cancelling_(false), is_cancelled_(0), next_cancellation_token_(0) {} + : is_cancelling_(false), + is_cancelled_(false), + next_cancellation_token_(0) {} void CancellationManager::StartCancel() { gtl::FlatMap callbacks_to_run; diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index c2d3f37ab30..b788d6b7778 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -15,6 +15,7 @@ limitations under the License. 
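The framework/function.cc hunk that follows replaces the linear scan over index_ with a lookup that relies on std::map ordering: every key equal to node_name or starting with node_name + ":" lies in the contiguous range [node_name, node_name + ";"), because ';' is the character immediately after ':'. A standalone sketch of that range scan, with hypothetical names (FindNodeId, index), not taken from the patch:

#include <iostream>
#include <map>
#include <string>

// Returns the id mapped to `name` or to any key of the form "name:...",
// scanning only the contiguous range [name, name + ";") of the sorted map.
int FindNodeId(const std::map<std::string, int>& index,
               const std::string& name) {
  const std::string colon = name + ":";
  const std::string bound = name + ";";
  for (auto it = index.lower_bound(name);
       it != index.end() && it->first < bound; ++it) {
    if (it->first == name ||
        it->first.compare(0, colon.size(), colon) == 0) {
      return it->second;
    }
  }
  return -1;  // Not found.
}

int main() {
  const std::map<std::string, int> index = {
      {"add", 1}, {"add:z:0", 2}, {"mul", 3}};
  std::cout << FindNodeId(index, "add") << "\n";  // 1
  std::cout << FindNodeId(index, "sub") << "\n";  // -1
  return 0;
}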
#include "tensorflow/core/framework/function.h" +#include #include #include #include @@ -271,12 +272,17 @@ class FunctionInstantiationHelper { int nid = -1; const string node_name = input.substr(1); const string node_colon = node_name + ":"; - for (const auto& p : index_) { - if (p.first == node_name || - tensorflow::StringPiece(p.first).starts_with(node_colon)) { - nid = p.second.nid; + const string node_colon_bound = node_name + ";"; + // index_ is a map sorted lexicographically, so the key we are looking for + // must lie in the range [node_name, node_colon_bound). + auto it = index_.lower_bound(node_name); + while (it != index_.end() && it->first <= node_colon_bound) { + if (it->first == node_name || + tensorflow::StringPiece(it->first).starts_with(node_colon)) { + nid = it->second.nid; break; } + ++it; } if (nid == -1) { return errors::InvalidArgument("input[", i, "] == '", input, @@ -421,7 +427,7 @@ class FunctionInstantiationHelper { GetFunctionSignature get_function_; InstantiationResult& result_; // A small index for all names that can be used as a node's input arguments. - std::unordered_map index_; + std::map index_; // This contains information about a node in the new graph including the node // names and input nodes' indexes. struct NodeInfo { diff --git a/tensorflow/core/framework/tensor_testutil.h b/tensorflow/core/framework/tensor_testutil.h index ab224aa7188..4c216a84f04 100644 --- a/tensorflow/core/framework/tensor_testutil.h +++ b/tensorflow/core/framework/tensor_testutil.h @@ -166,10 +166,11 @@ struct Expector { static void Equal(const Tensor& x, const Tensor& y) { ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); AssertSameTypeDims(x, y); - auto a = x.flat(); - auto b = y.flat(); - for (int i = 0; i < a.size(); ++i) { - ExpectEqual(a(i), b(i)); + const auto size = x.NumElements(); + const T* a = x.flat().data(); + const T* b = y.flat().data(); + for (int i = 0; i < size; ++i) { + ExpectEqual(a[i], b[i]); } } }; @@ -182,10 +183,11 @@ struct Expector { static void Equal(const Tensor& x, const Tensor& y) { ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); AssertSameTypeDims(x, y); - auto a = x.flat(); - auto b = y.flat(); - for (int i = 0; i < a.size(); ++i) { - ExpectEqual(a(i), b(i)); + const auto size = x.NumElements(); + const T* a = x.flat().data(); + const T* b = y.flat().data(); + for (int i = 0; i < size; ++i) { + ExpectEqual(a[i], b[i]); } } @@ -199,10 +201,11 @@ struct Expector { static void Near(const Tensor& x, const Tensor& y, const double abs_err) { ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); AssertSameTypeDims(x, y); - auto a = x.flat(); - auto b = y.flat(); - for (int i = 0; i < a.size(); ++i) { - Near(a(i), b(i), abs_err, i); + const auto size = x.NumElements(); + const T* a = x.flat().data(); + const T* b = y.flat().data(); + for (int i = 0; i < size; ++i) { + Near(a[i], b[i], abs_err, i); } } }; diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc index 11756c356aa..9cc7530459e 100644 --- a/tensorflow/core/framework/variant_op_registry.cc +++ b/tensorflow/core/framework/variant_op_registry.cc @@ -88,7 +88,17 @@ bool DecodeUnaryVariant(Variant* variant) { if (decode_fn == nullptr) { return false; } - return (*decode_fn)(variant); + const string type_name = variant->TypeName(); + bool decoded = (*decode_fn)(variant); + if (!decoded) return false; + if (variant->TypeName() != type_name) { + LOG(ERROR) << "DecodeUnaryVariant: Variant type_name before decoding was: " + << type_name + << " but after decoding was: " 
<< variant->TypeName() + << ". Treating this as a failure."; + return false; + } + return true; } // Add some basic registrations for use by others, e.g., for testing. @@ -101,15 +111,59 @@ string MaybeRemoveTFPrefix(const StringPiece& str) { } // namespace #define REGISTER_VARIANT_DECODE_TYPE(T) \ - REGISTER_UNARY_VARIANT_DECODE_FUNCTION(T, MaybeRemoveTFPrefix(TF_STR(T))); + REGISTER_UNARY_VARIANT_DECODE_FUNCTION(T, TF_STR(T)); // No encode/decode registered for std::complex<> and Eigen::half // objects yet. -TF_CALL_INTEGRAL_TYPES(REGISTER_VARIANT_DECODE_TYPE); -TF_CALL_float(REGISTER_VARIANT_DECODE_TYPE); -TF_CALL_double(REGISTER_VARIANT_DECODE_TYPE); -TF_CALL_bool(REGISTER_VARIANT_DECODE_TYPE); +REGISTER_VARIANT_DECODE_TYPE(int); +REGISTER_VARIANT_DECODE_TYPE(float); +REGISTER_VARIANT_DECODE_TYPE(bool); +REGISTER_VARIANT_DECODE_TYPE(double); #undef REGISTER_VARIANT_DECODE_TYPE +// Special casing ZerosLikeFn per device. +UnaryVariantOpRegistry::VariantZerosLikeFn* +UnaryVariantOpRegistry::GetZerosLikeFn(const string& device, + const string& type_name) { + auto found = zeros_like_fns.find(std::make_pair(device, type_name)); + if (found == zeros_like_fns.end()) return nullptr; + return &found->second; +} + +void UnaryVariantOpRegistry::RegisterZerosLikeFn( + const string& device, const string& type_name, + const VariantZerosLikeFn& zeros_like_fn) { + CHECK(!type_name.empty()) << "Need a valid name for UnaryVariantZerosLike"; + VariantZerosLikeFn* existing = GetZerosLikeFn(device, type_name); + CHECK_EQ(existing, nullptr) + << "Unary VariantZerosLikeFn for type_name: " << type_name + << " already registered for device type: " << device; + zeros_like_fns.insert( + std::pair, VariantZerosLikeFn>( + std::make_pair(device, type_name), zeros_like_fn)); +} + +namespace { + +template +Status ZerosLikeVariantPrimitiveType(OpKernelContext* ctx, const T& t, + T* t_out) { + *t_out = T(0); + return Status::OK(); +} +} // namespace + +#define REGISTER_VARIANT_ZEROS_LIKE_TYPE(T) \ + REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION( \ + DEVICE_CPU, T, TF_STR(T), ZerosLikeVariantPrimitiveType); + +// No zeros_like registered for std::complex<> or Eigen::half objects yet. +REGISTER_VARIANT_ZEROS_LIKE_TYPE(int); +REGISTER_VARIANT_ZEROS_LIKE_TYPE(float); +REGISTER_VARIANT_ZEROS_LIKE_TYPE(double); +REGISTER_VARIANT_ZEROS_LIKE_TYPE(bool); + +#undef REGISTER_VARIANT_ZEROS_LIKE_TYPE + } // namespace tensorflow diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h index 389b049fa01..37e54f82c0f 100644 --- a/tensorflow/core/framework/variant_op_registry.h +++ b/tensorflow/core/framework/variant_op_registry.h @@ -19,11 +19,13 @@ limitations under the License. #include #include +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/variant.h" #include "tensorflow/core/framework/variant_encode_decode.h" namespace tensorflow { +class OpKernelContext; // A global UnaryVariantOpRegistry is used to hold callback functions // for different variant types. To be used by ShapeOp, RankOp, and // SizeOp, decoding, etc. @@ -32,6 +34,8 @@ class UnaryVariantOpRegistry { public: typedef std::function VariantShapeFn; typedef std::function VariantDecodeFn; + typedef std::function + VariantZerosLikeFn; // Add a shape lookup function to the registry. void RegisterShapeFn(const string& type_name, const VariantShapeFn& shape_fn); @@ -46,11 +50,29 @@ class UnaryVariantOpRegistry { // Returns nullptr if no decode function was found for the given TypeName. 
VariantDecodeFn* GetDecodeFn(const string& type_name); + // Add a zeros-like function to the registry. + void RegisterZerosLikeFn(const string& device, const string& type_name, + const VariantZerosLikeFn& zeros_like_fn); + + // Returns nullptr if no zeros-like function was found for the given + // device and TypeName. + VariantZerosLikeFn* GetZerosLikeFn(const string& device, + const string& type_name); + static UnaryVariantOpRegistry* Global(); private: std::unordered_map shape_fns; std::unordered_map decode_fns; + // Map std::pair to function. + struct PairHash { + template + std::size_t operator()(const std::pair& x) const { + return std::hash()(x.first) ^ std::hash()(x.second); + } + }; + std::unordered_map, VariantZerosLikeFn, PairHash> + zeros_like_fns; }; // Gets a TensorShape from a Tensor containing a scalar Variant. @@ -72,6 +94,28 @@ Status GetUnaryVariantShape(const Tensor& variant_tensor, TensorShape* shape); // bool DecodeUnaryVariant(Variant* variant); +// Sets *z_out = zeros_like(v). The variant v must have a registered +// ZerosLike function for the given Device. Returns an Internal error +// if v does not have a registered zeros_like function for this device, or if +// ZerosLike fails. +// +// REQUIRES: +// v_out is not null. +// +template +Status CreateZerosLikeVariant(OpKernelContext* ctx, const Variant& v, + Variant* v_out) { + const string& device = DeviceName::value; + UnaryVariantOpRegistry::VariantZerosLikeFn* zeros_like_fn = + UnaryVariantOpRegistry::Global()->GetZerosLikeFn(device, v.TypeName()); + if (zeros_like_fn == nullptr) { + return errors::Internal( + "No unary variant zeros_like function found for Variant type_name: ", + v.TypeName(), " for device type: ", device); + } + return (*zeros_like_fn)(ctx, v, v_out); +} + namespace variant_op_registry_fn_registration { template @@ -120,6 +164,34 @@ class UnaryVariantDecodeRegistration { } }; +template +class UnaryVariantZerosLikeRegistration { + typedef std::function + LocalVariantZerosLikeFn; + + public: + UnaryVariantZerosLikeRegistration( + const string& device, const string& type_name, + const LocalVariantZerosLikeFn& zeros_like_fn) { + auto wrapped_fn = [type_name, zeros_like_fn](OpKernelContext* ctx, + const Variant& v, + Variant* v_out) -> Status { + CHECK_NOTNULL(v_out); + *v_out = T(); + if (v.get() == nullptr) { + return errors::Internal( + "VariantZerosLikeFn: Could not access object, type_name: ", + type_name); + } + const T& t = *v.get(); + T* t_out = v_out->get(); + return zeros_like_fn(ctx, t, t_out); + }; + UnaryVariantOpRegistry::Global()->RegisterZerosLikeFn(device, type_name, + wrapped_fn); + } +}; + }; // namespace variant_op_registry_fn_registration // Register a unary shape variant function with the signature: @@ -151,6 +223,26 @@ class UnaryVariantDecodeRegistration { T> \ register_unary_variant_op_decoder_fn_##ctr(type_name) +// Register a unary zeros_like variant function with the signature: +// Status ZerosLikeFn(OpKernelContext* ctx, const T& t, T* t_out); +// to Variants having TypeName type_name, for device string device. 
+#define REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION(device, T, type_name, \ + zeros_like_function) \ + REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION_UNIQ_HELPER( \ + __COUNTER__, device, T, type_name, zeros_like_function) + +#define REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION_UNIQ_HELPER( \ + ctr, device, T, type_name, zeros_like_function) \ + REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION_UNIQ(ctr, device, T, type_name, \ + zeros_like_function) + +#define REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION_UNIQ( \ + ctr, device, T, type_name, zeros_like_function) \ + static variant_op_registry_fn_registration:: \ + UnaryVariantZerosLikeRegistration \ + register_unary_variant_op_decoder_fn_##ctr(device, type_name, \ + zeros_like_function) + } // end namespace tensorflow #endif // TENSORFLOW_FRAMEWORK_VARIANT_OP_REGISTRY_H_ diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc index 86fef53dbe6..4e79180217a 100644 --- a/tensorflow/core/framework/variant_op_registry_test.cc +++ b/tensorflow/core/framework/variant_op_registry_test.cc @@ -15,13 +15,25 @@ limitations under the License. #include +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + #include "tensorflow/core/framework/variant_op_registry.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + namespace { struct VariantValue { @@ -33,7 +45,24 @@ struct VariantValue { *s = TensorShape({-0xdeadbeef}); return Status::OK(); } + static Status CPUZerosLikeFn(OpKernelContext* ctx, const VariantValue& v, + VariantValue* v_out) { + if (v.early_exit) { + return errors::InvalidArgument("early exit zeros_like!"); + } + v_out->zeros_like_set = 1; // CPU + return Status::OK(); + } + static Status GPUZerosLikeFn(OpKernelContext* ctx, const VariantValue& v, + VariantValue* v_out) { + if (v.early_exit) { + return errors::InvalidArgument("early exit zeros_like!"); + } + v_out->zeros_like_set = 2; // GPU + return Status::OK(); + } bool early_exit; + int zeros_like_set; }; REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(VariantValue, "TEST VariantValue", @@ -41,6 +70,14 @@ REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(VariantValue, "TEST VariantValue", REGISTER_UNARY_VARIANT_DECODE_FUNCTION(VariantValue, "TEST VariantValue"); +REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION(DEVICE_CPU, VariantValue, + "TEST VariantValue", + VariantValue::CPUZerosLikeFn); + +REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION(DEVICE_GPU, VariantValue, + "TEST VariantValue", + VariantValue::GPUZerosLikeFn); + } // namespace TEST(VariantOpShapeRegistryTest, TestBasic) { @@ -101,4 +138,67 @@ TEST(VariantOpDecodeRegistryTest, TestDuplicate) { "fjfjfj already registered"); } +TEST(VariantOpZerosLikeRegistryTest, TestBasicCPU) { + EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetZerosLikeFn( + DEVICE_CPU, "YOU SHALL NOT PASS"), + nullptr); + + VariantValue vv_early_exit{true /* early_exit */, 0 /* zeros_like_set */}; + Variant v = vv_early_exit; + Variant v_out = VariantValue(); + + OpKernelContext* null_context_pointer = nullptr; + Status s0 = + CreateZerosLikeVariant(null_context_pointer, v, &v_out); + EXPECT_FALSE(s0.ok()); + EXPECT_TRUE( + StringPiece(s0.error_message()).contains("early exit zeros_like")); + + VariantValue 
vv_ok{false /* early_exit */, 0 /* zeros_like_set */}; + v = vv_ok; + TF_EXPECT_OK( + CreateZerosLikeVariant(null_context_pointer, v, &v_out)); + VariantValue* vv_out = CHECK_NOTNULL(v_out.get()); + EXPECT_EQ(vv_out->zeros_like_set, 1); // CPU +} + +#if GOOGLE_CUDA +TEST(VariantOpZerosLikeRegistryTest, TestBasicGPU) { + EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetZerosLikeFn( + DEVICE_GPU, "YOU SHALL NOT PASS"), + nullptr); + + VariantValue vv_early_exit{true /* early_exit */, 0 /* zeros_like_set */}; + Variant v = vv_early_exit; + Variant v_out = VariantValue(); + + OpKernelContext* null_context_pointer = nullptr; + Status s0 = + CreateZerosLikeVariant(null_context_pointer, v, &v_out); + EXPECT_FALSE(s0.ok()); + EXPECT_TRUE( + StringPiece(s0.error_message()).contains("early exit zeros_like")); + + VariantValue vv_ok{false /* early_exit */, 0 /* zeros_like_set */}; + v = vv_ok; + TF_EXPECT_OK( + CreateZerosLikeVariant(null_context_pointer, v, &v_out)); + VariantValue* vv_out = CHECK_NOTNULL(v_out.get()); + EXPECT_EQ(vv_out->zeros_like_set, 2); // GPU +} +#endif // GOOGLE_CUDA + +TEST(VariantOpZerosLikeRegistryTest, TestDuplicate) { + UnaryVariantOpRegistry registry; + UnaryVariantOpRegistry::VariantZerosLikeFn f; + + registry.RegisterZerosLikeFn(DEVICE_CPU, "fjfjfj", f); + EXPECT_DEATH(registry.RegisterZerosLikeFn(DEVICE_CPU, "fjfjfj", f), + "fjfjfj already registered"); + + registry.RegisterZerosLikeFn(DEVICE_GPU, "fjfjfj", f); + EXPECT_DEATH(registry.RegisterZerosLikeFn(DEVICE_GPU, "fjfjfj", f), + "fjfjfj already registered"); +} + } // namespace tensorflow diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 7d938365c5a..a274c799704 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -523,6 +523,17 @@ Status Graph::IsValidNode(const Node* node) const { return Status::OK(); } +Status Graph::IsValidOutputTensor(const Node* node, int idx) const { + TF_RETURN_IF_ERROR(IsValidNode(node)); + if (idx >= node->num_outputs()) { + return errors::InvalidArgument("Node '", node->name(), "' (type: '", + node->op_def().name(), + "', num of outputs: ", node->num_outputs(), + ") does not have ", "output ", idx); + } + return Status::OK(); +} + Node* Graph::AllocateNode(std::shared_ptr props, const Node* cost_node) { Node* node = nullptr; @@ -572,7 +583,7 @@ int Graph::InternDeviceName(const string& device_name) { } string Edge::DebugString() const { - return strings::Printf("Edge %d %s:%d -> %s:%d", id_, src_->name().c_str(), + return strings::Printf("[id=%d %s:%d -> %s:%d]", id_, src_->name().c_str(), src_output_, dst_->name().c_str(), dst_input_); } diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index 51ede642d27..25875185e47 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -519,6 +519,10 @@ class Graph { // Returns OK if `node` is non-null and belongs to this graph Status IsValidNode(const Node* node) const; + // Returns OK if IsValidNode(`node`) and `idx` is less than + // node->num_outputs() + Status IsValidOutputTensor(const Node* node, int idx) const; + // TODO(josh11b): uint64 hash() const; private: diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 2f9ceaa3bd0..4c793231974 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -1099,6 +1099,44 @@ int MklLayoutRewritePass::SetUpContiguousInputs( CHECK_NOTNULL(workspace_tensors); CHECK_EQ(kTensorOrdering, 
MklTfTensorOrdering::TENSORS_CONTIGUOUS); + // TODO(nhasabni): Temporary solution to connect filter input of + // BackpropInput with the converted filter from Conv2D. + bool do_connect_conv2d_backprop_input_filter = false; + Node* conv2d_node = nullptr; + // Filter node is 2nd input (slot index 1) of Conv2D. + int kConv2DFilterInputSlotIdx = 1; + int kConv2DBackpropInputFilterInputSlotIdx = 1; + int kConv2DFilterOutputSlotIdx = 1; + if (old_node->type_string() == csinfo_.conv2d_grad_input) { + // We need to find Conv2D node from Conv2DBackpropInput. + // For that let's first find filter node that is 2nd input (slot 1) + // of BackpropInput. + Node* filter_node = nullptr; + old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node); + CHECK_NOTNULL(filter_node); + + // Now check which nodes receive from filter_node. Filter feeds as + // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias. + for (const Edge* e : filter_node->out_edges()) { + if (e->dst()->type_string() == csinfo_.mkl_conv2d && + e->dst_input() == kConv2DFilterInputSlotIdx + /* filter is 2nd input of Conv2D and _MklConv2D. */) { + if (conv2d_node != nullptr) { + VLOG(1) << "MklLayoutRewritePass: unusual case of same filter" + << " feeding multiple Conv2D nodes: " + << filter_node->DebugString(); + // We will not connect filter input of Conv2DBackpropInput + // to be safe here. + do_connect_conv2d_backprop_input_filter = false; + break; + } else { + conv2d_node = e->dst(); + do_connect_conv2d_backprop_input_filter = true; + } + } + } + } + // Number of input slots to original op // Input slots are represented by .Input() calls in REGISTER_OP. int old_node_input_slots = old_node->op_def().input_arg_size(); @@ -1122,7 +1160,13 @@ int MklLayoutRewritePass::SetUpContiguousInputs( nb->Input(new_node_inputs); nn_slot_idx++; } else { - nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second); + // Special case for connecting filter input of Conv2DBackpropInput + if (do_connect_conv2d_backprop_input_filter && + iidx == kConv2DBackpropInputFilterInputSlotIdx) { + nb->Input(conv2d_node, kConv2DFilterOutputSlotIdx); + } else { + nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second); + } iidx++; nn_slot_idx++; } @@ -1157,9 +1201,17 @@ int MklLayoutRewritePass::SetUpContiguousInputs( } else { Node* mkl_node = nullptr; int mkl_node_output_slot = 0; - GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first, - old_node_inputs[iidx].second, - &mkl_node, &mkl_node_output_slot); + // Special case for connecting filter input of Conv2DBackpropInput + if (do_connect_conv2d_backprop_input_filter && + iidx == kConv2DBackpropInputFilterInputSlotIdx) { + GetNodeProducingMklTensor(g, old_node, conv2d_node, + kConv2DFilterOutputSlotIdx, + &mkl_node, &mkl_node_output_slot); + } else { + GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first, + old_node_inputs[iidx].second, + &mkl_node, &mkl_node_output_slot); + } nb->Input(mkl_node, mkl_node_output_slot); iidx++; nn_slot_idx++; diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 482e339802f..bd1d74368e5 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -788,7 +788,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) { "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;" "A:control->DMT/_0:control;A:control->DMT/_1:control;" "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;" - 
"C:1->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2"); + "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2"); } // Conv2D with INT32 which is not supported by Mkl @@ -917,7 +917,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) { "A:control->DMT/_2:control;A:control->DMT/_3:control;" "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;" "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;" - "DMT/_4->H:3;E->H:1;E:1->H:4;F->H:2;F:1->H:5;G->H;" + "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;" "G:control->DMT/_4:control;H->I:1"); } @@ -953,7 +953,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) { "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);" "H(_MklConcat);I(Mul)|A->E;A->I;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;" - "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:1->H:4;F->H:2;" + "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;" "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1"); } @@ -1023,8 +1023,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) { "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;" "C:control->DMT/_0:control;C:control->DMT/_1:control;" "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;" - "DMT/_4->H:5;E->H;E:1->H:3;E:control->DMT/_4:control;F->H:1;" - "F:1->H:4;G->H:2;H->I:1"); + "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;" + "F:2->H:4;G->H:2;H->I:1"); } // ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it @@ -1060,7 +1060,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) { "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);" "H(_MklConcatV2);I(Mul)|A->E;A->I;A:control->DMT/_0:control;" "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;" - "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:1->H:3;" + "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;" "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;" "G->H:2;H->I:1"); } diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc index 90bef111648..b01818f7461 100644 --- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc +++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc @@ -173,13 +173,13 @@ TEST_F(MklToTfConversionPass, Positive) { EXPECT_EQ(DoRunMklToTfConversionPass(), "A(Input);B(Input);C(_MklConv2D);D(Input);E(Sub);M(_MklInput);" "Mkl2Tf/_0(_MklToTf);N(_MklInput)|A->C;B->C:1;C->Mkl2Tf/_0;" - "C:1->Mkl2Tf/_0:1;D->E:1;M->C:2;Mkl2Tf/_0->E;N->C:3"); + "C:2->Mkl2Tf/_0:1;D->E:1;M->C:2;Mkl2Tf/_0->E;N->C:3"); } } // MklConv2D followed by MklToTf op followed by Non-Mkl layer. // C=MklConv2D(A,M,B,N); D=MklToTf(C:0, C:1) F=Sub(D,E) (for interleaved) -// C=MklConv2D(A,B,M,N); D=MklToTf(C:0, C:1) F=Sub(D,E) (for contiguous) +// C=MklConv2D(A,B,M,N); D=MklToTf(C:0, C:2) F=Sub(D,E) (for contiguous) // MklToTf node should not be inserted again. 
TEST_F(MklToTfConversionPass, Negative_DoubleInsert) { if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) { @@ -226,7 +226,7 @@ TEST_F(MklToTfConversionPass, Negative_DoubleInsert) { "node { name: 'D' op: '_MklToTf'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" - " input: ['C:0', 'C:1']}" + " input: ['C:0', 'C:2']}" "node { name: 'E' op: 'Input'}" "node { name: 'F' op: 'Sub'" " attr {key: 'T' value { type: DT_FLOAT } }" @@ -234,7 +234,7 @@ TEST_F(MklToTfConversionPass, Negative_DoubleInsert) { EXPECT_EQ(DoRunMklToTfConversionPass(), "A(Input);B(Input);C(_MklConv2D);D(_MklToTf);E(Input);" "F(Sub);M(_MklInput);N(_MklInput)|" - "A->C;B->C:1;C->D;C:1->D:1;D->F;E->F:1;M->C:2;N->C:3"); + "A->C;B->C:1;C->D;C:2->D:1;D->F;E->F:1;M->C:2;N->C:3"); } } diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 0ab6aff250b..1b1c88f2df4 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -396,6 +396,18 @@ Status GraphProperties::InferStatically() { } input_properties.push_back(properties); } + for (const auto& edge : node->in_edges()) { + if (!edge->src()->IsConstant()) { + continue; + } + const int input_id = edge->dst_input(); + if (input_id >= input_properties.size()) { + continue; + } + const NodeDef& node = edge->src()->def(); + const TensorProto& raw_val = node.attr().at("value").tensor(); + *input_properties[input_id].mutable_value() = raw_val; + } input_properties_[node->name()] = input_properties; // TODO(bsteiner): share this code with the input processing above. diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc index 954c5ead8fc..461e58cf736 100644 --- a/tensorflow/core/grappler/costs/graph_properties_test.cc +++ b/tensorflow/core/grappler/costs/graph_properties_test.cc @@ -345,6 +345,15 @@ TEST_F(GraphPropertiesTest, MergeWithoutLoops) { EXPECT_EQ(DT_FLOAT, prop.dtype()); EXPECT_EQ(expected_outputs[i], PropToString(prop)); } + + // The "Less" node should be fed by 2 int32 scalar constant values. 
+ const auto props = properties.GetInputProperties("Less"); + EXPECT_EQ(2, props.size()); + for (int i = 0; i < props.size(); ++i) { + EXPECT_EQ(DT_INT32, props[i].dtype()); + EXPECT_TRUE(props[i].has_value()); + EXPECT_EQ("int32: []", PropToString(props[i])); + } } TEST_F(GraphPropertiesTest, WhileLoop) { diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0893a012047..efc5d7c553a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -259,19 +259,13 @@ cc_library( cc_library( name = "conv_ops_gpu_hdrs", hdrs = ["conv_ops_gpu.h"], - deps = [ - ":eigen_helpers", - "//third_party/eigen3", - ], + deps = ["//third_party/eigen3"], ) cc_library( name = "gpu_util_hdrs", hdrs = ["gpu_utils.h"], - deps = [ - ":eigen_helpers", - "//third_party/eigen3", - ], + deps = ["//third_party/eigen3"], ) tf_cc_test( @@ -2587,14 +2581,17 @@ tf_kernel_library( tf_kernel_library( name = "reduction_ops", + srcs = ["reduction_ops_gpu_kernels.h"], prefix = "reduction_ops", - deps = MATH_DEPS, + deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]), ) tf_kernel_library( name = "segment_reduction_ops", prefix = "segment_reduction_ops", - deps = MATH_DEPS, + deps = MATH_DEPS + if_cuda([ + ":cuda_solvers", + ]), ) tf_kernel_library( @@ -3070,14 +3067,16 @@ tf_kernel_library( tf_kernel_library( name = "l2loss_op", prefix = "l2loss_op", + #srcs = ["reduction_ops_gpu_kernels.h"], deps = [ + ":reduction_ops", + "//third_party/eigen3", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:nn_grad", "//tensorflow/core:nn_ops_op_lib", - "//third_party/eigen3", - ], + ] + if_cuda(["@cub_archive//:cub"]), ) tf_cuda_cc_test( @@ -3346,6 +3345,20 @@ tf_kernel_library( deps = PARSING_DEPS, ) +tf_cc_test( + name = "parse_tensor_test", + srcs = ["parse_tensor_test.cc"], + deps = [ + ":ops_testutil", + ":ops_util", + ":parse_tensor_op", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_kernel_library( name = "string_to_number_op", prefix = "string_to_number_op", @@ -4668,6 +4681,8 @@ filegroup( "whole_file_read_ops.*", "sample_distorted_bounding_box_op.*", "ctc_loss_op.*", + "summary_interface.*", + "summary_kernels.*", "spectrogram_convert_test_data.cc", "sql_dataset_ops.cc", # Excluded due to experimental status: @@ -5957,6 +5972,43 @@ tf_kernel_library( ], ) +cc_library( + name = "summary_interface", + srcs = ["summary_interface.cc"], + hdrs = ["summary_interface.h"], + deps = [ + "//tensorflow/compiler/xla:util", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + ], +) + +cc_test( + name = "summary_interface_test", + srcs = ["summary_interface_test.cc"], + deps = [ + ":summary_interface", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_kernel_library( + name = "summary_kernels", + srcs = ["summary_kernels.cc"], + deps = [ + ":summary_interface", + "//tensorflow/core:framework", + "//tensorflow/core:summary_ops_op_lib", + ], +) + # ----------------------------------------------------------------------------- # Google-internal targets. These must be at the end for syncrepo. 
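Tying the variant registry changes above to the kernel changes below: a registered zeros-like callback is looked up by its (device, type_name) pair when ZerosLikeOp runs on a DT_VARIANT input. A minimal registration sketch, assuming only the macro and callback signature added in variant_op_registry.h; MyCounter and MyCounterZerosLike are hypothetical names, not part of the patch, and the Variant Encode/Decode hooks are omitted for brevity:

#include "tensorflow/core/framework/variant_op_registry.h"

namespace tensorflow {

// A user-defined value stored inside a Variant. TypeName() must return the
// same string that is passed to the registration macro below.
struct MyCounter {
  int64 count = 0;
  string TypeName() const { return "MyCounter"; }
};

// Callback signature required by the registry:
// Status(OpKernelContext*, const T&, T*).
Status MyCounterZerosLike(OpKernelContext* ctx, const MyCounter& c,
                          MyCounter* out) {
  out->count = 0;  // "Zeros" for this type is simply a reset counter.
  return Status::OK();
}

REGISTER_UNARY_VARIANT_ZEROS_LIKE_FUNCTION(DEVICE_CPU, MyCounter, "MyCounter",
                                           MyCounterZerosLike);

}  // namespace tensorflow

With such a registration in place, the DT_VARIANT branch added to ZerosLikeOp in constant_op.cc further below reaches the callback through CreateZerosLikeVariant<Device>, and returns an Internal error when nothing is registered for the (device, type_name) pair.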
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc index e07ca5e0c4c..ddc2d457b0e 100644 --- a/tensorflow/core/kernels/bias_op_gpu.cu.cc +++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc @@ -142,9 +142,9 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop, int group_size) { // Initialize the shared memory. typedef typename AccumulatorType::type AccT; - __shared__ AccT s_data[32]; - int32 s_data_size = sizeof(s_data) / sizeof(T); - for (int32 index = threadIdx.x; index < s_data_size; index += blockDim.x) { + const int32 kSDataSize = 32; + __shared__ AccT s_data[kSDataSize]; + for (int32 index = threadIdx.x; index < kSDataSize; index += blockDim.x) { s_data[index] = AccT(0); } __syncthreads(); diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index b4b37dd4b8e..cdc11452827 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -17,6 +17,10 @@ limitations under the License. #define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + #include "tensorflow/core/kernels/constant_op.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" @@ -26,13 +30,14 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/variant_op_registry.h" #include "tensorflow/core/kernels/bounds_check.h" #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/platform/macros.h" #ifdef TENSORFLOW_USE_SYCL #include "tensorflow/core/common_runtime/sycl/sycl_util.h" -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL namespace tensorflow { @@ -40,9 +45,8 @@ ConstantOp::ConstantOp(OpKernelConstruction* ctx) : OpKernel(ctx), tensor_(ctx->output_type(0)) { const TensorProto* proto = nullptr; OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto)); - OP_REQUIRES_OK(ctx, - ctx->device()->MakeTensorFromProto( - *proto, AllocatorAttributes(), &tensor_)); + OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto( + *proto, AllocatorAttributes(), &tensor_)); OP_REQUIRES( ctx, ctx->output_type(0) == tensor_.dtype(), errors::InvalidArgument("Type mismatch between value (", @@ -85,9 +89,9 @@ REGISTER_KERNEL(GPU, bool); #endif #ifdef TENSORFLOW_USE_SYCL -#define REGISTER_SYCL_KERNEL(D, TYPE) \ - REGISTER_KERNEL_BUILDER( \ - Name("Const").Device(DEVICE_##D).TypeConstraint("dtype"), \ +#define REGISTER_SYCL_KERNEL(D, TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Const").Device(DEVICE_##D).TypeConstraint("dtype"), \ ConstantOp); REGISTER_SYCL_KERNEL(SYCL, float); REGISTER_SYCL_KERNEL(SYCL, double); @@ -194,18 +198,18 @@ struct FillFunctor { void operator()(const SYCLDevice& d, typename TTypes::Flat out, typename TTypes::ConstScalar in) { #if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::array rank1{1}; + Eigen::array rank1{1}; #else - Eigen::IndexList> rank1; + Eigen::IndexList > rank1; #endif - const int size = out.dimension(0); - Eigen::array broadcast_dims{size}; + const int size = out.dimension(0); + Eigen::array broadcast_dims{size}; - To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims); + To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims); } }; -} -#endif // TENSORFLOW_USE_SYCL +} // namespace functor +#endif // TENSORFLOW_USE_SYCL #define REGISTER_KERNEL(D, TYPE) \ REGISTER_KERNEL_BUILDER(Name("Fill") \ @@ -273,11 +277,23 @@ class ZerosLikeOp : 
public OpKernel { void Compute(OpKernelContext* ctx) override { const Tensor& input = ctx->input(0); - Tensor* out = nullptr; - OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( - {0}, 0, input.shape(), &out)); - functor::SetZeroFunctor f; - f(ctx->eigen_device(), out->flat()); + const Device& d = ctx->eigen_device(); + if (std::is_same::value) { + OP_REQUIRES(ctx, input.dims() == 0, + errors::InvalidArgument( + "ZerosLike of non-unary Variant not supported.")); + const Variant& v = input.scalar()(); + Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({})); + Variant* out_v = &(out.scalar()()); + OP_REQUIRES_OK(ctx, CreateZerosLikeVariant(ctx, v, out_v)); + ctx->set_output(0, out); + } else { + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {0}, 0, input.shape(), &out)); + functor::SetZeroFunctor f; + f(d, out->flat()); + } } }; @@ -288,6 +304,7 @@ class ZerosLikeOp : public OpKernel { #define REGISTER_CPU(type) REGISTER_KERNEL(type, CPU) TF_CALL_POD_STRING_TYPES(REGISTER_CPU); +REGISTER_CPU(Variant); #undef REGISTER_CPU #ifdef TENSORFLOW_USE_SYCL @@ -315,6 +332,14 @@ REGISTER_KERNEL_BUILDER(Name("ZerosLike") .TypeConstraint("T") .HostMemory("y"), ZerosLikeOp); +// TODO(ebrevdo): Once rendezvous has been properly set up for +// Variants, we'll no longer need a HostMemory attribute for this case. +REGISTER_KERNEL_BUILDER(Name("ZerosLike") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .HostMemory("x") + .HostMemory("y"), + ZerosLikeOp); #endif // GOOGLE_CUDA #undef REGISTER_KERNEL diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h index 4bb0b7f3b41..8de8f1b2650 100644 --- a/tensorflow/core/kernels/conv_2d.h +++ b/tensorflow/core/kernels/conv_2d.h @@ -225,13 +225,13 @@ struct PadInput { const std::array& padding_right, typename TTypes::Tensor out, TensorFormat format) { - Eigen::array, NDIMS> padding; - padding[GetTensorDimIndex(format, 'N')] = std::make_pair(0, 0); + Eigen::array, NDIMS> padding; + padding[GetTensorDimIndex(format, 'N')] = {0, 0}; for (int i = 0; i < NDIMS - 2; ++i) { - padding[GetTensorDimIndex(format, '0' + i)] = - std::make_pair(padding_left[i], padding_right[i]); + padding[GetTensorDimIndex(format, '0' + i)] = { + padding_left[i], padding_right[i]}; } - padding[GetTensorDimIndex(format, 'C')] = std::make_pair(0, 0); + padding[GetTensorDimIndex(format, 'C')] = {0, 0}; out.device(d) = in.pad(padding); } }; diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc index 65514937f4e..8eb705b2e5f 100644 --- a/tensorflow/core/kernels/conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc @@ -91,6 +91,20 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, + int row_stride, int col_stride, const Padding& padding, + Tensor* filter_backprop, TensorFormat data_format) { + const CPUDevice& d = ctx->eigen_device(); + functor::SpatialConvolutionBackwardInput()( + d, filter_backprop->tensor(), input.tensor(), + out_backprop.tensor(), filter_backprop->dim_size(0), + filter_backprop->dim_size(1), row_stride, col_stride); + } +}; + #ifdef TENSORFLOW_USE_LIBXSMM template struct LaunchXsmmBackwardFilter { @@ -237,11 +251,9 @@ class Conv2DFastBackpropFilterOp : public OpKernel { } #endif - 
functor::SpatialConvolutionBackwardKernel()( - context->eigen_device(), filter_backprop->tensor(), - input.tensor(), out_backprop.tensor(), - dims.spatial_dims[0].filter_size, dims.spatial_dims[1].filter_size, - dims.spatial_dims[0].stride, dims.spatial_dims[1].stride); + LaunchConv2DBackpropInputOp()( + context, false, false, out_backprop, input, dims.spatial_dims[0].stride, + dims.spatial_dims[1].stride, padding_, filter_backprop, data_format_); } private: @@ -495,15 +507,10 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); use_cudnn_ &= CanUseCudnn(); cudnn_use_autotune_ = CudnnUseAutotune(); - cudnn_disable_conv_1x1_optimization_ = CudnnDisableConv1x1Optimization(); OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); } void Compute(OpKernelContext* context) override { - using perftools::gputools::dnn::AlgorithmConfig; - using perftools::gputools::dnn::AlgorithmType; - using perftools::gputools::dnn::ProfileResult; - using perftools::gputools::dnn::kDefaultAlgorithm; const Tensor& input = context->input(0); const Tensor& filter_sizes = context->input(1); const Tensor& out_backprop = context->input(2); @@ -512,340 +519,22 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { errors::InvalidArgument( "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", filter_sizes.dims())); - const TensorShape& input_shape = input.shape(); TensorShape filter_shape; OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( filter_sizes.vec(), &filter_shape)); - ConvBackpropDimensions dims; - OP_REQUIRES_OK(context, - ConvBackpropComputeDimensions( - "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2, - input.shape(), filter_shape, out_backprop.shape(), - strides_, padding_, data_format_, &dims)); - Tensor* filter_backprop = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, filter_shape, &filter_backprop)); - const int padding_rows = - (padding_ == VALID) - ? 0 - : std::max(0, (dims.spatial_dims[0].output_size - 1) * - dims.spatial_dims[0].stride + - dims.spatial_dims[0].filter_size - - dims.spatial_dims[0].input_size); - const int padding_cols = - (padding_ == VALID) - ? 0 - : std::max(0, (dims.spatial_dims[1].output_size - 1) * - dims.spatial_dims[1].stride + - dims.spatial_dims[1].filter_size - - dims.spatial_dims[1].input_size); + // For now we take the stride from the second and third dimensions only (we + // do not support striding on the batch or depth dimension). + const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); + const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); - // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only - // calling it when that is true. Remove this check when (if?) cuDNN starts - // supporting different padding. 
- bool rows_odd = (padding_rows % 2 != 0); - bool cols_odd = (padding_cols % 2 != 0); - - auto* stream = context->op_device_context()->stream(); - OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); - - if (!use_cudnn_) { - context->SetStatus(errors::Unimplemented( - "Conv2DBackprop for GPU is not currently supported " - "without cudnn")); - return; - } - - if (!cudnn_disable_conv_1x1_optimization_ && - dims.spatial_dims[0].filter_size == 1 && - dims.spatial_dims[1].filter_size == 1 && - dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && - data_format_ == FORMAT_NHWC) { - const uint64 m = dims.in_depth; - const uint64 k = dims.batch_size * dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size; - const uint64 n = dims.out_depth; - - // The shape of output backprop is - // [batch, out_rows, out_cols, out_depth] - // From cublas's perspective, it is: n x k - auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), - out_backprop.template flat().size()); - - // The shape of input is - // [batch, in_rows, in_cols, in_depth], - // From cublas's perspective, it is: m x k - auto b_ptr = AsDeviceMemory(input.template flat().data(), - input.template flat().size()); - - // the shape of the filter backprop from the conv_2d should be - // [1, 1, in_depth, out_depth] - // From cublas's perspective, it is: n x m - auto c_ptr = AsDeviceMemory(filter_backprop->template flat().data(), - filter_backprop->template flat().size()); - - bool blas_launch_status = - stream - ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose, - perftools::gputools::blas::Transpose::kTranspose, - n, m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, - ", n=", n, ", k=", k)); - } - return; - } else if (dims.spatial_dims[0].filter_size == - dims.spatial_dims[0].input_size && - dims.spatial_dims[1].filter_size == - dims.spatial_dims[1].input_size && - padding_ == VALID && data_format_ == FORMAT_NHWC) { - // The input data and filter have the same height/width, so call cublas - // directly. - const uint64 m = dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size * dims.in_depth; - const uint64 k = dims.batch_size; - const uint64 n = dims.out_depth; - - auto a_ptr = AsDeviceMemory(input.template flat().data(), - input.template flat().size()); - auto b_ptr = AsDeviceMemory(out_backprop.template flat().data(), - out_backprop.template flat().size()); - auto c_ptr = AsDeviceMemory(filter_backprop->template flat().data(), - filter_backprop->template flat().size()); - - bool blas_launch_status = - stream - ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose, - perftools::gputools::blas::Transpose::kTranspose, - n, m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, - ", n=", n, ", k=", k)); - } - return; - } - - Tensor compatible_input; - if (rows_odd || cols_odd) { - // If a padding dimension is odd, we have one more element on the right - // side or the bottom side. This is unsupported in cudnn. Therefore, - // we pad that extra element and make it compatible. 
- OP_REQUIRES_OK( - context, - context->allocate_temp( - DataTypeToEnum::value, - ShapeFromFormat(data_format_, dims.batch_size, - dims.spatial_dims[0].input_size + rows_odd, - dims.spatial_dims[1].input_size + cols_odd, - dims.in_depth), - &compatible_input)); - - functor::PadInput()( - context->template eigen_device(), - To32Bit(input.tensor()), {{0, 0}}, {{rows_odd, cols_odd}}, - To32Bit(compatible_input.tensor()), data_format_); - } else { - compatible_input = input; - } - - CHECK(padding_rows >= 0 && padding_cols >= 0) - << "Negative row or col paddings: (" << padding_rows << ", " - << padding_cols << ")"; - perftools::gputools::dnn::BatchDescriptor input_desc; - input_desc.set_count(dims.batch_size) - .set_height(GetTensorDim(compatible_input, data_format_, 'H')) - .set_width(GetTensorDim(compatible_input, data_format_, 'W')) - .set_feature_map_count(dims.in_depth) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::BatchDescriptor output_desc; - output_desc.set_count(dims.batch_size) - .set_height(dims.spatial_dims[0].output_size) - .set_width(dims.spatial_dims[1].output_size) - .set_feature_map_count(dims.out_depth) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::FilterDescriptor filter_desc; - filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) - .set_input_filter_width(dims.spatial_dims[1].filter_size) - .set_input_feature_map_count(dims.in_depth) - .set_output_feature_map_count(dims.out_depth); - perftools::gputools::dnn::ConvolutionDescriptor conv_desc; - conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride) - .set_horizontal_filter_stride(dims.spatial_dims[1].stride) - .set_zero_padding_height(padding_rows / 2) - .set_zero_padding_width(padding_cols / 2); - - // NOTE(zhengxq): - // cuDNN only supports the following layouts : - // Input : B x D x R x C - // Filter : OD x ID x R x C - // Whereas, we have - // Input : B x R x C x D - // Filter : R x C x ID x OD - // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) - // The first TransformDepth performs - // (B x R x C x D) => (B x D x R x C). - // Since the tensor returned from cuDNN is B x D x R x C also, - // the second TransformDepth performs - // (B x D x R x C) => (B x R x C x D). - - Tensor pre_transformed_filter_backprop; - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum::value, - TensorShape({dims.out_depth, dims.in_depth, - dims.spatial_dims[0].filter_size, - dims.spatial_dims[1].filter_size}), - &pre_transformed_filter_backprop)); - - Tensor transformed_out_backprop; - if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = ShapeFromFormat( - FORMAT_NCHW, dims.batch_size, dims.spatial_dims[0].output_size, - dims.spatial_dims[1].output_size, dims.out_depth); - if (dims.out_depth > 1) { - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum::value, nchw_shape, - &transformed_out_backprop)); - functor::NHWCToNCHW()( - context->eigen_device(), out_backprop.tensor(), - transformed_out_backprop.tensor()); - } else { - // If depth <= 1, just reshape. 
- CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); - } - } else { - transformed_out_backprop = out_backprop; - } - - Tensor transformed_input; - if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = ShapeFromFormat( - FORMAT_NCHW, GetTensorDim(compatible_input, data_format_, 'N'), - GetTensorDim(compatible_input, data_format_, 'H'), - GetTensorDim(compatible_input, data_format_, 'W'), - GetTensorDim(compatible_input, data_format_, 'C')); - if (nchw_shape.dim_size(1) > 1) { - OP_REQUIRES_OK(context, - context->allocate_temp(DataTypeToEnum::value, - nchw_shape, &transformed_input)); - functor::NHWCToNCHW()( - context->eigen_device(), - const_cast(compatible_input).tensor(), - transformed_input.tensor()); - } else { - // If depth <= 1, just reshape. - CHECK(transformed_input.CopyFrom(compatible_input, nchw_shape)); - } - } else { - transformed_input = compatible_input; - } - - auto out_backprop_ptr = - AsDeviceMemory(transformed_out_backprop.template flat().data(), - transformed_out_backprop.template flat().size()); - auto filter_backprop_ptr = AsDeviceMemory( - pre_transformed_filter_backprop.template flat().data(), - pre_transformed_filter_backprop.template flat().size()); - auto input_ptr = - AsDeviceMemory(transformed_input.template flat().data(), - transformed_input.template flat().size()); - - static int64 ConvolveBackwardFilterScratchSize = GetCudnnWorkspaceLimit( - "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default - ); - int device_id = stream->parent()->device_ordinal(); - DataType dtype = input.dtype(); - ConvParameters conv_parameters = { - dims.batch_size, // batch - dims.in_depth, // in_depths - {{input_desc.height(), // in_rows - input_desc.width()}}, // in_cols - dims.out_depth, // out_depths - {{dims.spatial_dims[0].filter_size, // filter_rows - dims.spatial_dims[1].filter_size}}, // filter_cols - {{dims.spatial_dims[0].stride, // stride_rows - dims.spatial_dims[1].stride}}, // stride_cols - {{padding_rows, // padding_rows - padding_cols}}, // padding_cols - dtype, // tensor datatype - device_id, // device_id - }; - AlgorithmConfig algorithm_config; - if (cudnn_use_autotune_ && !AutoTuneConvBwdFilter::GetInstance()->Find( - conv_parameters, &algorithm_config)) { - std::vector algorithms; - CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); - ProfileResult best_result; - ProfileResult best_result_no_scratch; - for (auto profile_algorithm : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. 
- CudnnScratchAllocator scratch_allocator( - ConvolveBackwardFilterScratchSize, context); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_desc, input_ptr, output_desc, out_backprop_ptr, - conv_desc, filter_desc, &filter_backprop_ptr, - &scratch_allocator, AlgorithmConfig(profile_algorithm), - &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } - } - } - } - OP_REQUIRES(context, - best_result.is_valid() || best_result_no_scratch.is_valid(), - errors::NotFound("No algorithm worked!")); - if (best_result.is_valid()) { - algorithm_config.set_algorithm(best_result.algorithm()); - } - if (best_result_no_scratch.is_valid()) { - algorithm_config.set_algorithm_no_scratch( - best_result_no_scratch.algorithm()); - } - AutoTuneConvBwdFilter::GetInstance()->Insert(conv_parameters, - algorithm_config); - } - CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, - context); - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardFilterWithAlgorithm( - input_desc, input_ptr, output_desc, out_backprop_ptr, conv_desc, - filter_desc, &filter_backprop_ptr, &scratch_allocator, - algorithm_config, nullptr) - .ok(); - - if (!cudnn_launch_status) { - context->SetStatus(errors::Internal( - "cuDNN Backward Filter function launch failure : input shape(", - input_shape.DebugString(), ") filter shape(", - filter_shape.DebugString(), ")")); - return; - } - - auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; - functor::ReverseTransformFilter()( - context->eigen_device(), - toConstTensor(pre_transformed_filter_backprop).template tensor(), - filter_backprop->tensor()); + launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input, + stride_rows, stride_cols, padding_, filter_backprop, + data_format_); } private: @@ -853,12 +542,351 @@ class Conv2DSlowBackpropFilterOp : public OpKernel { Padding padding_; bool use_cudnn_; TensorFormat data_format_; + LaunchConv2DBackpropFilterOp launcher_; bool cudnn_use_autotune_; - bool cudnn_disable_conv_1x1_optimization_; TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropFilterOp); }; +template +void LaunchConv2DBackpropFilterOp::operator()( + OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, int row_stride, + int col_stride, const Padding& padding, Tensor* filter_backprop, + TensorFormat data_format) { + using perftools::gputools::dnn::AlgorithmConfig; + using perftools::gputools::dnn::AlgorithmType; + using perftools::gputools::dnn::ProfileResult; + + std::vector strides(4, 1); + strides[GetTensorDimIndex(data_format, 'H')] = row_stride; + strides[GetTensorDimIndex(data_format, 'W')] = col_stride; + TensorShape filter_shape = filter_backprop->shape(); + + ConvBackpropDimensions dims; + OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions( + "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2, + input.shape(), filter_shape, out_backprop.shape(), + strides, padding, data_format, &dims)); + + const int padding_rows = + (padding == VALID) + ? 
0 + : std::max(0, (dims.spatial_dims[0].output_size - 1) * + dims.spatial_dims[0].stride + + dims.spatial_dims[0].filter_size - + dims.spatial_dims[0].input_size); + const int padding_cols = + (padding == VALID) + ? 0 + : std::max(0, (dims.spatial_dims[1].output_size - 1) * + dims.spatial_dims[1].stride + + dims.spatial_dims[1].filter_size - + dims.spatial_dims[1].input_size); + + // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only + // calling it when that is true. Remove this check when (if?) cuDNN starts + // supporting different padding. + bool rows_odd = (padding_rows % 2 != 0); + bool cols_odd = (padding_cols % 2 != 0); + + auto* stream = ctx->op_device_context()->stream(); + OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); + + if (!use_cudnn) { + ctx->SetStatus(errors::Unimplemented( + "Conv2DBackprop for GPU is not currently supported " + "without cudnn")); + return; + } + + bool cudnn_disable_conv_1x1_optimization_ = CudnnDisableConv1x1Optimization(); + if (!cudnn_disable_conv_1x1_optimization_ && + dims.spatial_dims[0].filter_size == 1 && + dims.spatial_dims[1].filter_size == 1 && + dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && + data_format == FORMAT_NHWC) { + const uint64 m = dims.in_depth; + const uint64 k = dims.batch_size * dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size; + const uint64 n = dims.out_depth; + + // The shape of output backprop is + // [batch, out_rows, out_cols, out_depth] + // From cublas's perspective, it is: n x k + auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), + out_backprop.template flat().size()); + + // The shape of input is + // [batch, in_rows, in_cols, in_depth], + // From cublas's perspective, it is: m x k + auto b_ptr = AsDeviceMemory(input.template flat().data(), + input.template flat().size()); + + // the shape of the filter backprop from the conv_2d should be + // [1, 1, in_depth, out_depth] + // From cublas's perspective, it is: n x m + auto c_ptr = AsDeviceMemory(filter_backprop->template flat().data(), + filter_backprop->template flat().size()); + + bool blas_launch_status = + stream + ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose, n, + m, k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } else if (dims.spatial_dims[0].filter_size == + dims.spatial_dims[0].input_size && + dims.spatial_dims[1].filter_size == + dims.spatial_dims[1].input_size && + padding == VALID && data_format == FORMAT_NHWC) { + // The input data and filter have the same height/width, so call cublas + // directly. 
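// [Sketch, not part of the patch] How the SAME padding computed above is
// derived for one spatial dimension, and why the odd remainder matters:
// cuDNN accepts only one symmetric padding value per dimension, so an odd
// total is handled by first padding the input with an extra row/col (the
// rows_odd/cols_odd flags above). The helper name below is hypothetical.
#include <algorithm>
#include <cstdint>

int64_t SamePaddingTotal(int64_t input, int64_t filter, int64_t stride) {
  const int64_t output = (input + stride - 1) / stride;  // SAME output size
  return std::max<int64_t>(0, (output - 1) * stride + filter - input);
}
// Example: input=7, filter=3, stride=2 gives output=4 and a total padding of
// (4-1)*2 + 3 - 7 = 2, i.e. one zero row on each side. A total of 3 would set
// rows_odd, pad the input by one extra row, and pass 3/2 = 1 to cuDNN.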
+ const uint64 m = dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size * dims.in_depth; + const uint64 k = dims.batch_size; + const uint64 n = dims.out_depth; + + auto a_ptr = AsDeviceMemory(input.template flat().data(), + input.template flat().size()); + auto b_ptr = AsDeviceMemory(out_backprop.template flat().data(), + out_backprop.template flat().size()); + auto c_ptr = AsDeviceMemory(filter_backprop->template flat().data(), + filter_backprop->template flat().size()); + + bool blas_launch_status = + stream + ->ThenBlasGemm(perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose, n, + m, k, 1.0f, b_ptr, n, a_ptr, m, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } + + Tensor compatible_input; + if (rows_odd || cols_odd) { + // If a padding dimension is odd, we have one more element on the right + // side or the bottom side. This is unsupported in cudnn. Therefore, + // we pad that extra element and make it compatible. + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat(data_format, dims.batch_size, + dims.spatial_dims[0].input_size + rows_odd, + dims.spatial_dims[1].input_size + cols_odd, + dims.in_depth), + &compatible_input)); + + functor::PadInput()( + ctx->template eigen_device(), To32Bit(input.tensor()), + {{0, 0}}, {{rows_odd, cols_odd}}, + To32Bit(compatible_input.tensor()), data_format); + } else { + compatible_input = input; + } + + CHECK(padding_rows >= 0 && padding_cols >= 0) + << "Negative row or col paddings: (" << padding_rows << ", " + << padding_cols << ")"; + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(dims.batch_size) + .set_height(GetTensorDim(compatible_input, data_format, 'H')) + .set_width(GetTensorDim(compatible_input, data_format, 'W')) + .set_feature_map_count(dims.in_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(dims.batch_size) + .set_height(dims.spatial_dims[0].output_size) + .set_width(dims.spatial_dims[1].output_size) + .set_feature_map_count(dims.out_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) + .set_input_filter_width(dims.spatial_dims[1].filter_size) + .set_input_feature_map_count(dims.in_depth) + .set_output_feature_map_count(dims.out_depth); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride) + .set_horizontal_filter_stride(dims.spatial_dims[1].stride) + .set_zero_padding_height(padding_rows / 2) + .set_zero_padding_width(padding_cols / 2); + + // NOTE(zhengxq): + // cuDNN only supports the following layouts : + // Input : B x D x R x C + // Filter : OD x ID x R x C + // Whereas, we have + // Input : B x R x C x D + // Filter : R x C x ID x OD + // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) + // The first TransformDepth performs + // (B x R x C x D) => (B x D x R x C). + // Since the tensor returned from cuDNN is B x D x R x C also, + // the second TransformDepth performs + // (B x D x R x C) => (B x R x C x D). 
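// [Sketch, not part of the patch] What the NHWC-to-NCHW transform used below
// accomplishes, written as a naive CPU copy so the index mapping
// (b, h, w, c) -> (b, c, h, w) is explicit. The real functor runs on the GPU
// device; this loop and its name are only illustrative.
#include <cstdint>

void NhwcToNchw(const float* src, float* dst, int64_t n, int64_t h, int64_t w,
                int64_t c) {
  for (int64_t b = 0; b < n; ++b)
    for (int64_t y = 0; y < h; ++y)
      for (int64_t x = 0; x < w; ++x)
        for (int64_t d = 0; d < c; ++d)
          dst[((b * c + d) * h + y) * w + x] =
              src[((b * h + y) * w + x) * c + d];
}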
+ + Tensor pre_transformed_filter_backprop; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({dims.out_depth, dims.in_depth, + dims.spatial_dims[0].filter_size, + dims.spatial_dims[1].filter_size}), + &pre_transformed_filter_backprop)); + + Tensor transformed_out_backprop; + if (data_format == FORMAT_NHWC) { + TensorShape nchw_shape = ShapeFromFormat( + FORMAT_NCHW, dims.batch_size, dims.spatial_dims[0].output_size, + dims.spatial_dims[1].output_size, dims.out_depth); + if (dims.out_depth > 1) { + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::value, nchw_shape, + &transformed_out_backprop)); + functor::NHWCToNCHW()( + ctx->eigen_device(), out_backprop.tensor(), + transformed_out_backprop.tensor()); + } else { + // If depth <= 1, just reshape. + CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); + } + } else { + transformed_out_backprop = out_backprop; + } + + Tensor transformed_input; + if (data_format == FORMAT_NHWC) { + TensorShape nchw_shape = ShapeFromFormat( + FORMAT_NCHW, GetTensorDim(compatible_input, data_format, 'N'), + GetTensorDim(compatible_input, data_format, 'H'), + GetTensorDim(compatible_input, data_format, 'W'), + GetTensorDim(compatible_input, data_format, 'C')); + if (nchw_shape.dim_size(1) > 1) { + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + nchw_shape, &transformed_input)); + functor::NHWCToNCHW()( + ctx->eigen_device(), + const_cast(compatible_input).tensor(), + transformed_input.tensor()); + } else { + // If depth <= 1, just reshape. + CHECK(transformed_input.CopyFrom(compatible_input, nchw_shape)); + } + } else { + transformed_input = compatible_input; + } + + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat().data(), + transformed_out_backprop.template flat().size()); + auto filter_backprop_ptr = + AsDeviceMemory(pre_transformed_filter_backprop.template flat().data(), + pre_transformed_filter_backprop.template flat().size()); + auto input_ptr = AsDeviceMemory(transformed_input.template flat().data(), + transformed_input.template flat().size()); + + static int64 ConvolveBackwardFilterScratchSize = GetCudnnWorkspaceLimit( + "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default + ); + int device_id = stream->parent()->device_ordinal(); + DataType dtype = input.dtype(); + ConvParameters conv_parameters = { + dims.batch_size, // batch + dims.in_depth, // in_depths + {{input_desc.height(), // in_rows + input_desc.width()}}, // in_cols + dims.out_depth, // out_depths + {{dims.spatial_dims[0].filter_size, // filter_rows + dims.spatial_dims[1].filter_size}}, // filter_cols + {{dims.spatial_dims[0].stride, // stride_rows + dims.spatial_dims[1].stride}}, // stride_cols + {{padding_rows, // padding_rows + padding_cols}}, // padding_cols + dtype, // tensor datatype + device_id, // device_id + }; + AlgorithmConfig algorithm_config; + if (cudnn_use_autotune && !AutoTuneConvBwdFilter::GetInstance()->Find( + conv_parameters, &algorithm_config)) { + std::vector algorithms; + CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms( + conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); + ProfileResult best_result; + ProfileResult best_result_no_scratch; + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. 
+ CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, + ctx); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardFilterWithAlgorithm( + input_desc, input_ptr, output_desc, out_backprop_ptr, + conv_desc, filter_desc, &filter_backprop_ptr, + &scratch_allocator, AlgorithmConfig(profile_algorithm), + &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; + } + } + } + } + OP_REQUIRES(ctx, + best_result.is_valid() || best_result_no_scratch.is_valid(), + errors::NotFound("No algorithm worked!")); + if (best_result.is_valid()) { + algorithm_config.set_algorithm(best_result.algorithm()); + } + if (best_result_no_scratch.is_valid()) { + algorithm_config.set_algorithm_no_scratch( + best_result_no_scratch.algorithm()); + } + AutoTuneConvBwdFilter::GetInstance()->Insert(conv_parameters, + algorithm_config); + } + CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, + ctx); + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardFilterWithAlgorithm( + input_desc, input_ptr, output_desc, out_backprop_ptr, conv_desc, + filter_desc, &filter_backprop_ptr, &scratch_allocator, + algorithm_config, nullptr) + .ok(); + + if (!cudnn_launch_status) { + ctx->SetStatus(errors::Internal( + "cuDNN Backward Filter function launch failure : input shape(", + input.shape().DebugString(), ") filter shape(", + filter_shape.DebugString(), ")")); + return; + } + + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::ReverseTransformFilter()( + ctx->eigen_device(), + toConstTensor(pre_transformed_filter_backprop).template tensor(), + filter_backprop->tensor()); +} + // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc index a5a9549a2f9..ce561aa99c2 100644 --- a/tensorflow/core/kernels/conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/conv_grad_input_ops.cc @@ -97,29 +97,17 @@ typedef Eigen::GpuDevice GPUDevice; // for CPU for now since nvcc times out when trying to compile them. // TODO(yangke): enable them for GPUs when we have a faster compiler. 
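Both the filter-backprop launcher that ends above and the input-backprop launcher later in this diff follow the same autotuning scheme: on the first call with a given set of convolution parameters every available cuDNN algorithm is profiled once, the fastest result (with and without scratch space) is remembered, and later calls reuse the cached choice. A minimal sketch of that pattern, with hypothetical names and a plain map standing in for the AutoTuneSingleton used here:

#include <map>
#include <string>
#include <vector>

// Assumed helpers: list the candidate algorithms and time one run of each.
std::vector<int> AvailableAlgorithms();
double ProfileOnceMs(int algorithm);

int PickAlgorithm(const std::string& conv_key) {
  static std::map<std::string, int> cache;  // AutoTuneConvBwdFilter analogue
  auto it = cache.find(conv_key);
  if (it != cache.end()) return it->second;
  int best = -1;
  double best_ms = 1e30;
  for (int algo : AvailableAlgorithms()) {
    const double ms = ProfileOnceMs(algo);  // one profiling run per algorithm
    if (ms < best_ms) {
      best_ms = ms;
      best = algo;
    }
  }
  cache[conv_key] = best;
  return best;
}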
-template -struct LaunchBackwardInputConvolution { - bool operator()(OpKernelContext* context, const Device&, - typename TTypes::Tensor, - typename TTypes::ConstTensor, - typename TTypes::ConstTensor, int, int, int, int, - TensorFormat) const { - return false; - } -}; - -template <> -struct LaunchBackwardInputConvolution { - bool operator()(OpKernelContext* context, const CPUDevice& d, - typename TTypes::Tensor input_backward, - typename TTypes::ConstTensor kernel, - typename TTypes::ConstTensor output_backward, - int input_rows, int input_cols, int row_stride, - int col_stride, TensorFormat data_format) const { - functor::SpatialConvolutionBackwardInput()( - d, input_backward, kernel, output_backward, input_rows, input_cols, - row_stride, col_stride); - return true; +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, + int row_stride, int col_stride, const Padding& padding, + Tensor* in_backprop, TensorFormat data_format) { + const CPUDevice& d = ctx->eigen_device(); + functor::SpatialConvolutionBackwardInput()( + d, in_backprop->tensor(), filter.tensor(), + out_backprop.tensor(), in_backprop->dim_size(1), + in_backprop->dim_size(2), row_stride, col_stride); } }; @@ -268,11 +256,10 @@ class Conv2DFastBackpropInputOp : public OpKernel { } #endif - LaunchBackwardInputConvolution()( - context, context->eigen_device(), in_backprop->tensor(), - filter.tensor(), out_backprop.tensor(), - dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size, - dims.spatial_dims[0].stride, dims.spatial_dims[1].stride, data_format_); + LaunchConv2DBackpropInputOp()( + context, false, false, out_backprop, filter, + dims.spatial_dims[0].stride, dims.spatial_dims[1].stride, padding_, + in_backprop, data_format_); } private: @@ -600,10 +587,6 @@ class Conv2DSlowBackpropInputOp : public OpKernel { } void Compute(OpKernelContext* context) override { - using perftools::gputools::dnn::AlgorithmConfig; - using perftools::gputools::dnn::AlgorithmType; - using perftools::gputools::dnn::ProfileResult; - using perftools::gputools::dnn::kDefaultAlgorithm; const Tensor& input_sizes = context->input(0); const Tensor& filter = context->input(1); const Tensor& out_backprop = context->input(2); @@ -615,340 +598,18 @@ class Conv2DSlowBackpropInputOp : public OpKernel { TensorShape input_shape; OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( input_sizes.vec(), &input_shape)); - const TensorShape& filter_shape = filter.shape(); - - ConvBackpropDimensions dims; - OP_REQUIRES_OK( - context, ConvBackpropComputeDimensions( - "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, - input_shape, filter_shape, out_backprop.shape(), strides_, - padding_, data_format_, &dims)); Tensor* in_backprop = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, input_shape, &in_backprop)); - const int padding_rows = - (padding_ == VALID) - ? 0 - : std::max(0, (dims.spatial_dims[0].output_size - 1) * - dims.spatial_dims[0].stride + - dims.spatial_dims[0].filter_size - - dims.spatial_dims[0].input_size); - const int padding_cols = - (padding_ == VALID) - ? 0 - : std::max(0, (dims.spatial_dims[1].output_size - 1) * - dims.spatial_dims[1].stride + - dims.spatial_dims[1].filter_size - - dims.spatial_dims[1].input_size); + // For now we take the stride from the second and third dimensions only (we + // do not support striding on the batch or depth dimension). 
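// [Sketch, not part of the patch] A scalar reference for what the CPU
// SpatialConvolutionBackwardInput call above computes: the input gradient is
// the output gradient scattered back through every filter tap. VALID padding,
// NHWC tensors and an HWIO filter are assumed; the function is hypothetical.
#include <cstdint>

void Conv2DBackpropInputReference(const float* dy, const float* filter,
                                  float* dx, int64_t n, int64_t out_h,
                                  int64_t out_w, int64_t out_d, int64_t f_h,
                                  int64_t f_w, int64_t in_d, int64_t in_h,
                                  int64_t in_w, int64_t stride_r,
                                  int64_t stride_c) {
  for (int64_t i = 0; i < n * in_h * in_w * in_d; ++i) dx[i] = 0.0f;
  for (int64_t b = 0; b < n; ++b)
    for (int64_t r = 0; r < out_h; ++r)
      for (int64_t c = 0; c < out_w; ++c)
        for (int64_t o = 0; o < out_d; ++o) {
          const float g = dy[((b * out_h + r) * out_w + c) * out_d + o];
          for (int64_t fr = 0; fr < f_h; ++fr)
            for (int64_t fc = 0; fc < f_w; ++fc)
              for (int64_t d = 0; d < in_d; ++d)
                dx[((b * in_h + (r * stride_r + fr)) * in_w +
                    (c * stride_c + fc)) * in_d + d] +=
                    g * filter[((fr * f_w + fc) * in_d + d) * out_d + o];
        }
}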
+ const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); + const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); - // TODO(keveman): cuDNN only supports equal padding on both sides, so only - // calling it when that is true. Remove this check when (if?) cuDNN starts - // supporting different padding. - bool rows_odd = (padding_rows % 2 != 0); - bool cols_odd = (padding_cols % 2 != 0); - - auto* stream = context->op_device_context()->stream(); - OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); - - if (!use_cudnn_) { - context->SetStatus(errors::Unimplemented( - "Conv2DBackpropInput for GPU is not currently supported " - "without cudnn")); - return; - } - - if (dims.spatial_dims[0].filter_size == 1 && - dims.spatial_dims[1].filter_size == 1 && - dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && - data_format_ == FORMAT_NHWC) { - // 1x1 filter, so call cublas directly. - const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size; - const uint64 k = dims.out_depth; - const uint64 n = dims.in_depth; - - auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), - out_backprop.template flat().size()); - auto b_ptr = AsDeviceMemory(filter.template flat().data(), - filter.template flat().size()); - auto c_ptr = AsDeviceMemory(in_backprop->template flat().data(), - in_backprop->template flat().size()); - - auto transpose = perftools::gputools::blas::Transpose::kTranspose; - auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; - - bool blas_launch_status = - stream - ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, - a_ptr, k, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, - ", n=", n, ", k=", k)); - } - return; - } else if (dims.spatial_dims[0].filter_size == - dims.spatial_dims[0].input_size && - dims.spatial_dims[1].filter_size == - dims.spatial_dims[1].input_size && - padding_ == VALID && data_format_ == FORMAT_NHWC) { - // The input data and filter have the same height/width, so call cublas - // directly. - const uint64 m = dims.batch_size; - const uint64 k = dims.out_depth; - const uint64 n = dims.spatial_dims[0].input_size * - dims.spatial_dims[1].input_size * dims.in_depth; - - auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), - out_backprop.template flat().size()); - auto b_ptr = AsDeviceMemory(filter.template flat().data(), - filter.template flat().size()); - auto c_ptr = AsDeviceMemory(in_backprop->template flat().data(), - in_backprop->template flat().size()); - - auto transpose = perftools::gputools::blas::Transpose::kTranspose; - auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; - - bool blas_launch_status = - stream - ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, - a_ptr, k, 0.0f, &c_ptr, n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, - ", n=", n, ", k=", k)); - } - return; - } - - TensorShape compatible_input_shape; - if (rows_odd || cols_odd) { - // If a padding dimension is odd, we have one more element on the right - // side or the bottom side. This is unsupported in cudnn. Therefore, - // we pad that extra element and make it compatible. 
- compatible_input_shape = ShapeFromFormat( - data_format_, dims.batch_size, - dims.spatial_dims[0].input_size + rows_odd, - dims.spatial_dims[1].input_size + cols_odd, dims.in_depth); - } else { - compatible_input_shape = input_shape; - } - - CHECK(padding_rows >= 0 && padding_cols >= 0) - << "Negative row or col paddings: (" << padding_rows << ", " - << padding_cols << ")"; - perftools::gputools::dnn::BatchDescriptor input_desc; - input_desc.set_count(dims.batch_size) - .set_height(GetTensorDim(compatible_input_shape, data_format_, 'H')) - .set_width(GetTensorDim(compatible_input_shape, data_format_, 'W')) - .set_feature_map_count(dims.in_depth) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::BatchDescriptor output_desc; - output_desc.set_count(dims.batch_size) - .set_height(dims.spatial_dims[0].output_size) - .set_width(dims.spatial_dims[1].output_size) - .set_feature_map_count(dims.out_depth) - .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); - perftools::gputools::dnn::FilterDescriptor filter_desc; - filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) - .set_input_filter_width(dims.spatial_dims[1].filter_size) - .set_input_feature_map_count(dims.in_depth) - .set_output_feature_map_count(dims.out_depth); - perftools::gputools::dnn::ConvolutionDescriptor conv_desc; - conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride) - .set_horizontal_filter_stride(dims.spatial_dims[1].stride) - .set_zero_padding_height(padding_rows / 2) - .set_zero_padding_width(padding_cols / 2); - - // NOTE(keveman): - // cuDNN only supports the following layouts : - // Input : B x D x R x C - // Filter : OD x ID x R x C - // Whereas, we have - // Input : B x R x C x D - // Filter : R x C x ID x OD - // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) - // The first TransformDepth performs - // (B x R x C x D) => (B x D x R x C). - // Since the tensor returned from cuDNN is B x D x R x C also, - // the second TransformDepth performs - // (B x D x R x C) => (B x R x C x D). - Tensor transformed_filter; - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum::value, - TensorShape({dims.out_depth, dims.in_depth, - dims.spatial_dims[0].filter_size, - dims.spatial_dims[1].filter_size}), - &transformed_filter)); - - functor::TransformFilter()( - context->eigen_device(), To32Bit(filter.tensor()), - To32Bit(transformed_filter.tensor())); - - Tensor transformed_out_backprop; - if (data_format_ == FORMAT_NHWC) { - TensorShape nchw_shape = ShapeFromFormat( - FORMAT_NCHW, dims.batch_size, dims.spatial_dims[0].output_size, - dims.spatial_dims[1].output_size, dims.out_depth); - if (dims.out_depth > 1) { - OP_REQUIRES_OK(context, context->allocate_temp( - DataTypeToEnum::value, nchw_shape, - &transformed_out_backprop)); - functor::NHWCToNCHW()( - context->eigen_device(), out_backprop.tensor(), - transformed_out_backprop.tensor()); - } else { - // If depth <= 1, then just reshape. 
- CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); - } - } else { - transformed_out_backprop = out_backprop; - } - - Tensor pre_transformed_in_backprop; - OP_REQUIRES_OK( - context, - context->allocate_temp( - DataTypeToEnum::value, - ShapeFromFormat( - FORMAT_NCHW, - GetTensorDim(compatible_input_shape, data_format_, 'N'), - GetTensorDim(compatible_input_shape, data_format_, 'H'), - GetTensorDim(compatible_input_shape, data_format_, 'W'), - GetTensorDim(compatible_input_shape, data_format_, 'C')), - &pre_transformed_in_backprop)); - - auto out_backprop_ptr = - AsDeviceMemory(transformed_out_backprop.template flat().data(), - transformed_out_backprop.template flat().size()); - auto filter_ptr = - AsDeviceMemory(transformed_filter.template flat().data(), - transformed_filter.template flat().size()); - auto in_backprop_ptr = - AsDeviceMemory(pre_transformed_in_backprop.template flat().data(), - pre_transformed_in_backprop.template flat().size()); - - static int64 ConvolveBackwardDataScratchSize = GetCudnnWorkspaceLimit( - "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default - ); - CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, - context); - int device_id = stream->parent()->device_ordinal(); - DataType dtype = out_backprop.dtype(); - ConvParameters conv_parameters = { - dims.batch_size, // batch - dims.in_depth, // in_depths - {{input_desc.height(), // in_rows - input_desc.width()}}, // in_cols - dims.out_depth, // out_depths - {{dims.spatial_dims[0].filter_size, // filter_rows - dims.spatial_dims[1].filter_size}}, // filter_cols - {{dims.spatial_dims[0].stride, // stride_rows - dims.spatial_dims[1].stride}}, // stride_cols - {{padding_rows, // padding_rows - padding_cols}}, // padding_cols - dtype, // tensor data type - device_id, // device_id - }; - AlgorithmConfig algorithm_config; - if (cudnn_use_autotune_ && !AutoTuneConvBwdData::GetInstance()->Find( - conv_parameters, &algorithm_config)) { - std::vector algorithms; - CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( - conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); - ProfileResult best_result; - ProfileResult best_result_no_scratch; - for (auto profile_algorithm : algorithms) { - // TODO(zhengxq): profile each algorithm multiple times to better - // accuracy. 
- CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, - context); - ProfileResult profile_result; - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_desc, filter_ptr, output_desc, out_backprop_ptr, - conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, - AlgorithmConfig(profile_algorithm), &profile_result) - .ok(); - if (cudnn_launch_status) { - if (profile_result.is_valid()) { - if (profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; - } - if (scratch_allocator.TotalByteSize() == 0 && - profile_result.elapsed_time_in_ms() < - best_result_no_scratch.elapsed_time_in_ms()) { - best_result_no_scratch = profile_result; - } - } - } - } - OP_REQUIRES(context, - best_result.is_valid() || best_result_no_scratch.is_valid(), - errors::NotFound("No algorithm worked!")); - if (best_result.is_valid()) { - algorithm_config.set_algorithm(best_result.algorithm()); - } - if (best_result_no_scratch.is_valid()) { - algorithm_config.set_algorithm_no_scratch( - best_result_no_scratch.algorithm()); - } - AutoTuneConvBwdData::GetInstance()->Insert(conv_parameters, - algorithm_config); - } - bool cudnn_launch_status = - stream - ->ThenConvolveBackwardDataWithAlgorithm( - filter_desc, filter_ptr, output_desc, out_backprop_ptr, - conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, - algorithm_config, nullptr) - .ok(); - - if (!cudnn_launch_status) { - context->SetStatus(errors::Internal( - "cuDNN Backward Data function launch failure : input shape(", - input_shape.DebugString(), ") filter shape(", - filter_shape.DebugString(), ")")); - return; - } - - if (rows_odd || cols_odd) { - Tensor in_backprop_remove_padding; - OP_REQUIRES_OK( - context, - context->allocate_temp( - DataTypeToEnum::value, - ShapeFromFormat(FORMAT_NCHW, - GetTensorDim(input_shape, data_format_, 'N'), - GetTensorDim(input_shape, data_format_, 'H'), - GetTensorDim(input_shape, data_format_, 'W'), - GetTensorDim(input_shape, data_format_, 'C')), - &in_backprop_remove_padding)); - - // Remove the padding for odd rows or cols. 
- functor::PadInput()( - context->template eigen_device(), - To32Bit(const_cast(pre_transformed_in_backprop) - .tensor()), - {{0, 0}}, {{-rows_odd, -cols_odd}}, - To32Bit(in_backprop_remove_padding.tensor()), FORMAT_NCHW); - - pre_transformed_in_backprop = in_backprop_remove_padding; - } - - if (data_format_ == FORMAT_NHWC) { - auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; - functor::NCHWToNHWC()( - context->eigen_device(), - toConstTensor(pre_transformed_in_backprop).template tensor(), - in_backprop->tensor()); - } else { - *in_backprop = pre_transformed_in_backprop; - } + launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter, + stride_rows, stride_cols, padding_, in_backprop, data_format_); } private: @@ -956,11 +617,354 @@ class Conv2DSlowBackpropInputOp : public OpKernel { Padding padding_; bool use_cudnn_; TensorFormat data_format_; + LaunchConv2DBackpropInputOp launcher_; bool cudnn_use_autotune_; TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropInputOp); }; +template +void LaunchConv2DBackpropInputOp::operator()( + OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* in_backprop, + TensorFormat data_format) { + using perftools::gputools::dnn::AlgorithmConfig; + using perftools::gputools::dnn::AlgorithmType; + using perftools::gputools::dnn::ProfileResult; + + std::vector strides(4, 1); + strides[GetTensorDimIndex(data_format, 'H')] = row_stride; + strides[GetTensorDimIndex(data_format, 'W')] = col_stride; + TensorShape input_shape = in_backprop->shape(); + + const TensorShape& filter_shape = filter.shape(); + ConvBackpropDimensions dims; + OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions( + "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, + input_shape, filter_shape, out_backprop.shape(), + strides, padding, data_format, &dims)); + + const int padding_rows = + (padding == VALID) + ? 0 + : std::max(0, (dims.spatial_dims[0].output_size - 1) * + dims.spatial_dims[0].stride + + dims.spatial_dims[0].filter_size - + dims.spatial_dims[0].input_size); + const int padding_cols = + (padding == VALID) + ? 0 + : std::max(0, (dims.spatial_dims[1].output_size - 1) * + dims.spatial_dims[1].stride + + dims.spatial_dims[1].filter_size - + dims.spatial_dims[1].input_size); + + // TODO(keveman): cuDNN only supports equal padding on both sides, so only + // calling it when that is true. Remove this check when (if?) cuDNN starts + // supporting different padding. + bool rows_odd = (padding_rows % 2 != 0); + bool cols_odd = (padding_cols % 2 != 0); + + auto* stream = ctx->op_device_context()->stream(); + OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); + + if (!use_cudnn) { + ctx->SetStatus(errors::Unimplemented( + "Conv2DBackpropInput for GPU is not currently supported " + "without cudnn")); + return; + } + + if (dims.spatial_dims[0].filter_size == 1 && + dims.spatial_dims[1].filter_size == 1 && + dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 && + data_format == FORMAT_NHWC) { + // 1x1 filter, so call cublas directly. 
+ const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size; + const uint64 k = dims.out_depth; + const uint64 n = dims.in_depth; + + auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), + out_backprop.template flat().size()); + auto b_ptr = AsDeviceMemory(filter.template flat().data(), + filter.template flat().size()); + auto c_ptr = AsDeviceMemory(in_backprop->template flat().data(), + in_backprop->template flat().size()); + + auto transpose = perftools::gputools::blas::Transpose::kTranspose; + auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + + bool blas_launch_status = + stream + ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, + a_ptr, k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } else if (dims.spatial_dims[0].filter_size == + dims.spatial_dims[0].input_size && + dims.spatial_dims[1].filter_size == + dims.spatial_dims[1].input_size && + padding == VALID && data_format == FORMAT_NHWC) { + // The input data and filter have the same height/width, so call cublas + // directly. + const uint64 m = dims.batch_size; + const uint64 k = dims.out_depth; + const uint64 n = dims.spatial_dims[0].input_size * + dims.spatial_dims[1].input_size * dims.in_depth; + + auto a_ptr = AsDeviceMemory(out_backprop.template flat().data(), + out_backprop.template flat().size()); + auto b_ptr = AsDeviceMemory(filter.template flat().data(), + filter.template flat().size()); + auto c_ptr = AsDeviceMemory(in_backprop->template flat().data(), + in_backprop->template flat().size()); + + auto transpose = perftools::gputools::blas::Transpose::kTranspose; + auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + + bool blas_launch_status = + stream + ->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, k, + a_ptr, k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } + + TensorShape compatible_input_shape; + if (rows_odd || cols_odd) { + // If a padding dimension is odd, we have one more element on the right + // side or the bottom side. This is unsupported in cudnn. Therefore, + // we pad that extra element and make it compatible. 
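// [Sketch, not part of the patch] Why the 1x1 / stride-1 NHWC case above is a
// single GEMM: with a [1, 1, in_depth, out_depth] filter the forward pass is
// Y = X * W on (N*H*W, in_depth) x (in_depth, out_depth) matrices, so the
// input gradient is dX = dY * W^T. A host-side Eigen analogue of the cuBLAS
// call (shapes and name are hypothetical):
#include <Eigen/Dense>

Eigen::MatrixXf Conv1x1BackpropInput(const Eigen::MatrixXf& dy,   // (N*H*W, out_depth)
                                     const Eigen::MatrixXf& w) {  // (in_depth, out_depth)
  return dy * w.transpose();                                      // (N*H*W, in_depth)
}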
+ compatible_input_shape = ShapeFromFormat( + data_format, dims.batch_size, + dims.spatial_dims[0].input_size + rows_odd, + dims.spatial_dims[1].input_size + cols_odd, dims.in_depth); + } else { + compatible_input_shape = input_shape; + } + + CHECK(padding_rows >= 0 && padding_cols >= 0) + << "Negative row or col paddings: (" << padding_rows << ", " + << padding_cols << ")"; + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(dims.batch_size) + .set_height(GetTensorDim(compatible_input_shape, data_format, 'H')) + .set_width(GetTensorDim(compatible_input_shape, data_format, 'W')) + .set_feature_map_count(dims.in_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(dims.batch_size) + .set_height(dims.spatial_dims[0].output_size) + .set_width(dims.spatial_dims[1].output_size) + .set_feature_map_count(dims.out_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(dims.spatial_dims[0].filter_size) + .set_input_filter_width(dims.spatial_dims[1].filter_size) + .set_input_feature_map_count(dims.in_depth) + .set_output_feature_map_count(dims.out_depth); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride) + .set_horizontal_filter_stride(dims.spatial_dims[1].stride) + .set_zero_padding_height(padding_rows / 2) + .set_zero_padding_width(padding_cols / 2); + + // NOTE(keveman): + // cuDNN only supports the following layouts : + // Input : B x D x R x C + // Filter : OD x ID x R x C + // Whereas, we have + // Input : B x R x C x D + // Filter : R x C x ID x OD + // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) + // The first TransformDepth performs + // (B x R x C x D) => (B x D x R x C). + // Since the tensor returned from cuDNN is B x D x R x C also, + // the second TransformDepth performs + // (B x D x R x C) => (B x R x C x D). + Tensor transformed_filter; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DataTypeToEnum::value, + TensorShape({dims.out_depth, dims.in_depth, + dims.spatial_dims[0].filter_size, + dims.spatial_dims[1].filter_size}), + &transformed_filter)); + + functor::TransformFilter()( + ctx->eigen_device(), To32Bit(filter.tensor()), + To32Bit(transformed_filter.tensor())); + + Tensor transformed_out_backprop; + if (data_format == FORMAT_NHWC) { + TensorShape nchw_shape = ShapeFromFormat( + FORMAT_NCHW, dims.batch_size, dims.spatial_dims[0].output_size, + dims.spatial_dims[1].output_size, dims.out_depth); + if (dims.out_depth > 1) { + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::value, nchw_shape, + &transformed_out_backprop)); + functor::NHWCToNCHW()( + ctx->eigen_device(), out_backprop.tensor(), + transformed_out_backprop.tensor()); + } else { + // If depth <= 1, then just reshape. 
+ CHECK(transformed_out_backprop.CopyFrom(out_backprop, nchw_shape)); + } + } else { + transformed_out_backprop = out_backprop; + } + + Tensor pre_transformed_in_backprop; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat( + FORMAT_NCHW, + GetTensorDim(compatible_input_shape, data_format, 'N'), + GetTensorDim(compatible_input_shape, data_format, 'H'), + GetTensorDim(compatible_input_shape, data_format, 'W'), + GetTensorDim(compatible_input_shape, data_format, 'C')), + &pre_transformed_in_backprop)); + + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat().data(), + transformed_out_backprop.template flat().size()); + auto filter_ptr = + AsDeviceMemory(transformed_filter.template flat().data(), + transformed_filter.template flat().size()); + auto in_backprop_ptr = + AsDeviceMemory(pre_transformed_in_backprop.template flat().data(), + pre_transformed_in_backprop.template flat().size()); + + static int64 ConvolveBackwardDataScratchSize = GetCudnnWorkspaceLimit( + "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB by default + ); + CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx); + int device_id = stream->parent()->device_ordinal(); + DataType dtype = out_backprop.dtype(); + ConvParameters conv_parameters = { + dims.batch_size, // batch + dims.in_depth, // in_depths + {{input_desc.height(), // in_rows + input_desc.width()}}, // in_cols + dims.out_depth, // out_depths + {{dims.spatial_dims[0].filter_size, // filter_rows + dims.spatial_dims[1].filter_size}}, // filter_cols + {{dims.spatial_dims[0].stride, // stride_rows + dims.spatial_dims[1].stride}}, // stride_cols + {{padding_rows, // padding_rows + padding_cols}}, // padding_cols + dtype, // tensor data type + device_id, // device_id + }; + AlgorithmConfig algorithm_config; + if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find( + conv_parameters, &algorithm_config)) { + std::vector algorithms; + CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms( + conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), &algorithms)); + ProfileResult best_result; + ProfileResult best_result_no_scratch; + for (auto profile_algorithm : algorithms) { + // TODO(zhengxq): profile each algorithm multiple times to better + // accuracy. 
+ CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, + ctx); + ProfileResult profile_result; + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardDataWithAlgorithm( + filter_desc, filter_ptr, output_desc, out_backprop_ptr, + conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator, + AlgorithmConfig(profile_algorithm), &profile_result) + .ok(); + if (cudnn_launch_status) { + if (profile_result.is_valid()) { + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + if (scratch_allocator.TotalByteSize() == 0 && + profile_result.elapsed_time_in_ms() < + best_result_no_scratch.elapsed_time_in_ms()) { + best_result_no_scratch = profile_result; + } + } + } + } + OP_REQUIRES(ctx, + best_result.is_valid() || best_result_no_scratch.is_valid(), + errors::NotFound("No algorithm worked!")); + if (best_result.is_valid()) { + algorithm_config.set_algorithm(best_result.algorithm()); + } + if (best_result_no_scratch.is_valid()) { + algorithm_config.set_algorithm_no_scratch( + best_result_no_scratch.algorithm()); + } + AutoTuneConvBwdData::GetInstance()->Insert(conv_parameters, + algorithm_config); + } + bool cudnn_launch_status = + stream + ->ThenConvolveBackwardDataWithAlgorithm( + filter_desc, filter_ptr, output_desc, out_backprop_ptr, conv_desc, + input_desc, &in_backprop_ptr, &scratch_allocator, + algorithm_config, nullptr) + .ok(); + + if (!cudnn_launch_status) { + ctx->SetStatus(errors::Internal( + "cuDNN Backward Data function launch failure : input shape(", + input_shape.DebugString(), ") filter shape(", + filter_shape.DebugString(), ")")); + return; + } + + if (rows_odd || cols_odd) { + Tensor in_backprop_remove_padding; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat(FORMAT_NCHW, + GetTensorDim(input_shape, data_format, 'N'), + GetTensorDim(input_shape, data_format, 'H'), + GetTensorDim(input_shape, data_format, 'W'), + GetTensorDim(input_shape, data_format, 'C')), + &in_backprop_remove_padding)); + + // Remove the padding for odd rows or cols. + functor::PadInput()( + ctx->template eigen_device(), + To32Bit(const_cast(pre_transformed_in_backprop) + .tensor()), + {{0, 0}}, {{-rows_odd, -cols_odd}}, + To32Bit(in_backprop_remove_padding.tensor()), FORMAT_NCHW); + + pre_transformed_in_backprop = in_backprop_remove_padding; + } + + if (data_format == FORMAT_NHWC) { + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::NCHWToNHWC()( + ctx->eigen_device(), + toConstTensor(pre_transformed_in_backprop).template tensor(), + in_backprop->tensor()); + } else { + *in_backprop = pre_transformed_in_backprop; + } +} + // Forward declarations of the functor specializations for GPU. namespace functor { #define DECLARE_GPU_SPEC(T) \ diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h index 3ea9510afba..2926bb3a867 100644 --- a/tensorflow/core/kernels/conv_grad_ops.h +++ b/tensorflow/core/kernels/conv_grad_ops.h @@ -168,6 +168,43 @@ limitations under the License. namespace tensorflow { +// Forward declaration. 
+class OpKernelContext; + +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& filter, + int row_stride, int col_stride, const Padding& padding, + Tensor* in_backprop, TensorFormat data_format); +}; + +template +struct LaunchConv2DBackpropFilterOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, + int row_stride, int col_stride, const Padding& padding, + Tensor* filter_backprop, TensorFormat data_format); +}; + +#ifdef GOOGLE_CUDA +template +struct LaunchConv2DBackpropInputOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* output, + TensorFormat data_format); +}; + +template +struct LaunchConv2DBackpropFilterOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& out_backprop, const Tensor& input, + int row_stride, int col_stride, const Padding& padding, + Tensor* filter_backprop, TensorFormat data_format); +}; +#endif // GOOGLE_CUDA + // Information about a single spatial dimension for a convolution // backpropagation. struct ConvBackpropSpatialDimension { diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index 2c77a389527..bbb9e36fc9d 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -58,10 +58,10 @@ typedef Eigen::GpuDevice GPUDevice; namespace { template struct LaunchGeneric { - static void launch(OpKernelContext* ctx, const Tensor& input, - const Tensor& filter, int row_stride, int col_stride, - const Eigen::PaddingType& padding, Tensor* output, - TensorFormat data_format) { + void operator()(OpKernelContext* ctx, const Tensor& input, + const Tensor& filter, int row_stride, int col_stride, + const Padding& padding, Tensor* output, + TensorFormat data_format) { CHECK(data_format == FORMAT_NHWC) << "Generic conv implementation only " "supports NHWC tensor format for now."; if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 && @@ -86,8 +86,7 @@ struct LaunchGeneric { filter.shaped({filter.dim_size(2), filter.dim_size(3)}), dim_pair); } else if (filter.dim_size(0) == input.dim_size(1) && - filter.dim_size(1) == input.dim_size(2) && - padding == Eigen::PADDING_VALID) { + filter.dim_size(1) == input.dim_size(2) && padding == VALID) { // If the input data and filter have the same height/width, // the 2D convolution is reduced to matrix multiplication. const int k = // Length of reduction dimension. 
@@ -104,28 +103,26 @@ struct LaunchGeneric { functor::SpatialConvolution()( ctx->eigen_device(), output->tensor(), input.tensor(), filter.tensor(), row_stride, col_stride, - padding); + BrainPadding2EigenPadding(padding)); } } }; } // namespace template -class LaunchConv2DOp { - public: - void launch(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, - const Tensor& input, const Tensor& filter, int row_stride, - int col_stride, const Eigen::PaddingType& padding, Tensor* output, - TensorFormat data_format) { +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* output, + TensorFormat data_format) { if (data_format != FORMAT_NHWC) { ctx->SetStatus( errors::Unimplemented("Generic conv implementation only supports " "NHWC tensor format for now.")); return; } - LaunchGeneric::launch(ctx, input, filter, row_stride, - col_stride, padding, output, - data_format); + LaunchGeneric()(ctx, input, filter, row_stride, col_stride, + padding, output, data_format); } }; @@ -387,9 +384,8 @@ class Conv2DOp : public BinaryOp { return; } - launcher_.launch(context, use_cudnn_, cudnn_use_autotune_, input, filter, - stride_rows, stride_cols, - BrainPadding2EigenPadding(padding_), output, data_format_); + launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter, + stride_rows, stride_cols, padding_, output, data_format_); } private: @@ -445,10 +441,10 @@ typedef AutoTuneSingleton -void LaunchConv2DOp::launch( +void LaunchConv2DOp::operator()( OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, const Tensor& input_param, const Tensor& filter, int row_stride, - int col_stride, const Eigen::PaddingType& padding, Tensor* output, + int col_stride, const Padding& padding, Tensor* output, TensorFormat data_format) { using perftools::gputools::dnn::AlgorithmConfig; using perftools::gputools::dnn::AlgorithmType; @@ -492,8 +488,8 @@ void LaunchConv2DOp::launch( } return; } else if (filter.dim_size(0) == input.dim_size(1) && - filter.dim_size(1) == input.dim_size(2) && - padding == Eigen::PADDING_VALID && data_format == FORMAT_NHWC) { + filter.dim_size(1) == input.dim_size(2) && padding == VALID && + data_format == FORMAT_NHWC) { // The input data and filter have the same height/width, so call cublas // directly. 
const uint64 m = input.dim_size(0); @@ -533,7 +529,7 @@ void LaunchConv2DOp::launch( const int64 out_depths = GetTensorDim(*output, data_format, 'C'); const int64 patch_rows = filter.dim_size(0); const int64 patch_cols = filter.dim_size(1); - if (padding == Eigen::PADDING_SAME) { + if (padding == SAME) { // Total padding on rows and cols is // Pr = (R' - 1) * S + Kr - R // Pc = (C' - 1) * S + Kc - C diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h index 60091fc27fd..e29271dff27 100644 --- a/tensorflow/core/kernels/conv_ops.h +++ b/tensorflow/core/kernels/conv_ops.h @@ -32,14 +32,23 @@ namespace tensorflow { class OpKernelContext; template -class LaunchConv2DOp { - public: - void launch(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, - const Tensor& input, const Tensor& filter, int row_stride, - int col_stride, const Eigen::PaddingType& padding, Tensor* output, - TensorFormat data_format); +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* output, + TensorFormat data_format); }; +#ifdef GOOGLE_CUDA +template +struct LaunchConv2DOp { + void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, + const Tensor& input, const Tensor& filter, int row_stride, + int col_stride, const Padding& padding, Tensor* output, + TensorFormat data_format); +}; +#endif // GOOGLE_CUDA + // Used to keep track of persistent memory buffers used within the op. // It uses malloc and free to avoid the time cost of initializing the memory. template @@ -55,17 +64,6 @@ struct Im2ColBufferResource : public ResourceBase { string DebugString() { return "Im2ColBufferResource"; } }; -#ifdef GOOGLE_CUDA -template -class LaunchConv2DOp { - public: - void launch(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, - const Tensor& input, const Tensor& filter, int row_stride, - int col_stride, const Eigen::PaddingType& padding, Tensor* output, - TensorFormat data_format); -}; -#endif // GOOGLE_CUDA - } // namespace tensorflow #endif // TENSORFLOW_KERNELS_CONV_OPS_H diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc index a1f60019141..fb03adb7a53 100644 --- a/tensorflow/core/kernels/ctc_loss_op.cc +++ b/tensorflow/core/kernels/ctc_loss_op.cc @@ -91,7 +91,14 @@ class CTCLossOp : public OpKernel { OP_REQUIRES(ctx, batch_size != 0, errors::InvalidArgument("batch_size must not be 0")); - TensorShape labels_shape({batch_size, max_time}); + // Figure out the maximum label length to use as sparse tensor dimension. 
+ auto labels_indices_t = labels_indices->matrix(); + int64 max_label_len = 0; + for (int i = 0; i < labels_indices->dim_size(0); i++) { + max_label_len = std::max(max_label_len, labels_indices_t(i, 1) + 1); + } + + TensorShape labels_shape({batch_size, max_label_len}); std::vector order{0, 1}; sparse::SparseTensor labels_sp(*labels_indices, *labels_values, labels_shape, order); diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc index 3a8ccfe6b74..5c6b5eec829 100644 --- a/tensorflow/core/kernels/cuda_solvers.cc +++ b/tensorflow/core/kernels/cuda_solvers.cc @@ -30,10 +30,13 @@ #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/cuda.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/types.h" +using ::perftools::gputools::cuda::ScopedActivateExecutorContext; + namespace tensorflow { namespace { @@ -148,7 +151,12 @@ Status CudaSolver::CopyLapackInfoToHostAsync( // This callback checks that all batch items in all calls were processed // successfully and passes status to the info_checker_callback accordingly. auto wrapped_info_checker_callback = - [info_checker_callback](std::vector host_lapack_infos) { + [](OpKernelContext* context, + std::function&)> + info_checker_callback, + std::vector host_lapack_infos) { + auto stream = context->op_device_context()->stream(); + ScopedActivateExecutorContext scoped_activation{stream->parent()}; Status status; for (const auto& host_lapack_info : host_lapack_infos) { for (int i = 0; i < host_lapack_info.size() && status.ok(); ++i) { @@ -166,8 +174,10 @@ Status CudaSolver::CopyLapackInfoToHostAsync( } info_checker_callback(status, host_lapack_infos); }; + auto cb = - std::bind(wrapped_info_checker_callback, std::move(host_lapack_infos)); + std::bind(wrapped_info_checker_callback, context_, + std::move(info_checker_callback), std::move(host_lapack_infos)); auto stream = context_->op_device_context()->stream(); context_->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( stream, std::move(cb)); diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc index eb173c7040d..6adaecba04b 100644 --- a/tensorflow/core/kernels/cwise_op_sub.cc +++ b/tensorflow/core/kernels/cwise_op_sub.cc @@ -18,7 +18,10 @@ limitations under the License. namespace tensorflow { REGISTER7(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, int32, int64, complex64, complex128); -#if defined(__ANDROID_TYPES_SLIM__) +#if !defined(__ANDROID_TYPES_SLIM__) +// Sub op for int8, uint8, int16, uint16 +REGISTER4(BinaryOp, CPU, "Sub", functor::sub, int8, uint8, int16, uint16); +#else // We only register the first type when we have multi-argument calls in the // case where we're trying to reduce executable size, but it turns out that the // int32 version of this op is needed, so explicitly include it. 
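The ctc_loss_op.cc change above sizes the dense label tensor from the sparse indices instead of assuming max_time: the second column of labels_indices is the position of a label within its sequence, so the longest sequence is the largest such position plus one. A stand-alone illustration of that scan (plain vectors in place of a Tensor; names are hypothetical):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Each entry is (batch index, position of the label within that sequence).
int64_t MaxLabelLength(const std::vector<std::pair<int64_t, int64_t>>& indices) {
  int64_t max_len = 0;
  for (const auto& ix : indices) {
    max_len = std::max(max_len, ix.second + 1);
  }
  return max_len;
}
// e.g. {{0,0},{0,1},{1,0},{1,1},{1,2}} yields 3, so labels_shape becomes
// {batch_size, 3} rather than {batch_size, max_time}.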
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc index 89bcbc9c373..37c94865942 100644 --- a/tensorflow/core/kernels/debug_ops_test.cc +++ b/tensorflow/core/kernels/debug_ops_test.cc @@ -573,7 +573,8 @@ TEST_F(DebugNumericSummaryOpTest, UInt8Success) { TEST_F(DebugNumericSummaryOpTest, BoolSuccess) { TF_ASSERT_OK(Init(DT_BOOL)); - AddInputFromArray(TensorShape({2, 3}), {0, 0, 1, 1, 1, 0}); + AddInputFromArray(TensorShape({2, 3}), + {false, false, true, true, true, false}); TF_ASSERT_OK(RunOpKernel()); Tensor expected(allocator(), DT_DOUBLE, TensorShape({16})); diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index 00d7f564082..9804d7d38e1 100644 --- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -361,19 +361,15 @@ static void ComputeBackpropInput(const DepthwiseArgs& args, } } -// Kernels to compute the input backprop for depthwise convolution. -template -struct LaunchDepthwiseConvBackpropInputOp; - // Computes the depthwise conv2d backprop input of 'out_backprop' by // 'depthwise_filter' and stores the result in 'in_backprop'. template struct LaunchDepthwiseConvBackpropInputOp { typedef typename Eigen::internal::packet_traits::type Packet; - static void launch(OpKernelContext* ctx, const DepthwiseArgs& args, - const T* out_backprop, const T* depthwise_filter, - T* in_backprop, TensorFormat data_format) { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* depthwise_filter, + T* in_backprop, TensorFormat data_format) { OP_REQUIRES( ctx, data_format == FORMAT_NHWC, errors::Unimplemented( @@ -514,27 +510,8 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args, #if GOOGLE_CUDA -template -struct DepthwiseConv2dBackpropInputGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, - const T* out_backprop, const T* filter, T* in_backprop, - TensorFormat data_format); -}; - -template -struct LaunchDepthwiseConvBackpropInputOp { - static void launch(OpKernelContext* ctx, const DepthwiseArgs args, - const T* out_backprop, const T* filter, T* in_backprop, - TensorFormat data_format) { - const GPUDevice& d = ctx->eigen_device(); - DepthwiseConv2dBackpropInputGPULaunch().Run( - d, args, out_backprop, filter, in_backprop, data_format); - auto stream = ctx->op_device_context()->stream(); - OP_REQUIRES(ctx, stream->ok(), errors::Internal("Launch of gpu kernel for " - "DepthwiseConv2dBackpropInp" - "utGPULaunch failed")); - } -}; +extern template struct LaunchDepthwiseConvBackpropInputOp; +extern template struct LaunchDepthwiseConvBackpropInputOp; #endif // GOOGLE_CUDA @@ -598,7 +575,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel { if (input_shape.num_elements() == 0) { return; } - LaunchDepthwiseConvBackpropInputOp::launch( + LaunchDepthwiseConvBackpropInputOp()( context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr, data_format_); } @@ -744,9 +721,9 @@ template struct LaunchDepthwiseConvBackpropFilterOp { typedef typename Eigen::internal::packet_traits::type Packet; - static void launch(OpKernelContext* ctx, const DepthwiseArgs& args, - const T* out_backprop, const T* input, T* filter_backprop, - TensorFormat data_format) { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format) { OP_REQUIRES( ctx, data_format == 
FORMAT_NHWC, errors::Unimplemented( @@ -907,35 +884,8 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args, #if GOOGLE_CUDA -template -struct DepthwiseConv2dBackpropFilterGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, - const T* out_backprop, const T* input, T* filter_backprop, - TensorFormat data_format); -}; - -template -struct LaunchDepthwiseConvBackpropFilterOp { - static void launch(OpKernelContext* ctx, const DepthwiseArgs args, - const T* out_backprop, const T* input, T* filter_backprop, - TensorFormat data_format) { - const GPUDevice& d = ctx->eigen_device(); - auto stream = ctx->op_device_context()->stream(); - - // Initialize the results to 0. - int num_filter_backprop = - args.filter_rows * args.filter_cols * args.out_depth; - perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop, - num_filter_backprop); - stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T)); - - DepthwiseConv2dBackpropFilterGPULaunch().Run( - d, args, out_backprop, input, filter_backprop, data_format); - OP_REQUIRES(ctx, stream->ok(), errors::Internal("Launch of gpu kernel for " - "DepthwiseConv2dBackpropFil" - "terGPULaunch failed")); - } -}; +extern template struct LaunchDepthwiseConvBackpropFilterOp; +extern template struct LaunchDepthwiseConvBackpropFilterOp; #endif // GOOGLE_CUDA @@ -1001,7 +951,7 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel { if (filter_shape.num_elements() == 0) { return; } - LaunchDepthwiseConvBackpropFilterOp::launch( + LaunchDepthwiseConvBackpropFilterOp()( context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr, data_format_); } diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index 3c01546d8d7..bbeeaf78954 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -54,9 +54,6 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; -template -struct LaunchDepthwiseConvOp; - // Computes the vectorized product of 'input_buffer' and 'filter' and stores // result in 'output' at location specified by 'out_r' and 'out_c'. // @@ -156,9 +153,9 @@ template struct LaunchDepthwiseConvOp { typedef typename Eigen::internal::packet_traits::type Packet; - static void launch(OpKernelContext* ctx, const DepthwiseArgs& args, - const T* input, const T* depthwise_filter, T* output, - TensorFormat data_format) { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* input, const T* depthwise_filter, T* output, + TensorFormat data_format) { OP_REQUIRES( ctx, data_format == FORMAT_NHWC, errors::Unimplemented( @@ -248,27 +245,9 @@ extern template class LaunchConv2DOp; #if GOOGLE_CUDA -template -struct DepthwiseConv2dGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, const T* input, - const T* filter, T* output, TensorFormat data_format); -}; - -template -struct LaunchDepthwiseConvOp { - static void launch(OpKernelContext* ctx, const DepthwiseArgs args, - const T* input, const T* filter, T* output, - TensorFormat data_format) { - const GPUDevice& d = ctx->eigen_device(); - DepthwiseConv2dGPULaunch().Run(d, args, input, filter, output, - data_format); - auto stream = ctx->op_device_context()->stream(); - OP_REQUIRES( - ctx, stream->ok(), - errors::Internal( - "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed")); - } -}; +// Extern template instantiated in depthwise_conv_op_gpu.cc. 
+extern template struct LaunchDepthwiseConvOp; +extern template struct LaunchDepthwiseConvOp; // Extern template instantiated in conv_ops.cc. extern template class LaunchConv2DOp; @@ -393,9 +372,8 @@ class DepthwiseConv2dNativeOp : public BinaryOp { // If in_depth==1, this operation is just a standard convolution, so // invoke that op. if (std::is_same::value && in_depth == 1) { - launcher_.launch(context, use_cudnn_, cudnn_use_autotune_, input, filter, - stride_, stride_, BrainPadding2EigenPadding(padding_), - output, data_format_); + launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter, + stride_, stride_, padding_, output, data_format_); return; } @@ -417,8 +395,8 @@ class DepthwiseConv2dNativeOp : public BinaryOp { auto input_ptr = input.template flat().data(); auto filter_ptr = filter.template flat().data(); auto output_ptr = output->template flat().data(); - LaunchDepthwiseConvOp::launch( - context, args, input_ptr, filter_ptr, output_ptr, data_format_); + LaunchDepthwiseConvOp()(context, args, input_ptr, filter_ptr, + output_ptr, data_format_); } private: diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h index 1960b02bbea..aa5b5c76f6a 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.h +++ b/tensorflow/core/kernels/depthwise_conv_op.h @@ -56,6 +56,53 @@ struct DepthwiseArgs { out_depth(0) {} }; +// Forward declaration. +class OpKernelContext; + +template +struct LaunchDepthwiseConvOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* input, const T* filter, T* output, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropInputOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* filter, T* in_backprop, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropFilterOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format); +}; + +#if GOOGLE_CUDA +template +struct LaunchDepthwiseConvOp { + void operator()(OpKernelContext* ctx, const DepthwiseArgs args, + const T* input, const T* filter, T* output, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropInputOp { + void operator()(class OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* filter, T* in_backprop, + TensorFormat data_format); +}; + +template +struct LaunchDepthwiseConvBackpropFilterOp { + void operator()(class OpKernelContext* ctx, const DepthwiseArgs& args, + const T* out_backprop, const T* input, T* filter_backprop, + TensorFormat data_format); +}; +#endif + } // namespace tensorflow namespace tensorflow { diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index f63a99a7308..fcfcd188d2d 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -17,6 +17,7 @@ limitations under the License. 
#define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/cuda_kernel_helper.h" @@ -689,21 +690,27 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args, // A simple launch pad to launch the Cuda kernel for depthwise convolution. template -struct DepthwiseConv2dGPULaunch { - static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* input, - const T* filter, T* output, TensorFormat data_format) { - if (args.filter_rows == 3 && args.filter_cols == 3) { - LaunchDepthwiseConv2dGPU(d, args, input, filter, output, +void LaunchDepthwiseConvOp::operator()(OpKernelContext* ctx, + const DepthwiseArgs args, + const T* input, + const T* filter, T* output, + TensorFormat data_format) { + const GPUDevice& d = ctx->eigen_device(); + if (args.filter_rows == 3 && args.filter_cols == 3) { + LaunchDepthwiseConv2dGPU(d, args, input, filter, output, + data_format); + } else { + LaunchDepthwiseConv2dGPU(d, args, input, filter, output, data_format); - } else { - LaunchDepthwiseConv2dGPU(d, args, input, filter, output, - data_format); - } } -}; + auto stream = ctx->op_device_context()->stream(); + OP_REQUIRES(ctx, stream->ok(), + errors::Internal( + "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed")); +} -template struct DepthwiseConv2dGPULaunch; -template struct DepthwiseConv2dGPULaunch; +template struct LaunchDepthwiseConvOp; +template struct LaunchDepthwiseConvOp; // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input. template -struct DepthwiseConv2dBackpropInputGPULaunch { - static void Run(const GpuDevice& d, const DepthwiseArgs args, - const T* out_backprop, const T* filter, T* in_backprop, - TensorFormat data_format) { - if (args.filter_rows == 3 && args.filter_cols == 3) { - LaunchDepthwiseConv2dBackpropInputGPU( - d, args, out_backprop, filter, in_backprop, data_format); - } else { - LaunchDepthwiseConv2dBackpropInputGPU( - d, args, out_backprop, filter, in_backprop, data_format); - } +void LaunchDepthwiseConvBackpropInputOp::operator()( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* filter, T* in_backprop, TensorFormat data_format) { + const GPUDevice& d = ctx->eigen_device(); + if (args.filter_rows == 3 && args.filter_cols == 3) { + LaunchDepthwiseConv2dBackpropInputGPU( + d, args, out_backprop, filter, in_backprop, data_format); + } else { + LaunchDepthwiseConv2dBackpropInputGPU( + d, args, out_backprop, filter, in_backprop, data_format); } -}; + auto stream = ctx->op_device_context()->stream(); + OP_REQUIRES(ctx, stream->ok(), + errors::Internal("Launch of gpu kernel for " + "DepthwiseConv2dBackpropInp" + "utGPULaunch failed")); +} -template struct DepthwiseConv2dBackpropInputGPULaunch; -template struct DepthwiseConv2dBackpropInputGPULaunch; +template struct LaunchDepthwiseConvBackpropInputOp; +template struct LaunchDepthwiseConvBackpropInputOp; // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 
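Illustrative sketch, not part of the patch: the depthwise convolution changes above replace per-device launcher structs with static launch() methods by a single templated functor; its operator() is defined and explicitly instantiated in depthwise_conv_op_gpu.cu.cc, while depthwise_conv_op.cc and depthwise_conv_grad_op.cc only see extern template declarations. A self-contained sketch of that pattern (hypothetical names, everything collapsed into one file so it compiles alone):

#include <iostream>

// The header would declare only the primary template.
template <typename T>
struct Launch {
  void operator()(const T* in, T* out, int n);
};

// The .cu.cc file would own the definition ...
template <typename T>
void Launch<T>::operator()(const T* in, T* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = in[i] * in[i];
}

// ... while other translation units only see the declaration, which avoids
// duplicate instantiations of the same launcher.
extern template struct Launch<float>;  // what the op .cc files would contain
template struct Launch<float>;         // what the .cu.cc file would contain

int main() {
  float in[3] = {1.f, 2.f, 3.f};
  float out[3];
  Launch<float>()(in, out, 3);  // invoked as a functor, like the ops above
  std::cout << out[2] << "\n";  // 9
  return 0;
}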
template -struct DepthwiseConv2dBackpropFilterGPULaunch { - static void Run(const GpuDevice& d, const DepthwiseArgs args, - const T* out_backprop, const T* input, T* filter_backprop, - TensorFormat data_format) { - if (args.filter_rows == 3 && args.filter_cols == 3) { - LaunchDepthwiseConv2dBackpropFilterGPU( - d, args, out_backprop, input, filter_backprop, data_format); - } else { - LaunchDepthwiseConv2dBackpropFilterGPU( - d, args, out_backprop, input, filter_backprop, data_format); - } - } -}; +void LaunchDepthwiseConvBackpropFilterOp::operator()( + OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, + const T* input, T* filter_backprop, TensorFormat data_format) { + const GPUDevice& d = ctx->eigen_device(); + auto stream = ctx->op_device_context()->stream(); -template struct DepthwiseConv2dBackpropFilterGPULaunch; -template struct DepthwiseConv2dBackpropFilterGPULaunch; + // Initialize the results to 0. + int num_filter_backprop = + args.filter_rows * args.filter_cols * args.out_depth; + perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop, + num_filter_backprop); + stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T)); + + if (args.filter_rows == 3 && args.filter_cols == 3) { + LaunchDepthwiseConv2dBackpropFilterGPU( + d, args, out_backprop, input, filter_backprop, data_format); + } else { + LaunchDepthwiseConv2dBackpropFilterGPU( + d, args, out_backprop, input, filter_backprop, data_format); + } + OP_REQUIRES(ctx, stream->ok(), + errors::Internal("Launch of gpu kernel for " + "DepthwiseConv2dBackpropFil" + "terGPULaunch failed")); +} + +template struct LaunchDepthwiseConvBackpropFilterOp; +template struct LaunchDepthwiseConvBackpropFilterOp; } // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/group_by_window_dataset_op.cc b/tensorflow/core/kernels/group_by_window_dataset_op.cc index a53e9456ad2..a4f9608b1fa 100644 --- a/tensorflow/core/kernels/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/group_by_window_dataset_op.cc @@ -36,20 +36,14 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { graph_def_version_(ctx->graph_def_version()) { OP_REQUIRES_OK(ctx, ctx->GetAttr("key_func", &key_func_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("reduce_func", &reduce_func_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("window_size_func", &window_size_func_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); } void MakeDataset(OpKernelContext* ctx, DatasetBase* input, DatasetBase** output) override { - int64 window_size = 0; - OP_REQUIRES_OK( - ctx, ParseScalarArgument(ctx, "window_size", &window_size)); - OP_REQUIRES( - ctx, window_size > 0, - errors::InvalidArgument("Window size must be greater than zero.")); - - // Get captured inputs for the key and reduce functions. + // Get captured inputs for the key, reduce, and window_size functions. 
OpInputList key_func_other_argument_inputs; OP_REQUIRES_OK(ctx, ctx->input_list("key_func_other_arguments", &key_func_other_argument_inputs)); @@ -67,6 +61,16 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { for (const Tensor& t : reduce_func_other_argument_inputs) { reduce_func_other_arguments.push_back(t); } + OpInputList window_size_func_other_argument_inputs; + OP_REQUIRES_OK(ctx, + ctx->input_list("window_size_func_other_arguments", + &window_size_func_other_argument_inputs)); + std::vector window_size_func_other_arguments; + window_size_func_other_arguments.reserve( + window_size_func_other_argument_inputs.size()); + for (const Tensor& t : window_size_func_other_argument_inputs) { + window_size_func_other_arguments.push_back(t); + } // TODO(mrry): Refactor CapturedFunction to share the runtime // state between multiple functions? std::unique_ptr captured_key_func; @@ -79,24 +83,30 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { ctx, CapturedFunction::Create(ctx, reduce_func_, graph_def_version_, std::move(reduce_func_other_arguments), &captured_reduce_func)); + std::unique_ptr captured_window_size_func; + OP_REQUIRES_OK(ctx, CapturedFunction::Create( + ctx, window_size_func_, graph_def_version_, + std::move(window_size_func_other_arguments), + &captured_window_size_func)); - *output = new Dataset(input, window_size, std::move(captured_key_func), - std::move(captured_reduce_func), output_types_, - output_shapes_); + *output = new Dataset( + input, std::move(captured_key_func), std::move(captured_reduce_func), + std::move(captured_window_size_func), output_types_, output_shapes_); } private: class Dataset : public DatasetBase { public: - Dataset(const DatasetBase* input, int64 window_size, + Dataset(const DatasetBase* input, std::unique_ptr captured_key_func, std::unique_ptr captured_reduce_func, + std::unique_ptr captured_window_size_func, const DataTypeVector& output_types, const std::vector& output_shapes) : input_(input), - window_size_(window_size), captured_key_func_(std::move(captured_key_func)), captured_reduce_func_(std::move(captured_reduce_func)), + captured_window_size_func_(std::move(captured_window_size_func)), output_types_(output_types), output_shapes_(output_shapes) { input_->Ref(); @@ -182,10 +192,44 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { } const int64 key = key_func_output[0].scalar()(); + if (window_sizes_.find(key) == window_sizes_.end()) { + // Run window_size function + FunctionLibraryRuntime::Options opts2; + opts2.step_id = CapturedFunction::generate_step_id(); + opts2.runner = ctx->runner(); + ScopedStepContainer step_container2( + opts2.step_id, [this, ctx](const string& name) { + dataset() + ->captured_window_size_func_->resource_manager() + ->Cleanup(name) + .IgnoreError(); + }); + opts2.step_container = &step_container2; + + // Run the window size function on the key to identify its + // window size. + std::vector window_size_func_output; + TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Run( + opts2, key_func_output, &window_size_func_output)); + + if (window_size_func_output.size() != 1 || + window_size_func_output[0].dtype() != DT_INT64 || + window_size_func_output[0].NumElements() != 1) { + // TODO(mrry): Support non-int64 window sizes. 
+ return errors::InvalidArgument( + "`window_size_func` must return a scalar int64."); + } + const int64 window_size = + window_size_func_output[0].scalar()(); + window_sizes_[key] = window_size; + } + + const int64 window_size = window_sizes_[key]; + std::vector>& group = groups_[key]; group.push_back(std::move(next_input_element)); - if (group.size() == dataset()->window_size_) { + if (group.size() == window_size) { TF_RETURN_IF_ERROR(StartFlushingGroup(ctx, key)); break; } @@ -297,6 +341,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { bool end_of_input_ GUARDED_BY(mu_) = false; std::map>> groups_ GUARDED_BY(mu_); std::unique_ptr current_group_iterator_ GUARDED_BY(mu_); + std::map window_sizes_ GUARDED_BY(mu_); }; // A resource name for the temporary window dataset that is @@ -304,9 +349,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { static constexpr const char* kWindowResourceName = "__window_dataset"; const DatasetBase* const input_; - const int64 window_size_; const std::unique_ptr captured_key_func_; const std::unique_ptr captured_reduce_func_; + const std::unique_ptr captured_window_size_func_; const DataTypeVector output_types_; const std::vector output_shapes_; }; @@ -316,6 +361,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { std::vector output_shapes_; const NameAttrList* key_func_; const NameAttrList* reduce_func_; + const NameAttrList* window_size_func_; }; REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU), diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc index 9875cd027d5..f8ed9351579 100644 --- a/tensorflow/core/kernels/l2loss_op.cc +++ b/tensorflow/core/kernels/l2loss_op.cc @@ -27,10 +27,9 @@ limitations under the License. namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -typedef Eigen::GpuDevice GPUDevice; -template -class L2LossOp : public OpKernel { +template +class L2LossOp : public OpKernel { public: explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -42,8 +41,9 @@ class L2LossOp : public OpKernel { Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - functor::L2Loss()(context->eigen_device(), - input.flat(), output->scalar()); + const CPUDevice& d = context->eigen_device(); + output->scalar().device(d) = + (input.flat().square() * static_cast(0.5)).sum(); } }; @@ -57,33 +57,4 @@ REGISTER_KERNEL(double); REGISTER_KERNEL(Eigen::half); #undef REGISTER_KERNEL -#if GOOGLE_CUDA -// Forward declarations of the functor specializations for GPU. -namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void L2Loss::operator()(const GPUDevice& d, \ - typename TTypes::ConstTensor input, \ - typename TTypes::Scalar output); \ - extern template struct L2Loss; - -DECLARE_GPU_SPEC(float); -DECLARE_GPU_SPEC(double); -DECLARE_GPU_SPEC(Eigen::half); -#undef DECLARE_GPU_SPEC -} // namespace functor - -// Registration of the GPU implementations. 
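Illustrative sketch, not part of the patch: in the GroupByWindowDataset change above, the iterator evaluates window_size_func at most once per key and caches the result in window_sizes_. A standalone sketch of that per-key memoization (names made up for illustration):

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>

// Returns the cached window size for `key`, computing it on first use.
int64_t WindowSizeForKey(std::map<int64_t, int64_t>* cache, int64_t key,
                         const std::function<int64_t(int64_t)>& window_size_fn) {
  auto it = cache->find(key);
  if (it == cache->end()) {
    it = cache->emplace(key, window_size_fn(key)).first;  // run the func once
  }
  return it->second;
}

int main() {
  std::map<int64_t, int64_t> cache;
  auto fn = [](int64_t key) { return key % 2 == 0 ? int64_t{10} : int64_t{5}; };
  std::cout << WindowSizeForKey(&cache, 4, fn) << "\n";  // 10 (computed)
  std::cout << WindowSizeForKey(&cache, 4, fn) << "\n";  // 10 (cached)
  return 0;
}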
-#define REGISTER_GPU_KERNEL(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("L2Loss").Device(DEVICE_GPU).TypeConstraint("T"), \ - L2LossOp); - -REGISTER_GPU_KERNEL(float); -REGISTER_GPU_KERNEL(double); -REGISTER_GPU_KERNEL(Eigen::half); -#undef REGISTER_GPU_KERNEL - -#endif // GOOGLE_CUDA - } // namespace tensorflow diff --git a/tensorflow/core/kernels/l2loss_op.h b/tensorflow/core/kernels/l2loss_op.h index f7204cefdd4..4953aa237cd 100644 --- a/tensorflow/core/kernels/l2loss_op.h +++ b/tensorflow/core/kernels/l2loss_op.h @@ -15,25 +15,19 @@ limitations under the License. #ifndef TENSORFLOW_KERNELS_L2LOSS_OP_H_ #define TENSORFLOW_KERNELS_L2LOSS_OP_H_ -// Functor definition for L2LossOp, must be compilable by nvcc. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" namespace tensorflow { -namespace functor { -// Functor used by L2LossOp to do the computations. template -struct L2Loss { - void operator()(const Device& d, typename TTypes::ConstTensor input, - typename TTypes::Scalar output) { - // We flatten the input tensor and reduce on dimension 0, producing - // a single number which is Mul(Sum(x^2), 0.5). - output.device(d) = (input.square() * static_cast(0.5)).sum(); - } +struct L2LossOp : public OpKernel { + explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) {} }; -} // namespace functor } // namespace tensorflow #endif // TENSORFLOW_KERNELS_L2LOSS_OP_H_ diff --git a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc index 420df370865..73b6472254c 100644 --- a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc +++ b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc @@ -21,12 +21,55 @@ limitations under the License. #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/reduction_ops_common.h" +#include "tensorflow/core/kernels/reduction_ops_gpu_kernels.h" + namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; -template struct functor::L2Loss; -template struct functor::L2Loss; -template struct functor::L2Loss; + +// TODO(eriche): can add specialization for half2 +template +struct squareHalf { + __host__ __device__ T operator()(const T& x) const { + return static_cast(0.5) * x * x; + } +}; + +template +class L2LossOp : public OpKernel { + public: + explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // The input tensor can be of any number of dimensions, even though it's + // 2D in most typical applications. + const Tensor& input = context->input(0); + // The output is a single number. + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + typedef cub::TransformInputIterator, T*> inputIterType; + inputIterType input_itr((T*)input.flat().data(), squareHalf()); + typedef const Eigen::array::Tensor::Index, 1>& ReductionAxes; + + Constants constants; + functor::ReduceImpl( + context, (T*)output->flat().data(), input_itr, 1, + input.flat().size(), 1, 1, 0, constants.kZero, cub::Sum(), T(0)); + } +}; + +// Registration of the GPU implementations. 
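Illustrative sketch, not part of the patch: the L2Loss rewrite above has the CPU kernel evaluate 0.5 * sum(x^2) directly as an Eigen tensor expression, while the GPU kernel feeds a squaring transform iterator into a cub sum reduction. A standalone Eigen analogue of the CPU expression:

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> input(4);
  input.setValues({1.f, 2.f, 3.f, 4.f});

  // Same shape of expression as the kernel: square, scale by 0.5, reduce.
  Eigen::Tensor<float, 0> loss = (input.square() * 0.5f).sum();
  std::cout << loss() << "\n";  // 0.5 * (1 + 4 + 9 + 16) = 15
  return 0;
}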
+#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("L2Loss").Device(DEVICE_GPU).TypeConstraint("T"), \ + L2LossOp); + +REGISTER_GPU_KERNEL(float); +REGISTER_GPU_KERNEL(double); +REGISTER_GPU_KERNEL(Eigen::half); +#undef REGISTER_GPU_KERNEL } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index ef7338e0e0d..50700c8bc8a 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -97,8 +97,12 @@ class MklConv2DCustomBackpropInputOp : public OpKernel { errors::InvalidArgument( "Conv2DCustomBackpropInput: size must be 4-dim")); - MklSizesToTFSizes(context, data_format, mkl_context.filter_shape, - &filter_shape); + const int64* filter_sizes = + (const int64*) mkl_context.filter_shape.GetSizes(); + const int64 filter_dims = mkl_context.filter_shape.GetDimension(); + + OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(filter_sizes, + filter_dims, &filter_shape)); } else { filter_shape = filter.shape(); } diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 203e6946314..b50a6343ba9 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -265,6 +265,28 @@ class MklConv2DOp : public OpKernel { sizeof(T)); AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape, mkl_output_mkl_shape); + // Filter output to be used in the backprop_input + TensorShape mkl_filter_output_tf_shape; + MklShape mkl_filter_output_mkl_shape; + mkl_filter_output_mkl_shape.SetMklTensor(true); + mkl_filter_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd, + dnnResourceFilter); + + size_t filter_sizes[4] = {filter.dim_size(0), filter.dim_size(1), + filter.dim_size(2), filter.dim_size(3)}; + mkl_filter_output_mkl_shape.SetTfLayout(filter.dims(), filter_sizes, + mkl_context.filter_strides); + + mkl_filter_output_mkl_shape.SetTfDimOrder(mkl_context.filter_dims, + data_format_); + mkl_filter_output_tf_shape.AddDim( + dnnLayoutGetMemorySize_F32( + static_cast( + mkl_filter_output_mkl_shape.GetMklLayout())) / + sizeof(T)); + AllocateOutputSetMklShape(context, 1, &mkl_context.output_filter, + mkl_filter_output_tf_shape, mkl_filter_output_mkl_shape); + mkl_context.conv_res[dnnResourceDst] = static_cast(output->flat().data()); @@ -303,6 +325,7 @@ class MklConv2DOp : public OpKernel { dnnPrimitive_t prim_fwd; void* conv_res[dnnResourceNumber]; dnnLayout_t lt_filter, lt_bias, lt_input; + Tensor* output_filter = nullptr; // Create MKL dnnLayout_t objects for tensors coming into the layer void MklCreateInputLayouts(OpKernelContext* context) { @@ -383,8 +406,8 @@ class MklConv2DOp : public OpKernel { CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, lt_filter, mkl_lt_internal_filter), E_SUCCESS); - AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor, - mkl_lt_internal_filter, &mkl_buf_convert_filter); + mkl_buf_convert_filter = const_cast(static_cast( + output_filter->flat().data())); CHECK_EQ( dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter, mkl_buf_convert_filter), diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc index b3763f17bc1..03c3fb09a1d 100644 --- a/tensorflow/core/kernels/mkl_reshape_op.cc +++ b/tensorflow/core/kernels/mkl_reshape_op.cc @@ -43,30 +43,26 @@ class MklReshapeOp : public OpKernel { OP_REQUIRES(context, IsLegacyVector(sizes.shape()), errors::InvalidArgument("sizes input 
must be 1-D, not shape ", sizes.shape().DebugString())); - const int64 num_dims = sizes.NumElements(); // Compute the output shape. Determine product of specified // dimensions, and find the index of the unspecified one. TensorShape shape; int64 product = 1; int unknown_index = -1; - auto vec_size = sizes.flat(); - for (int d = 0; d < num_dims; ++d) { - const int32 size = vec_size(d); - if (size == -1) { - OP_REQUIRES( - context, unknown_index == -1, - errors::InvalidArgument("only one input size may be -1, not both ", - unknown_index, " and ", d)); - unknown_index = d; - shape.AddDim(1); - } else { - OP_REQUIRES(context, size >= 0, - errors::InvalidArgument( - "size ", d, " must be non-negative, not ", size)); - shape.AddDim(size); - product *= size; - } + switch (sizes.dtype()) { + case DT_INT32: + OP_REQUIRES_OK(context, ValidateSizes(sizes, &product, + &unknown_index, &shape)); + break; + case DT_INT64: + OP_REQUIRES_OK(context, ValidateSizes(sizes, &product, + &unknown_index, &shape)); + break; + default: + context->CtxFailure(errors::InvalidArgument( + "desired shape must be a DT_INT32 or DT_INT64 vector, not a ", + DataTypeString(sizes.dtype()))); + return; } if (unknown_index != -1) { OP_REQUIRES( @@ -132,6 +128,34 @@ class MklReshapeOp : public OpKernel { CopyTfTensorInToOutWithShape(context, 0, 0, shape); } } + private: + template + Status ValidateSizes(const Tensor& sizes, int64* product, int* unknown_index, + TensorShape* shape) { + *product = 1; + *unknown_index = -1; + const int64 num_dims = sizes.NumElements(); + auto Svec = sizes.flat(); + for (int d = 0; d < num_dims; ++d) { + const Tshape size = Svec(d); + if (size == -1) { + if (*unknown_index != -1) { + return errors::InvalidArgument( + "Only one input size may be -1, not both ", *unknown_index, + " and ", d); + } + *unknown_index = d; + shape->AddDim(1); + } else if (size < 0) { + return errors::InvalidArgument("Size ", d, + " must be non-negative, not ", size); + } else { + shape->AddDim(size); + (*product) *= size; + } + } + return Status::OK(); + } }; #define REGISTER_MKL_CPU(T) \ @@ -141,6 +165,13 @@ class MklReshapeOp : public OpKernel { .TypeConstraint("T") \ .TypeConstraint("Tshape") \ .Label(mkl_op_registry::kMklOpLabel), \ + MklReshapeOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklReshape") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint("T") \ + .TypeConstraint("Tshape") \ + .Label(mkl_op_registry::kMklOpLabel), \ MklReshapeOp); TF_CALL_float(REGISTER_MKL_CPU); #undef REGISTER_MKL_CPU diff --git a/tensorflow/core/kernels/ops_util.cc b/tensorflow/core/kernels/ops_util.cc index 130939263be..efacd05dd39 100644 --- a/tensorflow/core/kernels/ops_util.cc +++ b/tensorflow/core/kernels/ops_util.cc @@ -37,11 +37,6 @@ Eigen::PaddingType BrainPadding2EigenPadding(Padding padding) { Status GetBroadcastSize(const int index, const int in_size, const int ksize, const int stride, const int pad_size, int* bindex, int* bsize) { - // Cannot have strides larger than the patch size. - if (stride > ksize) { - return errors::InvalidArgument( - "stride must be less than or equal to kernel size"); - } // Cannot have index beyond the input size. 
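Illustrative sketch, not part of the patch: the MklReshapeOp change above dispatches ValidateSizes<int32> or ValidateSizes<int64> based on the dtype of the shape input, allowing at most one -1 wildcard and accumulating the product of the known dimensions. A standalone sketch of that validation (names made up for illustration):

#include <cstdint>
#include <iostream>
#include <vector>

// Returns false on invalid sizes; otherwise fills the product of the known
// dimensions and the index of the single -1 wildcard (or -1 if none).
template <typename Tshape>
bool ValidateSizes(const std::vector<Tshape>& sizes, int64_t* product,
                   int* unknown_index) {
  *product = 1;
  *unknown_index = -1;
  for (size_t d = 0; d < sizes.size(); ++d) {
    const Tshape size = sizes[d];
    if (size == -1) {
      if (*unknown_index != -1) return false;  // only one -1 may appear
      *unknown_index = static_cast<int>(d);
    } else if (size < 0) {
      return false;  // any other negative size is invalid
    } else {
      *product *= size;
    }
  }
  return true;
}

int main() {
  int64_t product = 0;
  int unknown = 0;
  std::vector<int64_t> sizes = {2, -1, 3};
  std::cout << ValidateSizes(sizes, &product, &unknown) << " " << product
            << " " << unknown << "\n";  // 1 6 1
  return 0;
}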
if (index * stride > in_size) { return errors::InvalidArgument( diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc index 42ffef6735b..9d53882deef 100644 --- a/tensorflow/core/kernels/ops_util_test.cc +++ b/tensorflow/core/kernels/ops_util_test.cc @@ -173,12 +173,6 @@ TEST_F(OpsUtilTest, Get2dOutputSizeVerbose) { VerifyGet2dOutputVerboseSizeValues(pad_struct2, error::OK); } -// Test stride > ksize fails with INVALID_ARGUMENT. -TEST_F(OpsUtilTest, GetBroadcastTest3_1_2_0) { - bcast_struct bcast = {{0, 3, 1, 2, 0}, {0, 3}}; - VerifyBoundaries(bcast, error::INVALID_ARGUMENT); -} - // Test index * stride > in_size fails with INVALID_ARGUMENT. TEST_F(OpsUtilTest, GetBroadcastTestBadIndex) { bcast_struct bcast = {{2, 3, 1, 2, 0}, {0, 3}}; @@ -281,6 +275,38 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_2) { } } +// in_size = 3, ksize = 1, stride = 2, pad_size = 0 +TEST_F(OpsUtilTest, GetBroadcastTest3_1_2_0) { + bcast_struct bcast[] = { + {{0, 3, 1, 2, 0}, {0, 1}}, + {{1, 3, 1, 2, 0}, {2, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 2, stride = 3, pad_size = 0 +TEST_F(OpsUtilTest, GetBroadcastTest3_2_3_0) { + bcast_struct bcast[] = { + {{0, 3, 2, 3, 0}, {0, 2}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 2, stride = 3, pad_size = 1 +TEST_F(OpsUtilTest, GetBroadcastTest3_2_3_1) { + bcast_struct bcast[] = { + {{0, 3, 2, 3, 1}, {0, 1}}, + {{1, 3, 2, 3, 1}, {2, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + TEST_F(OpsUtilTest, SanitizeThreadSuffix) { EXPECT_EQ("_aBc123_-___", SanitizeThreadSuffix("/aBc123_- /")); } diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc index 6e8b09d0500..6196c5ed93e 100644 --- a/tensorflow/core/kernels/pad_op.cc +++ b/tensorflow/core/kernels/pad_op.cc @@ -146,9 +146,9 @@ class PadOp : public OpKernel { Tensor* output) { CHECK_EQ(Dims, paddings.dimension(0)); CHECK_EQ(2, paddings.dimension(1)); - Eigen::array, Dims> paddings_array; + Eigen::array, Dims> paddings_array; for (int i = 0; i < Dims; ++i) { - paddings_array[i] = std::make_pair(paddings(i, 0), paddings(i, 1)); + paddings_array[i] = {paddings(i, 0), paddings(i, 1)}; } functor::Pad functor; functor(context->eigen_device(), output->tensor(), input, @@ -180,7 +180,7 @@ namespace functor { void Pad::operator()( \ const GPUDevice& d, typename TTypes::Tensor output, \ typename TTypes::ConstTensor input, \ - Eigen::array, Dims> paddings, T pad_value); \ + Eigen::array, Dims> paddings, T pad_value); \ extern template struct Pad; #define DECLARE_GPU_SPECS(T) \ diff --git a/tensorflow/core/kernels/pad_op.h b/tensorflow/core/kernels/pad_op.h index 6a973833e2d..95a7c9a3ae5 100644 --- a/tensorflow/core/kernels/pad_op.h +++ b/tensorflow/core/kernels/pad_op.h @@ -31,7 +31,7 @@ struct Pad { // See pad_op.cc for details. void operator()(const Device& d, typename TTypes::Tensor output, typename TTypes::ConstTensor input, - Eigen::array, Dims> paddings, + Eigen::array, Dims> paddings, T pad_value) { if (Eigen::internal::is_same::value && (output.size() <= std::numeric_limits::max())) { @@ -47,7 +47,7 @@ struct Pad { // In the scalar case we simply copy the input. 
void operator()(const Device& d, typename TTypes::Tensor output, typename TTypes::ConstTensor input, - Eigen::array, 0>, T) { + Eigen::array, 0>, T) { output.device(d) = input; } }; diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc index 79199ff5c3f..dd645262d2e 100644 --- a/tensorflow/core/kernels/parse_tensor_op.cc +++ b/tensorflow/core/kernels/parse_tensor_op.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/framework/register_types.h" namespace tensorflow { @@ -65,4 +66,32 @@ class ParseTensorOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("ParseTensor").Device(DEVICE_CPU), ParseTensorOp); + +template +class SerializeTensorOp : public OpKernel { + public: + using OpKernel::OpKernel; + + void Compute(OpKernelContext* context) override { + const Tensor& tensor = context->input(0); + TensorProto proto; + if (tensor.dtype() == DT_STRING) { + tensor.AsProtoField(&proto); + } else { + tensor.AsProtoTensorContent(&proto); + } + Tensor* proto_string = nullptr; + OP_REQUIRES_OK( + context, context->allocate_output(0, TensorShape({}), &proto_string)); + CHECK(proto.SerializeToString(&proto_string->scalar()())); + } +}; + +#define REGISTER(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("SerializeTensor").Device(DEVICE_CPU).TypeConstraint("T"), \ + SerializeTensorOp); +TF_CALL_ALL_TYPES(REGISTER) +#undef REGISTER + } // namespace tensorflow diff --git a/tensorflow/core/kernels/parse_tensor_test.cc b/tensorflow/core/kernels/parse_tensor_test.cc new file mode 100644 index 00000000000..f6f60fee71c --- /dev/null +++ b/tensorflow/core/kernels/parse_tensor_test.cc @@ -0,0 +1,213 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" + +namespace tensorflow { +namespace { + +class SerializeTensorOpTest : public OpsTestBase { + protected: + template + void MakeOp(const TensorShape& input_shape, + std::function functor) { + TF_ASSERT_OK( + NodeDefBuilder("myop", "SerializeTensor") + .Input(FakeInput(DataTypeToEnum::value)) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + AddInput(input_shape, functor); + } + void ParseSerializedWithNodeDef(const NodeDef& parse_node_def, + Tensor* serialized, + Tensor* parse_output) { + std::unique_ptr device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + gtl::InlinedVector inputs; + inputs.push_back({nullptr, serialized}); + Status status; + std::unique_ptr op( + CreateOpKernel(DEVICE_CPU, device.get(), + cpu_allocator(), parse_node_def, + TF_GRAPH_DEF_VERSION, &status)); + TF_EXPECT_OK(status); + OpKernelContext::Params params; + params.device = device.get(); + params.inputs = &inputs; + params.frame_iter = FrameAndIter(0, 0); + params.op_kernel = op.get(); + std::vector attrs; + test::SetOutputAttrs(¶ms, &attrs); + OpKernelContext ctx(¶ms); + op->Compute(&ctx); + TF_EXPECT_OK(status); + *parse_output = *ctx.mutable_output(0); + } + template + void ParseSerializedOutput(Tensor* serialized, Tensor* parse_output) { + NodeDef parse; + TF_ASSERT_OK(NodeDefBuilder("parse", "ParseTensor") + .Input(FakeInput(DT_STRING)) + .Attr("out_type", DataTypeToEnum::value) + .Finalize(&parse)); + ParseSerializedWithNodeDef(parse, serialized, parse_output); + } +}; + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_half) { + MakeOp(TensorShape({10}), [](int x) -> Eigen::half { + return static_cast(x / 10.); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_float) { + MakeOp(TensorShape({1, 10}), [](int x) -> float { + return static_cast(x / 10.); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_double) { + MakeOp(TensorShape({5, 5}), [](int x) -> double { + return static_cast(x / 10.); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int64) { + MakeOp(TensorShape({2, 3, 4}), [](int x) -> int64 { + return static_cast(x - 10); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int32) { + MakeOp(TensorShape({4, 2}), [](int x) -> int32 { + return static_cast(x + 7); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor 
parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int16) { + MakeOp(TensorShape({8}), [](int x) -> int16 { + return static_cast(x + 18); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int8) { + MakeOp(TensorShape({2}), [](int x) -> int8 { + return static_cast(x + 8); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_uint16) { + MakeOp(TensorShape({1, 3}), [](int x) -> uint16 { + return static_cast(x + 2); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_uint8) { + MakeOp(TensorShape({2, 1, 1}), [](int x) -> uint8 { + return static_cast(x + 1); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_complex64) { + MakeOp(TensorShape({}), [](int x) -> complex64 { + return complex64{ static_cast(x / 8.), + static_cast(x / 2.) }; + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_complex128) { + MakeOp(TensorShape({3}), [](int x) -> complex128 { + return complex128{ x / 3., x / 2. }; + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_bool) { + MakeOp(TensorShape({1}), [](int x) -> bool { + return static_cast(x % 2); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_string) { + MakeOp(TensorShape({10}), [](int x) -> std::string { + return std::to_string(x / 10.); + }); + TF_ASSERT_OK(RunOpKernel()); + Tensor parse_output; + ParseSerializedOutput(GetOutput(0), &parse_output); + test::ExpectTensorEqual(parse_output, GetInput(0)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h index 5db9e6032e0..e43d2828f30 100644 --- a/tensorflow/core/kernels/reduction_ops.h +++ b/tensorflow/core/kernels/reduction_ops.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" namespace tensorflow { @@ -67,7 +68,7 @@ void FillIdentityEigenImpl(const Device& d, OUT_T out, const Reducer& reducer) { template struct ReduceFunctor { template - static void Reduce(const Device& d, OUT_T out, IN_T in, + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, const ReductionAxes& reduction_axes, const Reducer& reducer); diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h index 553f8895232..71af9d88dc1 100644 --- a/tensorflow/core/kernels/reduction_ops_common.h +++ b/tensorflow/core/kernels/reduction_ops_common.h @@ -190,24 +190,24 @@ class ReductionOp : public OpKernel { Functor::FillIdentity(d, tmp_out.flat(), reducer); } else if ((helper.ndims() == 1) && helper.reduce_first_axis()) { // Reduce to a scalar. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kZero, reducer); } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) { // Can be viewed as a reduction of a matrix along 1st dimension. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kZero, reducer); } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) { // Can be viewed as a reduction of a matrix along 2nd dimension. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kOne, reducer); } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) { // Can be viewed as a reduction of a 3D tensor along 1st and 3rd // dimensions. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kZeroTwo, reducer); } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) { // Can be viewed as a reduction of a 3D tensor along 2nd dimension. - Functor::Reduce(d, helper.out(&tmp_out), helper.in(data), + Functor::Reduce(ctx, helper.out(&tmp_out), helper.in(data), constants.kOne, reducer); } else { // If we don't hit one of the cases above, transpose the data so that @@ -223,7 +223,7 @@ class ReductionOp : public OpKernel { const int64 unreduced = tmp_out.NumElements(); const int64 reduced = shuffled.NumElements() / unreduced; const Tensor& const_shuffled = shuffled; - Functor::Reduce(d, tmp_out.flat(), + Functor::Reduce(ctx, tmp_out.flat(), const_shuffled.shaped({unreduced, reduced}), constants.kOne, reducer); } @@ -258,9 +258,10 @@ namespace functor { template struct ReduceFunctorBase { template - static void Reduce(const Device& d, OUT_T out, IN_T in, + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, const ReductionAxes& reduction_axes, const Reducer& reducer) { + const Device& d = ctx->eigen_device(); ReduceEigenImpl(d, out, in, reduction_axes, reducer); } diff --git a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc index ec4490db83f..8fd9165eb9f 100644 --- a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc @@ -17,8 +17,7 @@ limitations under the License. 
#define EIGEN_USE_GPU -#include "tensorflow/core/framework/numeric_types.h" -#include "tensorflow/core/kernels/reduction_ops.h" +#include "tensorflow/core/kernels/reduction_ops_gpu_kernels.h" namespace tensorflow { namespace functor { @@ -33,15 +32,27 @@ typedef TTypes::Tensor::Index Index; template struct ReduceFunctor { template - static void Reduce(const GPUDevice& d, OUT_T out, IN_T in, + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, const ReductionAxes& reduction_axes, - const Reducer& reducer) { - ReduceEigenImpl(d, To32Bit(out), To32Bit(in), reduction_axes, reducer); + const Reducer& reducer); +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::SumReducer& reducer) { + ReduceImpl( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + cub::Sum(), T(0)); } template static void FillIdentity(const GPUDevice& d, OUT_T out, - const Reducer& reducer) { + const Eigen::internal::SumReducer& reducer) { FillIdentityEigenImpl(d, To32Bit(out), reducer); } }; @@ -49,19 +60,30 @@ struct ReduceFunctor { template struct ReduceFunctor> { template - static void Reduce(const GPUDevice& d, OUT_T out, IN_T in, + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, const ReductionAxes& reduction_axes, const Eigen::internal::MeanReducer& reducer) { - typedef typename IN_T::Index Index; - // Eigen sum reductions are much faster on GPU than mean reductions: - // Simply trigger them by computing the sum of the weighted inputs. - Index num_coeffs_to_reduce = 1; - for (int i = 0; i < Eigen::internal::array_size::value; - ++i) { - num_coeffs_to_reduce *= in.dimension(reduction_axes[i]); - } - T scale = T(1.0 / num_coeffs_to_reduce); - out.device(d) = (in * scale).sum(reduction_axes); + int divisor = 1; + if (out.rank() == 0) + divisor = in.size(); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 0) + divisor = in.dimension(0); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 1) + divisor = in.dimension(1); + else if (out.rank() == 1 && in.rank() == 3 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) + divisor = in.dimension(0) * in.dimension(2); + else if (out.rank() == 2 && in.rank() == 3 && reduction_axes[0] == 1) + divisor = in.dimension(1); + + DividesBy div_op(static_cast(divisor)); + TransformOutputIterator> itr((T*)out.data(), div_op); + ReduceImpl>, T*, + ReductionAxes>(ctx, itr, (T*)in.data(), in.rank(), + in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? 
in.dimension(2) : 1, out.rank(), + reduction_axes, cub::Sum(), T(0)); } template @@ -71,15 +93,159 @@ struct ReduceFunctor> { } }; +template <> +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MeanReducer& reducer) { + float divisor = 1.f; + if (out.rank() == 0) + divisor = in.size(); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 0) + divisor = in.dimension(0); + else if (out.rank() == 1 && in.rank() == 2 && reduction_axes[0] == 1) + divisor = in.dimension(1); + else if (out.rank() == 1 && in.rank() == 3 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) + divisor = in.dimension(0) * in.dimension(2); + else if (out.rank() == 2 && in.rank() == 3 && reduction_axes[0] == 1) + divisor = in.dimension(1); + DividesBy div_op(divisor); + + typedef cub::TransformInputIterator + inputIterType; + inputIterType input_itr((Eigen::half*)in.data(), HalfToFloat()); + + typedef TransformOutputIterator> + outputIterType; + outputIterType itr((Eigen::half*)out.data(), div_op); + + ReduceImpl( + ctx, itr, input_itr, in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + cub::Sum(), 0.f); + } + + template + static void FillIdentity( + const GPUDevice& d, OUT_T out, + const Eigen::internal::MeanReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MaxReducer& reducer) { + ReduceImpl( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + cub::Max(), std::numeric_limits::lowest()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::MaxReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::MinReducer& reducer) { + ReduceImpl( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, + cub::Min(), std::numeric_limits::max()); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::MinReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template +struct ReduceFunctor> { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::ProdReducer& reducer) { + ReduceImpl, T*, T*, ReductionAxes>( + ctx, (T*)out.data(), (T*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? 
in.dimension(2) : 1, out.rank(), reduction_axes, + Prod(), T(1)); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::ProdReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template <> +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::AndReducer& reducer) { + ReduceImpl( + ctx, (bool*)out.data(), (bool*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, And(), + true); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::AndReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + +template <> +struct ReduceFunctor { + template + static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Eigen::internal::OrReducer& reducer) { + ReduceImpl( + ctx, (bool*)out.data(), (bool*)in.data(), in.rank(), in.dimension(0), + in.rank() >= 2 ? in.dimension(1) : 1, + in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes, Or(), + false); + } + + template + static void FillIdentity(const GPUDevice& d, OUT_T out, + const Eigen::internal::OrReducer& reducer) { + FillIdentityEigenImpl(d, To32Bit(out), reducer); + } +}; + // T: the data type // REDUCER: the reducer functor // NUM_AXES: the number of axes to reduce // IN_DIMS: the number of dimensions of the input tensor -#define DEFINE(T, REDUCER, IN_DIMS, NUM_AXES) \ - template void ReduceFunctor::Reduce( \ - const GPUDevice& d, TTypes::Tensor out, \ - TTypes::ConstTensor in, \ - const Eigen::array& reduction_axes, \ +#define DEFINE(T, REDUCER, IN_DIMS, NUM_AXES) \ + template void ReduceFunctor::Reduce( \ + OpKernelContext* ctx, TTypes::Tensor out, \ + TTypes::ConstTensor in, \ + const Eigen::array& reduction_axes, \ const REDUCER& reducer); #define DEFINE_IDENTITY(T, REDUCER) \ diff --git a/tensorflow/core/kernels/reduction_ops_gpu_kernels.h b/tensorflow/core/kernels/reduction_ops_gpu_kernels.h new file mode 100644 index 00000000000..ce471c672c7 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_gpu_kernels.h @@ -0,0 +1,713 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "external/cub_archive/cub/device/device_reduce.cuh" +#include "external/cub_archive/cub/device/device_segmented_reduce.cuh" +#include "external/cub_archive/cub/iterator/counting_input_iterator.cuh" +#include "external/cub_archive/cub/iterator/transform_input_iterator.cuh" +#include "external/cub_archive/cub/warp/warp_reduce.cuh" +#include "cuda/include/cuComplex.h" +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/reduction_ops.h" +#include "tensorflow/core/lib/core/bits.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/permutation_input_iterator.h" +#include "tensorflow/core/util/transform_output_iterator.h" + +#include + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +template +struct Prod { + __host__ __device__ T operator()(const T& a, const T& b) const { + return a * b; + } +}; + +// needed to work around a compiler bug in nvcc - it doesn't seem to like +// the overloaded multiply op for std::complex +template <> +struct Prod> { + __host__ __device__ std::complex operator()( + const std::complex& a, const std::complex& b) const { + auto result = cuCmulf(make_cuComplex(a.real(), a.imag()), + make_cuComplex(b.real(), b.imag())); + return std::complex(result.x, result.y); + } +}; + +template <> +struct Prod> { + __host__ __device__ std::complex operator()( + const std::complex& a, const std::complex& b) const { + auto result = cuCmul(make_cuDoubleComplex(a.real(), a.imag()), + make_cuDoubleComplex(b.real(), b.imag())); + return std::complex(result.x, result.y); + } +}; + +template +struct DividesBy { + T divisor; + + __host__ __device__ explicit DividesBy(T divisor) : divisor(divisor) {} + + __host__ __device__ outT operator()(const T& x) const { return x / divisor; } +}; + +// needed to work around a compiler bug in nvcc - it doesn't seem to like +// the overloaded ops for std::complex +template <> +struct DividesBy> { + cuFloatComplex divisor; + + __host__ __device__ explicit DividesBy(std::complex divisor) + : divisor(make_cuComplex(divisor.real(), divisor.imag())) {} + + // implements + __host__ __device__ std::complex operator()( + const std::complex& x) const { + auto result = cuCdivf(make_cuComplex(x.real(), x.imag()), divisor); + return std::complex(result.x, result.y); + } +}; + +template <> +struct DividesBy> { + cuDoubleComplex divisor; + + __host__ __device__ explicit DividesBy(std::complex divisor) + : divisor(make_cuDoubleComplex(divisor.real(), divisor.imag())) {} + + // implements + __host__ __device__ std::complex operator()( + const std::complex& x) const { + auto result = cuCdiv(make_cuDoubleComplex(x.real(), x.imag()), divisor); + return std::complex(result.x, result.y); + } +}; + +template <> +struct DividesBy { + float divisor; + + __host__ __device__ explicit DividesBy(float divisor) : divisor(divisor) {} + + __host__ __device__ Eigen::half operator()(const float& x) const { + return Eigen::half(x / divisor); + } +}; + +struct HalfToFloat { + __host__ __device__ float operator()(const Eigen::half& x) const { + return Eigen::half_impl::half_to_float(x); + } +}; + +struct FloatToHalf { + __host__ __device__ 
Eigen::half operator()(const float& x) const { + return Eigen::half_impl::float_to_half_rtne(x); + } +}; + +struct And { + __host__ __device__ bool operator()(const bool& a, const bool& b) const { + return a && b; + } +}; + +struct Or { + __host__ __device__ bool operator()(const bool& a, const bool& b) const { + return a || b; + } +}; + +// each block does a grid strided loop and reduces its values locally +// the case of one block is used for low latency small reductions to scalars +template +__global__ void BlockReduceKernel( + T in, outT out, int num_elems, Op op, + typename std::iterator_traits::value_type initVal) { + const int bid = blockIdx.x; + const int tid = threadIdx.x; + + const int gid = bid * blockDim.x + tid; + const int stride = blockDim.x * gridDim.x; + + typedef typename std::iterator_traits::value_type value_type; + + value_type sum = initVal; + if (gid < num_elems) { + sum = in[gid]; + for (int pos = gid + stride; pos < num_elems; pos += stride) { + sum = op(sum, in[pos]); + } + } + + typedef cub::BlockReduce BlockReduce; + + __shared__ typename BlockReduce::TempStorage temp_storage; + + // only include input values in the reduction + // + // elements: ----------------- + // grid: |====|====|====|====|====| + const int num_elements_to_reduce = + max(min(num_elems - bid * blockDim.x, num_threads), 0); + + sum = BlockReduce(temp_storage) + .template Reduce(sum, op, num_elements_to_reduce); + + if (tid == 0) out[bid] = sum; +} + +// maps a warp to each row +template +__global__ void RowReduceKernel( + T in, outT out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + const int row = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + const int lane = threadIdx.x % 32; + + if (num_cols == 1) { + int gid = threadIdx.x + blockIdx.x * blockDim.x; + if (gid < num_rows) out[gid] = in[gid]; + return; + } + + value_type sum = initVal; + int col = lane; + + if (row < num_rows && col < num_cols) { + sum = in[row * num_cols + col]; + col += 32; + for (; col < num_cols; col += 32) { + sum = op(sum, in[row * num_cols + col]); + } + } + + typedef cub::WarpReduce WarpReduce; + + __shared__ typename WarpReduce::TempStorage temp_storage; + + sum = WarpReduce(temp_storage).template Reduce(sum, op, min(num_cols, 32)); + + if (row < num_rows && lane == 0) out[row] = sum; +} + +// Works only if there are <= 16 columns +// each warps sums over multiple rows at once +template +__global__ void ColumnReduceMax16ColumnsKernel( + T in, outT out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + int rows_per_warp = 32 / num_cols; + + const int lane = threadIdx.x % 32; + const int lane_row = lane / num_cols; + + const int start_row_warp = + rows_per_warp * (blockIdx.y * blockDim.y + threadIdx.y); + const int start_row_lane = start_row_warp + lane_row; + int row = start_row_lane; + int col = lane % num_cols; + + value_type sum = initVal; + if (row * num_cols + col < num_rows * num_cols) + sum = in[row * num_cols + col]; + + __shared__ value_type partial_sums[32][33]; + + row += rows_per_warp * gridDim.y * blockDim.y; + for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) { + int global_pos = row * num_cols + col; + if (global_pos < (num_rows * num_cols)) + sum = op(sum, in[row * num_cols + col]); + } + + const int rows_in_this_warp = min(rows_per_warp, num_rows - start_row_warp); + 
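The BlockReduceKernel and RowReduceKernel above combine a grid-stride (or column-stride) accumulation loop with a CUB collective. The same structure can be exercised in isolation; the following is a minimal sketch, not part of this change, assuming CUB is available via <cub/cub.cuh> and using a plain float sum (the kernel name SketchBlockReduce and the host-side follow-up it mentions are hypothetical):

#include <cub/cub.cuh>

// Each block accumulates a grid-strided slice of `in` locally, then performs
// one cub::BlockReduce and writes a single partial result per block; a
// follow-up pass over block_out finishes the reduction.
template <int kBlockThreads>
__global__ void SketchBlockReduce(const float* in, float* block_out, int n) {
  typedef cub::BlockReduce<float, kBlockThreads> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  float sum = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    sum += in[i];
  }
  sum = BlockReduce(temp_storage).Reduce(sum, cub::Sum());
  if (threadIdx.x == 0) block_out[blockIdx.x] = sum;
}

The kernels in this file additionally pass an explicit valid-element count to the CUB collective so that lanes past the end of the input never mix the initial value into the result.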
// not the most efficient way to do this sum + for (int i = 1; i < rows_in_this_warp; ++i) { + value_type tmp = + cub::ShuffleIndex(sum, threadIdx.x + i * num_cols, 32, 0xffffffff); + if (lane < num_cols) sum = op(sum, tmp); + } + + if (lane < num_cols) partial_sums[lane][threadIdx.y] = sum; + + __syncthreads(); + + if (threadIdx.y == 0 && threadIdx.x < num_cols) { + value_type s = partial_sums[threadIdx.x][0]; + + if (blockDim.y > 1) { + for (int row = 1; row < blockDim.y; ++row) { + s = op(s, partial_sums[threadIdx.x][row]); + } + } + + out[col * gridDim.y + blockIdx.y] = s; + } +} + +// Maps each block to a column range 32 wide +template +__global__ void ColumnReduceKernel( + T in, outT out, int num_rows, int num_cols, Op op, + typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * 32 + threadIdx.x; + + value_type sum = initVal; + if (row < num_rows && col < num_cols) + sum = in[row * num_cols + col]; + + __shared__ value_type partial_sums[32][33]; + + row += gridDim.y * blockDim.y; + + if (col < num_cols) { + for (; row < num_rows; row += gridDim.y * blockDim.y) { + sum = op(sum, in[row * num_cols + col]); + } + } + + partial_sums[threadIdx.x][threadIdx.y] = sum; + + __syncthreads(); + + if (threadIdx.y == 0 && col < num_cols) { + value_type s = partial_sums[threadIdx.x][0]; + + // only include input values in the reduction + // elem block_rows + // - = + // - = + // # # block boundary + // - = + // - = + // # # block boundary + // - = + // = + const int numRowsThisBlock = + min(blockDim.y, num_rows - blockIdx.y * blockDim.y); + + for (int row = 1; row < numRowsThisBlock; ++row) { + s = op(s, partial_sums[threadIdx.x][row]); + } + + out[col * gridDim.y + blockIdx.y] = s; + } +} + +// does multiple warp size segmented reductions in parallel +// segments cannot cross warp boundaries (mainly used for reducing the segments +// that come from the Max16Columns column reduction kernel) +template +__global__ void CleanupSegments( + T partial_sums, outT out, int num_rows, int num_cols, int segment_size, + Op op, typename std::iterator_traits::value_type initVal) { + typedef typename std::iterator_traits::value_type value_type; + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + value_type val = initVal; + if (tid < segment_size * num_cols) + val = partial_sums[tid]; + + typedef cub::WarpReduce WarpReduce; + + __shared__ typename WarpReduce::TempStorage temp_storage; + + const bool head_flag = (threadIdx.x % segment_size) == 0; + value_type sum = + WarpReduce(temp_storage).HeadSegmentedReduce(val, head_flag, op); + + if (head_flag && tid < segment_size * num_cols) { + out[tid / segment_size] = sum; + } +} + +// assigns one thread to a column +template +__global__ void ColumnReduceSimpleKernel(T in, outT out, int num_planes, + int num_rows, int num_cols, Op op) { + typedef typename std::iterator_traits::value_type value_type; + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + const int elems_per_plane = num_rows * num_cols; + + const int plane = gid / num_cols; + const int col = gid % num_cols; + + if (plane >= num_planes) return; + + if (num_rows == 1) { + out[plane * elems_per_plane + col] = in[plane * elems_per_plane + col]; + return; + } + + value_type sum = op(in[plane * elems_per_plane + col], + in[plane * elems_per_plane + num_cols + col]); + for (int row = 2; row < num_rows; ++row) { + sum = op(sum, in[plane * elems_per_plane + row * 
num_cols + col]); + } + + out[plane * num_cols + col] = sum; +} + +struct RowOffset { + __host__ __device__ explicit RowOffset(const int& cols) : cols_(cols) {} + + __host__ __device__ int operator()(const int& x) const { return cols_ * x; } + + int cols_; +}; + +struct GatherOp { + __host__ __device__ GatherOp(const int& extent_x, const int& extent_y, + const int& extent_z, bool kOne) + : extent_x_(extent_x), + extent_y_(extent_y), + extent_z_(extent_z), + kOne_(kOne) { + if (kOne_) + group_size_ = extent_y_; + else + group_size_ = extent_x_ * extent_z_; + } + + __host__ __device__ int operator()(const int& ind) const { + const int group = kOne_ ? ind / group_size_ : ind % group_size_; + const int offset = kOne_ ? ind % group_size_ : ind / group_size_; + + const int x = group / extent_z_; + const int z = group % extent_z_; + + return x * extent_y_ * extent_z_ + z + offset * extent_z_; + } + + int extent_x_; + int extent_y_; + int extent_z_; + bool kOne_; + int group_size_; +}; + +template +void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in, + int in_size, Op op, T init, + const cudaStream_t& cu_stream) { + // handle situations where low latency is important better than CUB + if (in_size <= 4096) { + const int num_blocks = 1; + const int num_threads = 256; + BlockReduceKernel + <<>>(in, out, in_size, op, init); + return; + } else if (in_size <= 1 << 19) { + const int num_threads = 256; + const int num_blocks = min(32, Eigen::divup(in_size, num_threads)); + // it seems like tailoring this to the GPU + // would be more effective, but all attempts + // at making this a multiple of the number of + // multiprocessors have lead to lower perf + // in general + // TODO(eriche) investigate this more + + Tensor temp_storage; + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(num_blocks * sizeof(T))}), + &temp_storage)); + + BlockReduceKernel + <<>>( + in, (T*)temp_storage.flat().data(), in_size, op, init); + + // take care that we only reduce blocks that had some valid elements in them + // TODO(eriche): CUB currently has a bug in HeadSegmentedReduce that + // requires it to be used with a full warp. Can reduce 32 -> num_blocks + // when this is fixed. + CleanupSegments<<<1, 32, 0, cu_stream>>>( + (T*)temp_storage.flat().data(), out, 1, 1, num_blocks, op, + init); + return; + } + std::size_t temp_storage_bytes = 0; + + Tensor temp_storage; + // written as a loop because it reduces clutter + // first pass allocates memory, second launches kernel(s) + for (int i = 0; i < 2; ++i) { + auto success = cub::DeviceReduce::Reduce( + i == 0 ? 
nullptr : temp_storage.flat().data(), + temp_storage_bytes, in, out, in_size, op, init, cu_stream); + + OP_REQUIRES( + ctx, success == 0, + errors::Internal("CUB reduce error", cudaGetErrorString(success))); + + if (i == 0) + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + } +} + +template +void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows, + int num_cols, Op op, T init, + const cudaStream_t& cu_stream) { + if (num_cols < 1024) { + const int threads_per_block = 128; + const int warps_per_block = threads_per_block / 32; + int num_blocks = (num_rows + warps_per_block - 1) / warps_per_block; + + RowReduceKernel<<>>( + in, out, num_rows, num_cols, op, init); + return; + } + + // setup segment offsets with counting and transform iterator + RowOffset row_offset_op(num_cols); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + transform_iter(counting_iter, row_offset_op); + + std::size_t temp_storage_bytes = 0; + Tensor temp_storage; + for (int i = 0; i < 2; ++i) { + auto success = cub::DeviceSegmentedReduce::Reduce( + i == 0 ? nullptr : temp_storage.flat().data(), + temp_storage_bytes, in, out, num_rows, transform_iter, + transform_iter + 1, op, init, cu_stream); + + OP_REQUIRES(ctx, success == 0, + errors::Internal("CUB segmented reduce error", + cudaGetErrorString(success))); + + if (i == 0) + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + } +} + +template +void LaunchColumnReduction_LTE16Cols(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, T init, + const cudaStream_t& cu_stream) { + int rows_per_warp = 32 / extent_y; + dim3 block_dim(32, min(Eigen::divup(extent_x, rows_per_warp), 32), 1); + dim3 grid_dim(1, + Eigen::divup(static_cast(extent_x), + rows_per_warp * block_dim.y), + 1); + + grid_dim.y = min((int)grid_dim.y, 32); + + if (grid_dim.y > 2 && grid_dim.y < 32) { + int log2 = Log2Floor(grid_dim.y); + grid_dim.y = 1 << log2; + } + + if (grid_dim.y == 1) { + ColumnReduceMax16ColumnsKernel<<>>( + in, out, extent_x, extent_y, op, init); + } else { + Tensor temp_storage; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT8, + TensorShape({static_cast( + sizeof(T) * extent_y * grid_dim.y)}), + &temp_storage)); + ColumnReduceMax16ColumnsKernel<<>>( + in, (T*)temp_storage.flat().data(), extent_x, extent_y, op, + init); + + dim3 new_grid_dim((grid_dim.y * extent_y + 31) / 32, 1, 1); + dim3 num_threads(128, 1, 1); + CleanupSegments<<>>( + (T*)temp_storage.flat().data(), out, extent_x, extent_y, + grid_dim.y, op, init); + } +} + +template +void LaunchColumnReduction_LTE4096Cols(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, + T init, const cudaStream_t& cu_stream) { + dim3 block_dim(32, min(extent_x, 32), 1); + dim3 grid_dim((extent_y + 31) / 32, 1, 1); + + if (grid_dim.x < 16) grid_dim.y = min((extent_x + 31) / 32, 32); + + if (grid_dim.y > 2 && grid_dim.y < 32) { + int log2 = Log2Floor(grid_dim.y); + grid_dim.y = 1 << log2; + } + + if (grid_dim.y == 1) { + ColumnReduceKernel<<>>( + in, out, extent_x, extent_y, op, init); + } else { + Tensor temp_storage; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT8, + TensorShape({static_cast( + sizeof(T) * extent_y * grid_dim.y)}), + &temp_storage)); + + ColumnReduceKernel<<>>( + in, (T*)temp_storage.flat().data(), extent_x, extent_y, op, + init); + + dim3 new_grid_dim((grid_dim.y * 
extent_y + 31) / 32, 1, 1); + dim3 num_threads(128, 1, 1); + CleanupSegments<<>>( + (T*)temp_storage.flat().data(), out, extent_x, extent_y, + grid_dim.y, op, init); + } +} + +template +void LaunchColumnReduction(OpKernelContext* ctx, OUT_T out, IN_T in, + int extent_x, int extent_y, Op op, T init, + const cudaStream_t& cu_stream) { + if (extent_y <= 16) { + LaunchColumnReduction_LTE16Cols(ctx, out, in, extent_x, extent_y, op, init, + cu_stream); + } else if (extent_y <= 4096) { + LaunchColumnReduction_LTE4096Cols(ctx, out, in, extent_x, extent_y, op, + init, cu_stream); + } else { + int threads_per_block = 128; + int num_blocks = Eigen::divup(extent_y, threads_per_block); + + ColumnReduceSimpleKernel<<>>( + in, out, 1, extent_x, extent_y, op); + } +} + +template +void Launch3DYReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x, + int extent_y, int extent_z, Op op, T init, + const cudaStream_t& cu_stream) { + int threads_per_block = 128; + int num_blocks = + (extent_x * extent_z + threads_per_block - 1) / threads_per_block; + + // TODO(eriche): this won't be very good in the case of small x + // small z and large y. + ColumnReduceSimpleKernel<<>>( + in, out, extent_x, extent_y, extent_z, op); +} + +template +void Launch3DXZReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x, + int extent_y, int extent_z, Op op, T init, + const cudaStream_t& cu_stream) { + // setup segment offsets with counting and transform iterator + RowOffset row_offset_op(extent_x * extent_z); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + transform_iter(counting_iter, row_offset_op); + + GatherOp gather_op(extent_x, extent_y, extent_z, false); + typedef cub::TransformInputIterator> + gatherIterType; + gatherIterType gather_iter(counting_iter, gather_op); + + PermutationInputIterator permute_iter(in, + gather_iter); + + std::size_t temp_storage_bytes = 0; + Tensor temp_storage; + + for (int i = 0; i < 2; ++i) { + auto success = cub::DeviceSegmentedReduce::Reduce( + i == 0 ? 
nullptr : temp_storage.flat().data(), + temp_storage_bytes, permute_iter, out, extent_y, transform_iter, + transform_iter + 1, op, init, cu_stream); + + OP_REQUIRES(ctx, success == 0, + errors::Internal("CUB segmented reduce error", + cudaGetErrorString(success))); + + if (i == 0) + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), + &temp_storage)); + } +} + +template +void ReduceImpl(OpKernelContext* ctx, OUT_T out, IN_T in, int in_rank, + int in_dim0, int in_dim1, int in_dim2, int out_rank, + const ReductionAxes& reduction_axes, Op op, T init) { + const cudaStream_t& cu_stream = GetCudaStream(ctx); + if (out_rank == 0) { + const int in_size = in_dim0 * in_dim1 * in_dim2; + LaunchScalarReduction(ctx, out, in, in_size, op, init, cu_stream); + } else if (in_rank == 2 && out_rank == 1 && + reduction_axes[0] == 1) { // row reduction + LaunchRowReduction(ctx, out, in, in_dim0, in_dim1, op, init, cu_stream); + } else if (in_rank == 2 && out_rank == 1 && + reduction_axes[0] == 0) { // column reduction + LaunchColumnReduction(ctx, out, in, in_dim0, in_dim1, op, init, cu_stream); + } else if (in_rank == 3 && out_rank == 2 && reduction_axes[0] == 1) { + Launch3DYReduction(ctx, out, in, in_dim0, in_dim1, in_dim2, op, init, + cu_stream); + } else if (in_rank == 3 && out_rank == 1 && reduction_axes[0] == 0 && + reduction_axes[1] == 2) { + Launch3DXZReduction(ctx, out, in, in_dim0, in_dim1, in_dim2, op, init, + cu_stream); + } else { + std::stringstream ss; + ss << "Invalid reduction requested: in_rank, out_rank, axes " << in_rank + << " " << out_rank; + if (out_rank == 1) ss << " " << reduction_axes[0]; + if (out_rank == 2) ss << " " << reduction_axes[1]; + LOG(FATAL) << ss.str(); + } +} + +} // namespace functor +} // namespace tensorflow + +#endif diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc index 9cdebdd4f23..9bbe993a2f9 100644 --- a/tensorflow/core/kernels/reduction_ops_test.cc +++ b/tensorflow/core/kernels/reduction_ops_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -22,14 +23,59 @@ namespace tensorflow { // Creates a Graph which "reduce"s a 3D float tensor of "num" elements // into a scalar. 
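Stepping back to the launch helpers above: outside the custom-kernel paths they rely on CUB's two-pass calling convention, where a first call with a null workspace pointer only reports the required temporary-storage size (the patch folds the two calls into a two-iteration loop). A minimal host-side sketch of the same idiom, with cudaMalloc standing in for ctx->allocate_temp and a plain sum standing in for the templated reducer:

#include <cuda_runtime.h>
#include <cub/cub.cuh>

// Pass 1: d_temp == nullptr, CUB fills in temp_bytes. Pass 2: run the reduce.
cudaError_t SumWithCub(const float* d_in, float* d_out, int n,
                       cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cudaError_t err =
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
  if (err != cudaSuccess) return err;
  err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;
  err = cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaFree(d_temp);
  return err;
}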
-static Graph* ToScalar(const string& reduce, int num) { - Graph* g = new Graph(OpRegistry::Global()); - Tensor data(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)})); - data.flat().setRandom(); - Tensor axes(DT_INT32, TensorShape({3})); +template +static Graph* ToScalar(const string& reduce, int num_x, int num_y) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DataTypeToEnum::value, TensorShape({num_x, num_y})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({2})); axes.flat()(0) = 0; axes.flat()(1) = 1; - axes.flat()(2) = 2; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static Graph* ColReduce(const string& reduce, int num_x, int num_y) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({num_x, num_y})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({1})); + axes.flat()(0) = 0; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static Graph* RowReduce(const string& reduce, int num_x, int num_y) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({num_x, num_y})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({1})); + axes.flat()(0) = 1; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static Graph* ThreeDYReduce(const string& reduce, int num_y, int num_z) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({1})); + axes.flat()(0) = 1; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) { + auto* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z})); + data.flat().setRandom(); + Tensor axes(DT_INT32, TensorShape({2})); + axes.flat()(0) = 0; + axes.flat()(1) = 2; test::graph::Reduce(g, reduce, test::graph::Constant(g, data), test::graph::Constant(g, axes)); return g; @@ -37,51 +83,100 @@ static Graph* ToScalar(const string& reduce, int num) { // Creates a bench which reduces a 3D tensor with total "num" floats // into a scalar on a "device". Runs the bench for "iters" times. 
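The benchmark graph builders above all follow the same recipe: a random constant tensor, a constant axes vector, and a Reduce node. A hypothetical companion builder that reduces a 3-D tensor over its outermost axis only (not part of this change, shown purely to illustrate the pattern) would look like:

static Graph* ThreeDXReduce(const string& reduce, int num_y, int num_z) {
  auto* g = new Graph(OpRegistry::Global());
  Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z}));
  data.flat<float>().setRandom();
  Tensor axes(DT_INT32, TensorShape({1}));
  axes.flat<int32>()(0) = 0;
  test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
                      test::graph::Constant(g, axes));
  return g;
}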
+template static void ReduceToScalar(int iters, const string& device, - const string& reduce, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * sizeof(float)); - test::Benchmark(device, ToScalar(reduce, num)).Run(iters); + const string& reduce, int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(T)); + test::Benchmark(device, ToScalar(reduce, num_x, num_y)).Run(iters); } -static void BM_Sum3DToScalarCPU(int iters, int num) { - ReduceToScalar(iters, "cpu", "Sum", num); +static void DoRowReduce(int iters, const string& device, const string& reduce, + int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(float)); + test::Benchmark(device, RowReduce(reduce, num_x, num_y)).Run(iters); } -BENCHMARK(BM_Sum3DToScalarCPU)->Range(1 << 13, 1 << 20); -static void BM_Max3DToScalarCPU(int iters, int num) { - ReduceToScalar(iters, "cpu", "Max", num); +static void DoColReduce(int iters, const string& device, const string& reduce, + int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(float)); + test::Benchmark(device, ColReduce(reduce, num_x, num_y)).Run(iters); } -BENCHMARK(BM_Max3DToScalarCPU)->Range(1 << 13, 1 << 20); -static void BM_Prod3DToScalarCPU(int iters, int num) { - ReduceToScalar(iters, "cpu", "Prod", num); +static void Do3DYReduce(int iters, const string& device, const string& reduce, + int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(float)); + test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y)).Run(iters); } -BENCHMARK(BM_Prod3DToScalarCPU)->Range(1 << 13, 1 << 20); -static void BM_Mean3DToScalarCPU(int iters, int num) { - ReduceToScalar(iters, "cpu", "Mean", num); +static void Do3DXZReduce(int iters, const string& device, const string& reduce, + int num_x, int num_y) { + testing::ItemsProcessed(static_cast(iters) * num_x * num_y); + testing::BytesProcessed(static_cast(iters) * num_x * num_y * + sizeof(float)); + test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y)).Run(iters); } -BENCHMARK(BM_Mean3DToScalarCPU)->Range(1 << 13, 1 << 20); -static void BM_Sum3DToScalarGPU(int iters, int num) { - ReduceToScalar(iters, "gpu", "Sum", num); +static void BM_Sum2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Sum", num_x, num_y); } -BENCHMARK(BM_Sum3DToScalarGPU)->Range(1 << 13, 1 << 20); +BENCHMARK(BM_Sum2DToScalarGPU)->RangePair(1, 8192, 1, 8192); -static void BM_Max3DToScalarGPU(int iters, int num) { - ReduceToScalar(iters, "gpu", "Max", num); +static void BM_Sum2DToScalarGPUComplex(int iters, int num_x, int num_y) { + ReduceToScalar>(iters, "gpu", "Sum", num_x, num_y); } -BENCHMARK(BM_Max3DToScalarGPU)->Range(1 << 13, 1 << 20); +BENCHMARK(BM_Sum2DToScalarGPUComplex)->RangePair(1, 8192, 1, 8192); -static void BM_Prod3DToScalarGPU(int iters, int num) { - ReduceToScalar(iters, "gpu", "Prod", num); +static void BM_Sum2DToScalarGPUHalf(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Sum", num_x, num_y); } -BENCHMARK(BM_Prod3DToScalarGPU)->Range(1 << 13, 1 << 20); +BENCHMARK(BM_Sum2DToScalarGPUHalf)->RangePair(1, 8192, 1, 8192); -static void 
BM_Mean3DToScalarGPU(int iters, int num) { - ReduceToScalar(iters, "gpu", "Mean", num); +static void BM_Sum2DRowReduceGPU(int iters, int num_x, int num_y) { + DoRowReduce(iters, "gpu", "Sum", num_x, num_y); } -BENCHMARK(BM_Mean3DToScalarGPU)->Range(1 << 13, 1 << 20); +BENCHMARK(BM_Sum2DRowReduceGPU)->RangePair(1, 8192, 1, 8192); + +static void BM_Sum2DColumnReduceGPU(int iters, int num_x, int num_y) { + DoColReduce(iters, "gpu", "Sum", num_x, num_y); +} +BENCHMARK(BM_Sum2DColumnReduceGPU)->RangePair(1, 8192, 1, 8192); + +static void BM_Sum3DYReduceGPU(int iters, int num_x, int num_y) { + Do3DYReduce(iters, "gpu", "Sum", num_x, num_y); +} +BENCHMARK(BM_Sum3DYReduceGPU)->RangePair(64, 4096, 64, 4096); + +static void BM_Sum3DXZReduceGPU(int iters, int num_x, int num_y) { + Do3DXZReduce(iters, "gpu", "Sum", num_x, num_y); +} +BENCHMARK(BM_Sum3DXZReduceGPU)->RangePair(64, 4096, 64, 4096); + +static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Mean", num_x, num_y); +} +BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); + +static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Max", num_x, num_y); +} +BENCHMARK(BM_Max2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); + +static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "Min", num_x, num_y); +} +BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); + +static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) { + ReduceToScalar(iters, "gpu", "All", num_x, num_y); +} +BENCHMARK(BM_Bool2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); } // end namespace tensorflow diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index 12b10d61da8..98f3718c128 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -431,7 +431,16 @@ class ResourceGatherOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU); TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU); +// Registers GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type) + +TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_GATHER_GPU); + +#endif // GOOGLE_CUDA + #undef REGISTER_GATHER_CPU +#undef REGISTER_GATHER_GPU #undef REGISTER_GATHER_ALL_INDICES #undef REGISTER_GATHER_FULL diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc index 80d49017406..6b06cf650a8 100644 --- a/tensorflow/core/kernels/save_restore_tensor.cc +++ b/tensorflow/core/kernels/save_restore_tensor.cc @@ -216,9 +216,12 @@ void RestoreTensor(OpKernelContext* context, if (output_shape.num_elements() == 0) return; -#define READER_COPY(T) \ - case DataTypeToEnum::value: \ - reader->CopySliceData(tensor_name, slice_to_load, t->flat().data()); \ +#define READER_COPY(T) \ + case DataTypeToEnum::value: \ + OP_REQUIRES(context, \ + reader->CopySliceData(tensor_name, slice_to_load, \ + t->flat().data()), \ + errors::InvalidArgument("Error copying slice data")); \ break; switch (type) { diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc index 9cdbe89457c..8f7eff113cd 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.cc +++ b/tensorflow/core/kernels/segment_reduction_ops.cc @@ -16,6 +16,9 @@ limitations under the License. // See docs in ../ops/math_ops.cc. 
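For reference, the READER_COPY change in save_restore_tensor.cc above means the boolean result of CopySliceData is no longer ignored; one instantiation of the macro now expands, roughly, to the following switch case (expansion shown for illustration only):

case DataTypeToEnum<float>::value:
  OP_REQUIRES(context,
              reader->CopySliceData(tensor_name, slice_to_load,
                                    t->flat<float>().data()),
              errors::InvalidArgument("Error copying slice data"));
  break;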
#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA #include "tensorflow/core/kernels/segment_reduction_ops.h" #include @@ -32,6 +35,15 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/util.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/kernels/cuda_solvers.h" +#include "tensorflow/core/platform/cuda.h" + +using ::perftools::gputools::cuda::ScopedActivateExecutorContext; +#endif // GOOGLE_CUDA + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; @@ -183,6 +195,105 @@ class SegmentReductionOp : public OpKernel { } }; +#ifdef GOOGLE_CUDA +// SegmentSumGPUOp is a segment sum operator implemented for GPU only. +// TODO: This implementation of SegmentSumGPUOp is sometimes slower than +// its unsorted counterpart (mostly when problem size is small). +// This is due to the following two main reasons and a cost-effective way +// to resolve these problems is desirable. +// 1. Sorted segment sum requires a memory transfer from device to host in +// order to know the size of the output dimension whereas unsorted segment +// sum receives the size of the output dimension as an input parameter. +// 2. Sorted segment sum is essentially a tiled version of unsorted segment +// sum and therefore such optimization comes at an inherent cost. However +// such cost may not be justified when the problem size is small. When to +// use the tiled version or the untiled version depends on many factors +// including data alignments, ratio of calculation to memory traffic and +// obviously, the problem sizes. +template +class SegmentSumGPUOp : public AsyncOpKernel { + public: + explicit SegmentSumGPUOp(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& input = context->input(0); + const Tensor& segment_ids = context->input(1); + + OP_REQUIRES_ASYNC( + context, TensorShapeUtils::IsVector(segment_ids.shape()), + errors::InvalidArgument("segment_ids should be a vector."), done); + + const int64 num_indices = segment_ids.NumElements(); + OP_REQUIRES_ASYNC( + context, num_indices == input.dim_size(0), + errors::InvalidArgument( + "segment_ids should be the same size as dimension 0 of" + " input."), + done); + + if (num_indices == 0) { + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, 0); + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + done(); + return; + } + + perftools::gputools::DeviceMemoryBase output_rows_device( + (void*)(segment_ids.template flat().data() + (num_indices - 1))); + ScratchSpace output_rows_host(context, 1, /* on_host */ true); + + auto stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC( + context, stream + ->ThenMemcpy(output_rows_host.mutable_data(), + output_rows_device, sizeof(Index)) + .ok(), + errors::Internal( + "SegmentSumGPUOp: failed to copy output_rows from device"), + done); + + functor::SegmentSumFunctor functor_; + auto create_and_check_output = [context, output_rows_host, &input, + &segment_ids, &functor_, done]() { + // Ensure that within the callback, the proper GPU settings are + // configured. 
+ auto stream = context->op_device_context()->stream(); + ScopedActivateExecutorContext scoped_activation{stream->parent()}; + + Index output_rows = *output_rows_host.data(); + output_rows++; + OP_REQUIRES_ASYNC(context, output_rows > 0, + errors::InvalidArgument("segment ids must be >= 0"), + done); + + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, output_rows); + + Tensor* output = nullptr; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output), done); + + auto output_flat = output->flat_outer_dims(); + auto data_ptr = input.template flat().data(); + auto segment_flat = segment_ids.flat(); + functor_(context, context->eigen_device(), output_rows, + segment_ids.shape(), segment_flat, input.NumElements(), data_ptr, + output_flat); + + done(); + }; + + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, create_and_check_output); + } +}; +#endif // GOOGLE_CUDA + #define REGISTER_CPU_KERNEL_SEGMENT(name, functor, type, index_type, \ default_value) \ REGISTER_KERNEL_BUILDER( \ @@ -227,6 +338,23 @@ REGISTER_COMPLEX_CPU_KERNELS_ALL(complex128); #undef REGISTER_REAL_CPU_KERNELS_ALL #undef REGISTER_COMPLEX_CPU_KERNELS_ALL +#if GOOGLE_CUDA +#define REGISTER_GPU_SORTED_KERNELS(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("SegmentSum") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + SegmentSumGPUOp) + +#define REGISTER_GPU_SORTED_KERNELS_ALL(type) \ + REGISTER_GPU_SORTED_KERNELS(type, int32); \ + REGISTER_GPU_SORTED_KERNELS(type, int64); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL); +#undef REGISTER_GPU_SORTED_KERNELS +#undef REGISTER_GPU_SORTED_KERNELS_ALL +#endif // GOOGLE_CUDA + namespace functor { // UnsortedSegmentSumFunctor implementation for CPUDevice. diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h index ee09c213b7c..412c1d601d3 100644 --- a/tensorflow/core/kernels/segment_reduction_ops.h +++ b/tensorflow/core/kernels/segment_reduction_ops.h @@ -26,6 +26,28 @@ namespace tensorflow { class OpKernelContext; namespace functor { + +#ifdef GOOGLE_CUDA +typedef Eigen::GpuDevice GPUDevice; +// Functor for SegmentSumGPUOp. +// 'output_rows': the number of output segments (unique segment ids in +// 'segment_ids'). +// 'segment_ids_shape': shape of 'segment_ids' tensor. +// 'segment_ids': unsorted map from input to output segment ids at which to +// perform segment sum operation. +// 'data_size': size of input data tensor. +// 'data': input data tensor. +// 'output': output reshaped to {output_rows, output.size/output_rows} +template +struct SegmentSumFunctor { + void operator()(OpKernelContext* ctx, const GPUDevice& d, + const Index output_rows, const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, + const Index data_size, const T* data, + typename TTypes::Tensor output); +}; +#endif + // BaseFunctor for definition of UnsorteSegmentReductionOp // for usage without templates. 
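SegmentSumGPUOp above cannot size its output until it has read the last (largest) segment id back from the device, so it issues an asynchronous copy and finishes the computation in a callback scheduled on the GPU event manager. A minimal sketch of that device-to-host handoff in plain CUDA (the real op goes through StreamExecutor's ThenMemcpy and EventMgr::ThenExecute; ReadNumSegmentsThen and its blocking synchronize are simplifications):

#include <cuda_runtime.h>

// Copy the last sorted segment id to the host, then hand the segment count
// (last id + 1) to a continuation. Pinned host memory and a real callback
// would avoid the blocking synchronize used here for brevity.
template <typename Continuation>
cudaError_t ReadNumSegmentsThen(const int* d_segment_ids, int num_indices,
                                cudaStream_t stream, Continuation done) {
  int last_id = 0;
  cudaError_t err =
      cudaMemcpyAsync(&last_id, d_segment_ids + num_indices - 1, sizeof(int),
                      cudaMemcpyDeviceToHost, stream);
  if (err != cudaSuccess) return err;
  err = cudaStreamSynchronize(stream);
  if (err != cudaSuccess) return err;
  done(last_id + 1);
  return cudaSuccess;
}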
template diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc index b132b1e8f8b..26fcafee34a 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc @@ -54,6 +54,77 @@ __device__ __forceinline__ void AccumulateInto( CudaAtomicAdd(dest_scalar + 1, value.imag()); } +// SortedSegmentSumFunctor kernel reduces input data just as +// UnsortedSegmentSumCustomKernel does except that input data +// is partitioned along the outer reduction dimension. This is +// because consecutive rows (elements in a row share the same +// outer dimension index) in the flattened 2D input data likely +// belong to the same segment in sorted segment sum operation. +// Therefore such partitioning strategy has two advantages over +// the UnsortedSegmentSumFunctor kernel: +// 1. Each thread reduces across multiple rows before writing +// answers to the global memory, we can therefore +// write reduction results to global memory less often. +// 2. We may know that the current thread is the only contributor +// to an output element because of the increasing nature of segment +// ids. In such cases, we do not need to use atomic operations +// to write results to global memory. +// In the flattened view of input data (with only outer and inner +// dimension), every thread processes a strip of input data of +// size OuterDimTileSize x 1. This strip runs across multiple +// rows of input data and all reduction elements share one inner +// dimension index. +template +__global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size, + const Index inner_dim_size, + const Index output_outer_dim_size, + const Index* segment_ids, + const T* input, T* output, + const Index total_stripe_count) { + CUDA_1D_KERNEL_LOOP(stripe_index, total_stripe_count) { + const Index segment_offset = stripe_index % inner_dim_size; + const Index input_outer_dim_index_base = + stripe_index / inner_dim_size * Index(OuterDimTileSize); + + T sum = T(0); + Index first_segment_id = segment_ids[input_outer_dim_index_base]; + Index last_output_segment_id = output_outer_dim_size; + + const Index actual_stripe_height = + min(Index(OuterDimTileSize), + input_outer_dim_size - input_outer_dim_index_base); + for (Index j = 0; j < actual_stripe_height; j++) { + Index current_output_segment_id = + segment_ids[input_outer_dim_index_base + j]; + // Decide whether to write result to global memory. + // Result is only written to global memory if we move + // to another segment. Otherwise we can keep accumulating + // locally. + if (current_output_segment_id > last_output_segment_id) { + const Index output_index = + last_output_segment_id * inner_dim_size + segment_offset; + // decide whether to write result to global memory using atomic + // operations + if (last_output_segment_id == first_segment_id) { + AccumulateInto(output + output_index, sum); + } else { + *(output + output_index) = sum; + } + sum = T(0); + } + sum += ldg(input + (input_outer_dim_index_base + j) * inner_dim_size + + segment_offset); + last_output_segment_id = current_output_segment_id; + } + // For the last result in a strip, always write using atomic operations + // due to possible race conditions with threads computing + // the following strip. 
+ const Index output_index = + last_output_segment_id * inner_dim_size + segment_offset; + AccumulateInto(output + output_index, sum); + } +} + // UnsortedSegmentSumFunctor kernel processes 'input_total_size' elements. // Each element is mapped from input to output by a combination of its // 'segment_ids' mapping and 'inner_dim_size'. @@ -80,6 +151,47 @@ __global__ void UnsortedSegmentSumCustomKernel( namespace functor { +template +void SegmentSumFunctor::operator()( + OpKernelContext* ctx, const GPUDevice& d, const Index output_rows, + const TensorShape& segment_ids_shape, + typename TTypes::ConstFlat segment_ids, const Index data_size, + const T* data, typename TTypes::Tensor output) { + if (output.size() == 0) { + return; + } + // Set 'output' to zeros. + CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d); + SetZero<<>>( + output.size(), output.data()); + if (data_size == 0 || segment_ids_shape.num_elements() == 0) { + return; + } + + // Launch kernel to compute sorted segment sum. + // Notes: + // *) 'input_total_size' is the total number of elements to process. + // *) 'segment_ids.shape' is a prefix of data's shape. + // *) 'input_outer_dim_size' is the total number of segments to process. + const Index input_total_size = data_size; + const Index input_outer_dim_size = segment_ids.dimension(0); + const Index input_inner_dim_size = input_total_size / input_outer_dim_size; + + const int OuterDimTileSize = 8; + + const Index input_outer_dim_num_stripe = + Eigen::divup(input_outer_dim_size, Index(OuterDimTileSize)); + + const Index total_stripe_count = + input_inner_dim_size * input_outer_dim_num_stripe; + + config = GetCudaLaunchConfig(total_stripe_count, d); + SortedSegmentSumCustomKernel<<< + config.block_count, config.thread_per_block, 0, d.stream()>>>( + input_outer_dim_size, input_inner_dim_size, output_rows, + segment_ids.data(), data, output.data(), total_stripe_count); +}; + // UnsortedSegmentSumFunctor implementation for GPUDevice. 
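The sorted kernel above uses atomics only at tile boundaries; within a tile it accumulates locally and writes once per segment change. Its intended semantics are easiest to see in a plain CPU reference (illustrative only, not part of this change): sorted, non-negative segment ids, one output row per id, and empty segments left at zero.

#include <vector>

std::vector<float> SortedSegmentSumReference(const std::vector<float>& data,
                                             const std::vector<int>& ids,
                                             int inner_dim) {
  const int num_rows = static_cast<int>(ids.size());
  const int num_segments = num_rows > 0 ? ids.back() + 1 : 0;
  std::vector<float> out(num_segments * inner_dim, 0.0f);
  for (int r = 0; r < num_rows; ++r) {
    for (int c = 0; c < inner_dim; ++c) {
      out[ids[r] * inner_dim + c] += data[r * inner_dim + c];
    }
  }
  return out;
}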
template struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFunctor { @@ -117,6 +229,15 @@ struct UnsortedSegmentSumFunctor: UnsortedSegmentBaseFuncto } }; +#define DEFINE_SORTED_GPU_SPECS_INDEX(T, Index) \ + template struct SegmentSumFunctor + +#define DEFINE_SORTED_GPU_SPECS(T) \ + DEFINE_SORTED_GPU_SPECS_INDEX(T, int32); \ + DEFINE_SORTED_GPU_SPECS_INDEX(T, int64); + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_SORTED_GPU_SPECS); + #define DEFINE_GPU_SPECS_INDEX(T, Index) \ template struct UnsortedSegmentSumFunctor diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc index a305598fe2b..96eaa4ac75b 100644 --- a/tensorflow/core/kernels/shape_op_test.cc +++ b/tensorflow/core/kernels/shape_op_test.cc @@ -101,7 +101,7 @@ TEST_F(ShapeOpTest, Simple) { Tensor variant_tensor(DT_VARIANT, TensorShape({1})); Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs); EXPECT_FALSE(s.ok()); - ExpectHasError(s, "Shape of non-scalar Variant not supported."); + ExpectHasError(s, "Shape of non-unary Variant not supported."); } { diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h index 0c39d46aeaf..ac607f4e8b8 100644 --- a/tensorflow/core/kernels/shape_ops.h +++ b/tensorflow/core/kernels/shape_ops.h @@ -35,7 +35,7 @@ inline Status GetRegularOrVariantShape(OpKernelContext* ctx, int input_index, if (ctx->input_dtype(0) == DT_VARIANT) { if (inp.dims() != 0) { return errors::InvalidArgument( - "Shape of non-scalar Variant not supported."); + "Shape of non-unary Variant not supported."); } TF_RETURN_IF_ERROR(GetUnaryVariantShape(inp, shape)); } else { diff --git a/tensorflow/core/kernels/summary_interface.cc b/tensorflow/core/kernels/summary_interface.cc new file mode 100644 index 00000000000..19e0f702f9f --- /dev/null +++ b/tensorflow/core/kernels/summary_interface.cc @@ -0,0 +1,432 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/summary_interface.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/png/png_io.h" +#include "tensorflow/core/lib/wav/wav_io.h" +#include "tensorflow/core/util/event.pb.h" +#include "tensorflow/core/util/events_writer.h" + +namespace tensorflow { +namespace { +template +Status TensorValueAt(Tensor t, int index, T* out) { + switch (t.dtype()) { + case DT_FLOAT: + *out = t.flat()(index); + break; + case DT_DOUBLE: + *out = t.flat()(index); + break; + case DT_HALF: + *out = T(t.flat()(index)); + break; + case DT_INT32: + *out = t.flat()(index); + break; + case DT_UINT8: + *out = t.flat()(index); + break; + case DT_INT16: + *out = t.flat()(index); + break; + case DT_INT8: + *out = t.flat()(index); + break; + case DT_BOOL: + *out = t.flat()(index); + break; + case DT_INT64: + *out = t.flat()(index); + break; + default: + return errors::Unimplemented("Scalar summary for dtype ", + DataTypeString(t.dtype()), + " is not supported."); + } + return Status::OK(); +} + +typedef Eigen::Tensor Uint8Image; + +// Add the sequence of images specified by ith_image to the summary. +// +// Factoring this loop out into a helper function lets ith_image behave +// differently in the float and uint8 cases: the float case needs a temporary +// buffer which can be shared across calls to ith_image, but the uint8 case +// does not. +Status AddImages(const string& tag, int max_images, int batch_size, int w, + int h, int depth, + const std::function& ith_image, Summary* s) { + const int N = std::min(max_images, batch_size); + for (int i = 0; i < N; ++i) { + Summary::Value* v = s->add_value(); + // The tag depends on the number of requested images (not the number + // produced.) + // + // Note that later on avisu uses "/" to figure out a consistent naming + // convention for display, so we append "/image" to guarantee that the + // image(s) won't be displayed in the global scope with no name. + if (max_images > 1) { + v->set_tag(strings::StrCat(tag, "/image/", i)); + } else { + v->set_tag(strings::StrCat(tag, "/image")); + } + + auto image = ith_image(i); + Summary::Image* si = v->mutable_image(); + si->set_height(h); + si->set_width(w); + si->set_colorspace(depth); + const int channel_bits = 8; + const int compression = -1; // Use zlib default + if (!png::WriteImageToBuffer(image.data(), w, h, w * depth, depth, + channel_bits, compression, + si->mutable_encoded_image_string(), nullptr)) { + return errors::Internal("PNG encoding failed"); + } + } + return Status::OK(); +} + +template +void NormalizeFloatImage(int hw, int depth, + typename TTypes::ConstMatrix values, + typename TTypes::ConstVec bad_color, + Uint8Image* image) { + if (!image->size()) return; // Nothing to do for empty images + + // Rescale the image to uint8 range. + // + // We are trying to generate an RGB image from a float/half tensor. We do + // not have any info about the expected range of values in the tensor + // but the generated image needs to have all RGB values within [0, 255]. + // + // We use two different algorithms to generate these values. 
If the + // tensor has only positive values we scale them all by 255/max(values). + // If the tensor has both negative and positive values we scale them by + // the max of their absolute values and center them around 127. + // + // This works for most cases, but does not respect the relative dynamic + // range across different instances of the tensor. + + // Compute min and max ignoring nonfinite pixels + float image_min = std::numeric_limits::infinity(); + float image_max = -image_min; + for (int i = 0; i < hw; i++) { + bool finite = true; + for (int j = 0; j < depth; j++) { + if (!Eigen::numext::isfinite(values(i, j))) { + finite = false; + break; + } + } + if (finite) { + for (int j = 0; j < depth; j++) { + float value(values(i, j)); + image_min = std::min(image_min, value); + image_max = std::max(image_max, value); + } + } + } + + // Pick an affine transform into uint8 + const float kZeroThreshold = 1e-6; + T scale, offset; + if (image_min < 0) { + float max_val = std::max(std::abs(image_min), std::abs(image_max)); + scale = T(max_val < kZeroThreshold ? 0.0f : 127.0f / max_val); + offset = T(128.0f); + } else { + scale = T(image_max < kZeroThreshold ? 0.0f : 255.0f / image_max); + offset = T(0.0f); + } + + // Transform image, turning nonfinite values to bad_color + for (int i = 0; i < hw; i++) { + bool finite = true; + for (int j = 0; j < depth; j++) { + if (!Eigen::numext::isfinite(values(i, j))) { + finite = false; + break; + } + } + if (finite) { + image->chip<0>(i) = + (values.template chip<0>(i) * scale + offset).template cast(); + } else { + image->chip<0>(i) = bad_color; + } + } +} + +template +Status NormalizeAndAddImages(const Tensor& tensor, int max_images, int h, int w, + int hw, int depth, int batch_size, + const string& base_tag, Tensor bad_color_tensor, + Summary* s) { + // For float and half images, nans and infs are replaced with bad_color. + if (bad_color_tensor.dim_size(0) < depth) { + return errors::InvalidArgument( + "expected depth <= bad_color.size, got depth = ", depth, + ", bad_color.size = ", bad_color_tensor.dim_size(0)); + } + auto bad_color_full = bad_color_tensor.vec(); + typename TTypes::ConstVec bad_color(bad_color_full.data(), depth); + + // Float images must be scaled and translated. 
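The scale/offset selection described above can be restated as a small standalone helper (an illustrative restatement, not code from the patch): all-positive images are mapped onto [0, 255], mixed-sign images are scaled by the larger absolute extreme and centered at 128, and near-zero ranges collapse to a zero scale.

#include <algorithm>
#include <cmath>

void PickAffineTransform(float image_min, float image_max, float* scale,
                         float* offset) {
  const float kZeroThreshold = 1e-6f;
  if (image_min < 0.0f) {
    const float max_val = std::max(std::abs(image_min), std::abs(image_max));
    *scale = max_val < kZeroThreshold ? 0.0f : 127.0f / max_val;
    *offset = 128.0f;
  } else {
    *scale = image_max < kZeroThreshold ? 0.0f : 255.0f / image_max;
    *offset = 0.0f;
  }
}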
+ Uint8Image image(hw, depth); + auto ith_image = [&tensor, &image, bad_color, batch_size, hw, depth](int i) { + auto tensor_eigen = tensor.template shaped({batch_size, hw, depth}); + typename TTypes::ConstMatrix values( + &tensor_eigen(i, 0, 0), Eigen::DSizes(hw, depth)); + NormalizeFloatImage(hw, depth, values, bad_color, &image); + return image; + }; + return AddImages(base_tag, max_images, batch_size, w, h, depth, ith_image, s); +} + +} // namespace + +class SummaryWriterImpl : public SummaryWriterInterface { + public: + SummaryWriterImpl(int max_queue, int flush_millis) + : SummaryWriterInterface(), + max_queue_(max_queue), + flush_millis_(flush_millis) {} + + Status Initialize(const string& logdir, const string& filename_suffix, + Env* env) { + Status is_dir = env->IsDirectory(logdir); + if (!is_dir.ok()) { + if (is_dir.code() != tensorflow::error::NOT_FOUND) { + return is_dir; + } + TF_RETURN_IF_ERROR(env->CreateDir(logdir)); + } + mutex_lock ml(mu_); + events_writer_ = + xla::MakeUnique(io::JoinPath(logdir, "events")); + if (!events_writer_->InitWithSuffix(filename_suffix)) { + return errors::Unknown("Could not initialize events writer."); + } + last_flush_ = Env::Default()->NowMicros(); + return Status::OK(); + } + + Status Flush() override { + mutex_lock ml(mu_); + return InternalFlush(); + } + + ~SummaryWriterImpl() override { + (void)Flush(); // Ignore errors. + } + + Status WriteTensor(int64 global_step, Tensor t, const string& tag, + const string& serialized_metadata) override { + Summary s; + Summary::Value* v = s.add_value(); + t.AsProtoTensorContent(v->mutable_tensor()); + v->set_tag(tag); + v->mutable_metadata()->ParseFromString(serialized_metadata); + return Enqueue(global_step, s); + } + + Status WriteScalar(int64 global_step, Tensor t, const string& tag) override { + Summary s; + Summary::Value* v = s.add_value(); + v->set_tag(tag); + float value; + TF_RETURN_IF_ERROR(TensorValueAt(t, 0, &value)); + v->set_simple_value(value); + return Enqueue(global_step, s); + } + + Status WriteHistogram(int64 global_step, Tensor t, + const string& tag) override { + Summary s; + Summary::Value* v = s.add_value(); + v->set_tag(tag); + histogram::Histogram histo; + for (int64 i = 0; i < t.NumElements(); i++) { + double double_val; + TF_RETURN_IF_ERROR(TensorValueAt(t, i, &double_val)); + if (Eigen::numext::isnan(double_val)) { + return errors::InvalidArgument("Nan in summary histogram for: ", tag); + } else if (Eigen::numext::isinf(double_val)) { + return errors::InvalidArgument("Infinity in summary histogram for: ", + tag); + } + histo.Add(double_val); + } + + histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */); + return Enqueue(global_step, s); + } + + Status WriteImage(int64 global_step, Tensor tensor, const string& tag, + int max_images, Tensor bad_color) override { + if (!(tensor.dims() == 4 && + (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || + tensor.dim_size(3) == 4))) { + return errors::InvalidArgument( + "Tensor must be 4-D with last dim 1, 3, or 4, not ", + tensor.shape().DebugString()); + } + if (!(tensor.dim_size(0) < (1LL << 31) && + tensor.dim_size(1) < (1LL << 31) && + tensor.dim_size(2) < (1LL << 31) && + (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29))) { + return errors::InvalidArgument("Tensor too large for summary ", + tensor.shape().DebugString()); + } + Summary s; + // The casts and h * w cannot overflow because of the limits above. 
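Callers reach this writer through CreateSummaryWriter; a typical sequence, mirroring summary_interface_test.cc later in this change, is sketched below (the log directory, filename suffix, and tag are placeholder values):

Status WriteOneScalar() {
  SummaryWriterInterface* writer = nullptr;
  TF_RETURN_IF_ERROR(CreateSummaryWriter(/*max_queue=*/1, /*flush_millis=*/1,
                                         "/tmp/logdir", ".demo",
                                         Env::Default(), &writer));
  core::ScopedUnref unref(writer);

  Tensor one(DT_FLOAT, TensorShape({}));
  one.scalar<float>()() = 1.0;
  TF_RETURN_IF_ERROR(writer->WriteScalar(/*global_step=*/2, one, "loss"));
  return writer->Flush();
}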
+ const int batch_size = static_cast(tensor.dim_size(0)); + const int h = static_cast(tensor.dim_size(1)); + const int w = static_cast(tensor.dim_size(2)); + const int hw = h * w; // Compact these two dims for simplicity + const int depth = static_cast(tensor.dim_size(3)); + if (tensor.dtype() == DT_UINT8) { + // For uint8 input, no normalization is necessary + auto ith_image = [&tensor, batch_size, hw, depth](int i) { + auto values = tensor.shaped({batch_size, hw, depth}); + return typename TTypes::ConstMatrix( + &values(i, 0, 0), Eigen::DSizes(hw, depth)); + }; + TF_RETURN_IF_ERROR( + AddImages(tag, max_images, batch_size, w, h, depth, ith_image, &s)); + } else if (tensor.dtype() == DT_HALF) { + TF_RETURN_IF_ERROR(NormalizeAndAddImages( + tensor, max_images, h, w, hw, depth, batch_size, tag, bad_color, &s)); + } else if (tensor.dtype() == DT_FLOAT) { + TF_RETURN_IF_ERROR(NormalizeAndAddImages( + tensor, max_images, h, w, hw, depth, batch_size, tag, bad_color, &s)); + } else { + return errors::InvalidArgument( + "Only DT_INT8, DT_HALF, and DT_FLOAT images are supported. Got ", + DataTypeString(tensor.dtype())); + } + + return Enqueue(global_step, s); + } + + Status WriteAudio(int64 global_step, Tensor tensor, const string& tag, + int max_outputs, float sample_rate) override { + if (sample_rate <= 0.0f) { + return errors::InvalidArgument("sample_rate must be > 0"); + } + const int batch_size = tensor.dim_size(0); + const int64 length_frames = tensor.dim_size(1); + const int64 num_channels = + tensor.dims() == 2 ? 1 : tensor.dim_size(tensor.dims() - 1); + Summary s; + const int N = std::min(max_outputs, batch_size); + for (int i = 0; i < N; ++i) { + Summary::Value* v = s.add_value(); + if (max_outputs > 1) { + v->set_tag(strings::StrCat(tag, "/audio/", i)); + } else { + v->set_tag(strings::StrCat(tag, "/audio")); + } + + Summary::Audio* sa = v->mutable_audio(); + sa->set_sample_rate(sample_rate); + sa->set_num_channels(num_channels); + sa->set_length_frames(length_frames); + sa->set_content_type("audio/wav"); + + auto values = + tensor.shaped({batch_size, length_frames, num_channels}); + auto channels_by_frames = typename TTypes::ConstMatrix( + &values(i, 0, 0), + Eigen::DSizes(length_frames, num_channels)); + size_t sample_rate_truncated = lrintf(sample_rate); + if (sample_rate_truncated == 0) { + sample_rate_truncated = 1; + } + TF_RETURN_IF_ERROR(wav::EncodeAudioAsS16LEWav( + channels_by_frames.data(), sample_rate_truncated, num_channels, + length_frames, sa->mutable_encoded_audio_string())); + } + + return Enqueue(global_step, s); + } + + string DebugString() override { return "SummaryWriterImpl"; } + + private: + Status Enqueue(int64 global_step, const Summary& summary) { + mutex_lock ml(mu_); + queue_.emplace_back(global_step, summary, Env::Default()->NowMicros()); + if (queue_.size() >= max_queue_ || + Env::Default()->NowMicros() - last_flush_ > 1000 * flush_millis_) { + return InternalFlush(); + } + return Status::OK(); + } + + Status InternalFlush() EXCLUSIVE_LOCKS_REQUIRED(mu_) { + for (const EventInfo& e : queue_) { + Event event; + event.set_step(std::get<0>(e)); + *event.mutable_summary() = std::get<1>(e); + event.set_wall_time(std::get<2>(e)); + events_writer_->WriteEvent(event); + } + queue_.clear(); + if (!events_writer_->Flush()) { + return errors::InvalidArgument("Could not flush events file."); + } + last_flush_ = Env::Default()->NowMicros(); + return Status::OK(); + } + + const int max_queue_; + const int flush_millis_; + uint64 last_flush_; + using EventInfo = 
std::tuple; + mutex mu_; + std::vector queue_ GUARDED_BY(mu_); + // A pointer to allow deferred construction. + std::unique_ptr events_writer_ GUARDED_BY(mu_); + std::vector> registered_summaries_ + GUARDED_BY(mu_); +}; + +Status CreateSummaryWriter(int max_queue, int flush_millis, + const string& logdir, const string& filename_suffix, + Env* env, SummaryWriterInterface** result) { + SummaryWriterImpl* w = new SummaryWriterImpl(max_queue, flush_millis); + Status s = w->Initialize(logdir, filename_suffix, env); + if (!s.ok()) { + w->Unref(); + *result = nullptr; + return s; + } + *result = w; + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/summary_interface.h b/tensorflow/core/kernels/summary_interface.h new file mode 100644 index 00000000000..ae2fbb70fe3 --- /dev/null +++ b/tensorflow/core/kernels/summary_interface.h @@ -0,0 +1,59 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ +#define TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ + + +#include "tensorflow/core/framework/resource_mgr.h" + +namespace tensorflow { + +// Main interface for the summary writer resource. +class SummaryWriterInterface : public ResourceBase { + public: + virtual ~SummaryWriterInterface() override {} + + // Flushes all unwritten messages in the queue. + virtual Status Flush() = 0; + + // These are called in the OpKernel::Compute methods for the summary ops. + virtual Status WriteTensor(int64 global_step, Tensor t, const string& tag, + const string& serialized_metadata) = 0; + + virtual Status WriteScalar(int64 global_step, Tensor t, + const string& tag) = 0; + + virtual Status WriteHistogram(int64 global_step, Tensor t, + const string& tag) = 0; + + virtual Status WriteImage(int64 global_step, Tensor t, const string& tag, + int max_images, Tensor bad_color) = 0; + + virtual Status WriteAudio(int64 global_step, Tensor t, const string& tag, + int max_outputs_, float sample_rate) = 0; +}; + +// Creates a SummaryWriterInterface instance which writes to a file. It will +// enqueue up to max_queue summaries, and flush at least every flush_millis +// milliseconds. The summaries will be written to the directory specified by +// logdir and with the filename suffixed by filename_suffix. The caller owns a +// reference to result if the returned status is ok. +Status CreateSummaryWriter(int max_queue, int flush_millis, + const string& logdir, const string& filename_suffix, + Env* env, SummaryWriterInterface** result); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_ diff --git a/tensorflow/core/kernels/summary_interface_test.cc b/tensorflow/core/kernels/summary_interface_test.cc new file mode 100644 index 00000000000..0e24e8122a0 --- /dev/null +++ b/tensorflow/core/kernels/summary_interface_test.cc @@ -0,0 +1,170 @@ +/* Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/kernels/summary_interface.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/io/record_reader.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/event.pb.h" + +namespace tensorflow { +namespace { + +Status SummaryTestHelper( + const string& test_name, + std::function writer_fn, + std::function test_fn) { + static std::set* tests = new std::set(); + CHECK(tests->insert(test_name).second) << ": " << test_name; + + SummaryWriterInterface* writer; + Env* env = Env::Default(); + TF_CHECK_OK( + CreateSummaryWriter(1, 1, testing::TmpDir(), test_name, env, &writer)); + core::ScopedUnref deleter(writer); + + TF_CHECK_OK(writer_fn(writer)); + TF_CHECK_OK(writer->Flush()); + + std::vector files; + TF_CHECK_OK(env->GetChildren(testing::TmpDir(), &files)); + bool found = false; + for (const string& f : files) { + if (StringPiece(f).contains(test_name)) { + if (found) { + return errors::Unknown("Found more than one file for ", test_name); + } + found = true; + std::unique_ptr read_file; + TF_CHECK_OK(env->NewRandomAccessFile(io::JoinPath(testing::TmpDir(), f), + &read_file)); + io::RecordReader reader(read_file.get(), io::RecordReaderOptions()); + string record; + uint64 offset = 0; + TF_CHECK_OK(reader.ReadRecord(&offset, + &record)); // The first event is irrelevant + TF_CHECK_OK(reader.ReadRecord(&offset, &record)); + Event e; + e.ParseFromString(record); + test_fn(e); + } + } + if (!found) { + return errors::Unknown("Found no file for ", test_name); + } + return Status::OK(); +} + +TEST(SummaryInterfaceTest, WriteTensor) { + TF_CHECK_OK(SummaryTestHelper("tensor_test", + [](SummaryWriterInterface* writer) { + Tensor one(DT_FLOAT, TensorShape({})); + one.scalar()() = 1.0; + TF_RETURN_IF_ERROR(writer->WriteTensor( + 2, one, "name", + SummaryMetadata().SerializeAsString())); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name"); + })); +} + +TEST(SummaryInterfaceTest, WriteScalar) { + TF_CHECK_OK(SummaryTestHelper( + "scalar_test", + [](SummaryWriterInterface* writer) { + Tensor one(DT_FLOAT, TensorShape({})); + one.scalar()() = 1.0; + TF_RETURN_IF_ERROR(writer->WriteScalar(2, one, "name")); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name"); + EXPECT_EQ(e.summary().value(0).simple_value(), 1.0); + })); +} + +TEST(SummaryInterfaceTest, WriteHistogram) { + TF_CHECK_OK(SummaryTestHelper("hist_test", + 
[](SummaryWriterInterface* writer) { + Tensor one(DT_FLOAT, TensorShape({})); + one.scalar()() = 1.0; + TF_RETURN_IF_ERROR( + writer->WriteHistogram(2, one, "name")); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name"); + EXPECT_TRUE(e.summary().value(0).has_histo()); + })); +} + +TEST(SummaryInterfaceTest, WriteImage) { + TF_CHECK_OK(SummaryTestHelper( + "image_test", + [](SummaryWriterInterface* writer) { + Tensor one(DT_UINT8, TensorShape({1, 1, 1, 1})); + one.scalar()() = 1; + TF_RETURN_IF_ERROR(writer->WriteImage(2, one, "name", 1, Tensor())); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name/image"); + CHECK(e.summary().value(0).has_image()); + EXPECT_EQ(e.summary().value(0).image().height(), 1); + EXPECT_EQ(e.summary().value(0).image().width(), 1); + EXPECT_EQ(e.summary().value(0).image().colorspace(), 1); + })); +} + +TEST(SummaryInterfaceTest, WriteAudio) { + TF_CHECK_OK(SummaryTestHelper( + "audio_test", + [](SummaryWriterInterface* writer) { + Tensor one(DT_FLOAT, TensorShape({1, 1})); + one.scalar()() = 1.0; + TF_RETURN_IF_ERROR(writer->WriteAudio(2, one, "name", 1, 1)); + TF_RETURN_IF_ERROR(writer->Flush()); + return Status::OK(); + }, + [](const Event& e) { + EXPECT_EQ(e.step(), 2); + CHECK_EQ(e.summary().value_size(), 1); + EXPECT_EQ(e.summary().value(0).tag(), "name/audio"); + CHECK(e.summary().value(0).has_audio()); + })); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc new file mode 100644 index 00000000000..d0eca0f1e7f --- /dev/null +++ b/tensorflow/core/kernels/summary_kernels.cc @@ -0,0 +1,226 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
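The tests above drive the new writer through `SummaryTestHelper`; outside the test harness the same interface is used directly. A minimal sketch, assuming only the declarations in `summary_interface.h` above (the `WriteOneScalar` helper, the queue/flush values, and the `".v2"` suffix are invented for the example):

```cpp
// Illustrative only: create a file-backed writer, write one scalar event at
// step 1, and flush. Error handling is reduced to TF_CHECK_OK for brevity.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/summary_interface.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env.h"

namespace tensorflow {

void WriteOneScalar(const string& logdir) {
  SummaryWriterInterface* writer = nullptr;
  // Queue at most 8 pending events and flush at least every 1000 ms.
  TF_CHECK_OK(CreateSummaryWriter(/*max_queue=*/8, /*flush_millis=*/1000,
                                  logdir, /*filename_suffix=*/".v2",
                                  Env::Default(), &writer));
  core::ScopedUnref unref(writer);  // the caller owns one reference

  Tensor value(DT_FLOAT, TensorShape({}));
  value.scalar<float>()() = 0.5f;
  TF_CHECK_OK(writer->WriteScalar(/*global_step=*/1, value, "loss"));
  TF_CHECK_OK(writer->Flush());
}

}  // namespace tensorflow
```

As in the tests, `core::ScopedUnref` releases the reference that `CreateSummaryWriter` hands to the caller once the writer is no longer needed.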
+==============================================================================*/ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/kernels/summary_interface.h" + +namespace tensorflow { + +REGISTER_KERNEL_BUILDER(Name("SummaryWriter").Device(DEVICE_CPU), + ResourceHandleOp); + +class CreateSummaryFileWriterOp : public OpKernel { + public: + explicit CreateSummaryFileWriterOp(OpKernelConstruction* ctx) + : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("logdir", &tmp)); + const string logdir = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("max_queue", &tmp)); + const int32 max_queue = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("flush_millis", &tmp)); + const int32 flush_millis = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("filename_suffix", &tmp)); + const string filename_suffix = tmp->scalar()(); + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, CreateSummaryWriter(max_queue, flush_millis, logdir, + filename_suffix, ctx->env(), &s)); + Status status = CreateResource(ctx, HandleFromInput(ctx, 0), s); + if (!status.ok()) { + s->Unref(); + ctx->SetStatus(status); + return; + } + } +}; +REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU), + CreateSummaryFileWriterOp); + +class FlushSummaryWriterOp : public OpKernel { + public: + explicit FlushSummaryWriterOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + OP_REQUIRES_OK(ctx, s->Flush()); + } +}; +REGISTER_KERNEL_BUILDER(Name("FlushSummaryWriter").Device(DEVICE_CPU), + FlushSummaryWriterOp); + +class CloseSummaryWriterOp : public OpKernel { + public: + explicit CloseSummaryWriterOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + OP_REQUIRES_OK(ctx, DeleteResource( + ctx, HandleFromInput(ctx, 0))); + } +}; +REGISTER_KERNEL_BUILDER(Name("CloseSummaryWriter").Device(DEVICE_CPU), + CloseSummaryWriterOp); + +class WriteSummaryOp : public OpKernel { + public: + explicit WriteSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("summary_metadata", &tmp)); + const string& serialized_metadata = tmp->scalar()(); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("tensor", &t)); + + OP_REQUIRES_OK(ctx, + s->WriteTensor(global_step, *t, tag, serialized_metadata)); + } +}; +REGISTER_KERNEL_BUILDER(Name("WriteSummary").Device(DEVICE_CPU), + WriteSummaryOp); + +class WriteScalarSummaryOp : public OpKernel { + public: + explicit WriteScalarSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, 
ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("value", &t)); + + OP_REQUIRES_OK(ctx, s->WriteScalar(global_step, *t, tag)); + } +}; +REGISTER_KERNEL_BUILDER(Name("WriteScalarSummary").Device(DEVICE_CPU), + WriteScalarSummaryOp); + +class WriteHistogramSummaryOp : public OpKernel { + public: + explicit WriteHistogramSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("values", &t)); + + OP_REQUIRES_OK(ctx, s->WriteHistogram(global_step, *t, tag)); + } +}; +REGISTER_KERNEL_BUILDER(Name("WriteHistogramSummary").Device(DEVICE_CPU), + WriteHistogramSummaryOp); + +class WriteImageSummaryOp : public OpKernel { + public: + explicit WriteImageSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + int64 max_images_tmp; + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_images", &max_images_tmp)); + OP_REQUIRES(ctx, max_images_tmp < (1LL << 31), + errors::InvalidArgument("max_images must be < 2^31")); + max_images_ = static_cast(max_images_tmp); + } + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + const Tensor* bad_color; + OP_REQUIRES_OK(ctx, ctx->input("bad_color", &bad_color)); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(bad_color->shape()), + errors::InvalidArgument("bad_color must be a vector, got shape ", + bad_color->shape().DebugString())); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("tensor", &t)); + + OP_REQUIRES_OK( + ctx, s->WriteImage(global_step, *t, tag, max_images_, *bad_color)); + } + + private: + int32 max_images_; +}; +REGISTER_KERNEL_BUILDER(Name("WriteImageSummary").Device(DEVICE_CPU), + WriteImageSummaryOp); + +class WriteAudioSummaryOp : public OpKernel { + public: + explicit WriteAudioSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_outputs", &max_outputs_)); + OP_REQUIRES(ctx, max_outputs_ > 0, + errors::InvalidArgument("max_outputs must be > 0")); + } + + void Compute(OpKernelContext* ctx) override { + SummaryWriterInterface* s; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s)); + core::ScopedUnref unref(s); + const Tensor* tmp; + OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp)); + const int64 global_step = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp)); + const string& tag = tmp->scalar()(); + OP_REQUIRES_OK(ctx, ctx->input("sample_rate", &tmp)); + const float sample_rate = tmp->scalar()(); + + const Tensor* t; + OP_REQUIRES_OK(ctx, ctx->input("tensor", &t)); + + OP_REQUIRES_OK( + ctx, s->WriteAudio(global_step, *t, tag, max_outputs_, sample_rate)); + } + + private: + int max_outputs_; + bool has_sample_rate_attr_; + float sample_rate_attr_; +}; +REGISTER_KERNEL_BUILDER(Name("WriteAudioSummary").Device(DEVICE_CPU), + 
WriteAudioSummaryOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index b14e5551039..36b8ff09d73 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -83,7 +83,6 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL); IsVariableInitializedOp); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); -TF_CALL_bool(REGISTER_GPU_KERNELS) #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc index 258793aa1e6..3c7e5ca696d 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem.cc +++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc @@ -70,13 +70,24 @@ class FewerArgsForCompiler { int stride_; }; +// Check whether the crop window is valid, assuming crop is true. +bool IsCropWindowValid(const UncompressFlags& flags, int input_image_width, + int input_image_height) { + // Crop window is valid only if it is non zero and all the window region is + // within the original image. + return flags.crop_width > 0 && flags.crop_height > 0 && flags.crop_x >= 0 && + flags.crop_y >= 0 && + flags.crop_y + flags.crop_height <= input_image_height && + flags.crop_x + flags.crop_width <= input_image_width; +} + uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { // unpack the argball const int datasize = argball->datasize_; const auto& flags = argball->flags_; const int ratio = flags.ratio; int components = flags.components; - int stride = flags.stride; // may be 0 + int stride = flags.stride; // may be 0 int64* const nwarn = argball->pnwarn_; // may be NULL // Can't decode if the ratio is not recognized by libjpeg @@ -159,8 +170,43 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { return nullptr; } + JDIMENSION target_output_width = cinfo.output_width; + JDIMENSION target_output_height = cinfo.output_height; + JDIMENSION skipped_scanlines = 0; +#if !defined(WIN32) + if (flags.crop) { + // Update target output height and width based on crop window. + target_output_height = flags.crop_height; + target_output_width = flags.crop_width; + + // So far, cinfo holds the original input image information. + if (!IsCropWindowValid(flags, cinfo.output_width, cinfo.output_height)) { + LOG(ERROR) << "Invalid crop window: x=" << flags.crop_x + << ", y=" << flags.crop_y << ", w=" << target_output_width + << ", h=" << target_output_height + << " for image_width: " << cinfo.output_width + << " and image_height: " << cinfo.output_height; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + // Update cinfo.output_width. It is tricky that cinfo.output_width must + // fall on an Minimum Coded Unit (MCU) boundary; if it doesn't, then it will + // be moved left to the nearest MCU boundary, and width will be increased + // accordingly. Therefore, the final cinfo.crop_width might differ from the + // given flags.crop_width. Please see libjpeg library for details. + JDIMENSION crop_width = flags.crop_width; + JDIMENSION crop_x = flags.crop_x; + jpeg_crop_scanline(&cinfo, &crop_x, &crop_width); + + // Update cinfo.output_scanline. 
+ skipped_scanlines = jpeg_skip_scanlines(&cinfo, flags.crop_y); + CHECK_EQ(skipped_scanlines, flags.crop_y); + } +#endif + // check for compatible stride - const int min_stride = cinfo.output_width * components * sizeof(JSAMPLE); + const int min_stride = target_output_width * components * sizeof(JSAMPLE); if (stride == 0) { stride = min_stride; } else if (stride < min_stride) { @@ -170,47 +216,88 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { } // Remember stride and height for use in Uncompress - argball->height_ = cinfo.output_height; + argball->height_ = target_output_height; argball->stride_ = stride; - uint8* const dstdata = argball->allocate_output_( - cinfo.output_width, cinfo.output_height, components); +#if defined(WIN32) + uint8* dstdata = nullptr; + if (flags.crop) { + dstdata = new JSAMPLE[stride * target_output_height]; + } else { + dstdata = argball->allocate_output_(target_output_width, + target_output_height, components); + } +#else + uint8* dstdata = argball->allocate_output_(target_output_width, + target_output_height, components); +#endif if (dstdata == nullptr) { jpeg_destroy_decompress(&cinfo); return nullptr; } JSAMPLE* output_line = static_cast(dstdata); - // Temporary buffer used for CMYK -> RGB conversion. + // jpeg_read_scanlines requires the buffers to be allocated based on + // cinfo.output_width, but the target image width might be different if crop + // is enabled and crop_width is not MCU aligned. In this case, we need to + // realign the scanline output to achieve the exact cropping. Notably, only + // cinfo.output_width needs to fall on MCU boundary, while cinfo.output_height + // has no such constraint. + const bool need_realign_cropped_scanline = + (target_output_width != cinfo.output_width); const bool use_cmyk = (cinfo.out_color_space == JCS_CMYK); - tempdata = use_cmyk ? new JSAMPLE[cinfo.output_width * 4] : nullptr; + + if (use_cmyk) { + // Temporary buffer used for CMYK -> RGB conversion. + tempdata = new JSAMPLE[cinfo.output_width * 4]; + } else if (need_realign_cropped_scanline) { + // Temporary buffer used for MCU-aligned scanline data. + tempdata = new JSAMPLE[cinfo.output_width * components]; + } // If there is an error reading a line, this aborts the reading. // Save the fraction of the image that has been read. - argball->height_read_ = cinfo.output_height; - while (cinfo.output_scanline < cinfo.output_height) { + argball->height_read_ = target_output_height; + + // These variables are just to avoid repeated computation in the loop. + const int max_scanlines_to_read = skipped_scanlines + target_output_height; + const int mcu_align_offset = + (cinfo.output_width - target_output_width) * (use_cmyk ? 4 : components); + while (cinfo.output_scanline < max_scanlines_to_read) { int num_lines_read = 0; - if (cinfo.out_color_space == JCS_CMYK) { + if (use_cmyk) { num_lines_read = jpeg_read_scanlines(&cinfo, &tempdata, 1); - // Convert CMYK to RGB - for (size_t i = 0; i < cinfo.output_width; ++i) { - int c = tempdata[4 * i + 0]; - int m = tempdata[4 * i + 1]; - int y = tempdata[4 * i + 2]; - int k = tempdata[4 * i + 3]; - int r, g, b; - if (cinfo.saw_Adobe_marker) { - r = (k * c) / 255; - g = (k * m) / 255; - b = (k * y) / 255; - } else { - r = (255 - k) * (255 - c) / 255; - g = (255 - k) * (255 - m) / 255; - b = (255 - k) * (255 - y) / 255; + if (num_lines_read > 0) { + // Convert CMYK to RGB if scanline read succeeded. 
+ for (size_t i = 0; i < target_output_width; ++i) { + int offset = 4 * i; + if (need_realign_cropped_scanline) { + // Align the offset for MCU boundary. + offset += mcu_align_offset; + } + const int c = tempdata[offset + 0]; + const int m = tempdata[offset + 1]; + const int y = tempdata[offset + 2]; + const int k = tempdata[offset + 3]; + int r, g, b; + if (cinfo.saw_Adobe_marker) { + r = (k * c) / 255; + g = (k * m) / 255; + b = (k * y) / 255; + } else { + r = (255 - k) * (255 - c) / 255; + g = (255 - k) * (255 - m) / 255; + b = (255 - k) * (255 - y) / 255; + } + output_line[3 * i + 0] = r; + output_line[3 * i + 1] = g; + output_line[3 * i + 2] = b; } - output_line[3 * i + 0] = r; - output_line[3 * i + 1] = g; - output_line[3 * i + 2] = b; + } + } else if (need_realign_cropped_scanline) { + num_lines_read = jpeg_read_scanlines(&cinfo, &tempdata, 1); + if (num_lines_read > 0) { + memcpy(output_line, tempdata + mcu_align_offset, min_stride); } } else { num_lines_read = jpeg_read_scanlines(&cinfo, &output_line, 1); @@ -218,12 +305,13 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { // Handle error cases if (num_lines_read == 0) { LOG(ERROR) << "Premature end of JPEG data. Stopped at line " - << cinfo.output_scanline << "/" << cinfo.output_height; + << cinfo.output_scanline - skipped_scanlines << "/" + << target_output_height; if (!flags.try_recover_truncated_jpeg) { - argball->height_read_ = cinfo.output_scanline; + argball->height_read_ = cinfo.output_scanline - skipped_scanlines; error = JPEGERRORS_UNEXPECTED_END_OF_DATA; } else { - for (size_t line = cinfo.output_scanline; line < cinfo.output_height; + for (size_t line = cinfo.output_scanline; line < max_scanlines_to_read; ++line) { if (line == 0) { // If even the first line is missing, fill with black color @@ -235,9 +323,9 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { output_line += stride; } argball->height_read_ = - cinfo.output_height; // consider all lines as read + target_output_height; // consider all lines as read // prevent error-on-exit in libjpeg: - cinfo.output_scanline = cinfo.output_height; + cinfo.output_scanline = max_scanlines_to_read; } break; } @@ -248,23 +336,33 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { delete[] tempdata; tempdata = nullptr; +#if !defined(WIN32) + if (flags.crop && cinfo.output_scanline < cinfo.output_height) { + // Skip the rest of scanlines, required by jpeg_destroy_decompress. + jpeg_skip_scanlines(&cinfo, + cinfo.output_height - flags.crop_y - flags.crop_height); + // After this, cinfo.output_height must be equal to cinfo.output_height; + // otherwise, jpeg_destroy_decompress would fail. + } +#endif + // Convert the RGB data to RGBA, with alpha set to 0xFF to indicate // opacity. // RGBRGBRGB... --> RGBARGBARGBA... if (components == 4) { // Start on the last line. JSAMPLE* scanlineptr = static_cast( - dstdata + static_cast(cinfo.output_height - 1) * stride); + dstdata + static_cast(target_output_height - 1) * stride); const JSAMPLE kOpaque = -1; // All ones appropriate for JSAMPLE. - const int right_rgb = (cinfo.output_width - 1) * 3; - const int right_rgba = (cinfo.output_width - 1) * 4; + const int right_rgb = (target_output_width - 1) * 3; + const int right_rgba = (target_output_width - 1) * 4; - for (int y = cinfo.output_height; y-- > 0;) { + for (int y = target_output_height; y-- > 0;) { // We do all the transformations in place, going backwards for each row. 
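The `mcu_align_offset` bookkeeping above is the subtle part of the cropped decode. A worked example with made-up numbers (a full-resolution 4:2:0 image, whose iMCU width is 16 pixels) may help; it is a standalone illustration, not code from this change:

```cpp
// jpeg_crop_scanline() rounds the requested x-offset down to the iMCU
// boundary and widens the window by the same amount, keeping the right edge
// fixed, so each decoded scanline carries a few extra leading pixels.
#include <cstdio>

int main() {
  const int imcu_width = 16;  // 8-pixel DCT blocks * max sampling factor of 2
  const int requested_crop_x = 21, requested_crop_width = 40, components = 3;

  const int aligned_crop_x = (requested_crop_x / imcu_width) * imcu_width;  // 16
  const int decoded_width =
      requested_crop_width + (requested_crop_x - aligned_crop_x);          // 45

  // Bytes to skip at the start of every decoded scanline when copying the
  // requested window into the output row (the role of mcu_align_offset).
  const int skip_bytes = (decoded_width - requested_crop_width) * components;  // 15
  std::printf("aligned_x=%d decoded_width=%d skip=%d bytes per row\n",
              aligned_crop_x, decoded_width, skip_bytes);
  return 0;
}
```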
const JSAMPLE* rgb_pixel = scanlineptr + right_rgb; JSAMPLE* rgba_pixel = scanlineptr + right_rgba; scanlineptr -= stride; - for (int x = cinfo.output_width; x-- > 0; + for (int x = target_output_width; x-- > 0; rgba_pixel -= 4, rgb_pixel -= 3) { // We copy the 3 bytes at rgb_pixel into the 4 bytes at rgba_pixel // The "a" channel is set to be opaque. @@ -319,8 +417,61 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) { LOG(ERROR) << "Unhandled case " << error; break; } - jpeg_destroy_decompress(&cinfo); +#if defined(WIN32) + // TODO(tanmingxing): delete all these code after migrating to libjpeg_turbo + // for Windows. + if (flags.crop) { + // Update target output height and width based on crop window. + target_output_height = flags.crop_height; + target_output_width = flags.crop_width; + + // cinfo holds the original input image information. + if (!IsCropWindowValid(flags, cinfo.output_width, cinfo.output_height)) { + LOG(ERROR) << "Invalid crop window: x=" << flags.crop_x + << ", y=" << flags.crop_y << ", w=" << target_output_width + << ", h=" << target_output_height + << " for image_width: " << cinfo.output_width + << " and image_height: " << cinfo.output_height; + delete[] dstdata; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + const uint8* full_image = dstdata; + dstdata = argball->allocate_output_(target_output_width, + target_output_height, components); + if (dstdata == nullptr) { + delete[] full_image; + jpeg_destroy_decompress(&cinfo); + return nullptr; + } + + const int full_image_stride = stride; + // Update stride and hight for crop window. + const int min_stride = target_output_width * components * sizeof(JSAMPLE); + if (flags.stride == 0) { + stride = min_stride; + } + argball->height_ = target_output_height; + argball->stride_ = stride; + + if (argball->height_read_ > target_output_height) { + argball->height_read_ = target_output_height; + } + const int crop_offset = flags.crop_x * components * sizeof(JSAMPLE); + const uint8* full_image_ptr = full_image + flags.crop_y * full_image_stride; + uint8* crop_image_ptr = dstdata; + for (int i = 0; i < argball->height_read_; i++) { + memcpy(crop_image_ptr, full_image_ptr + crop_offset, min_stride); + crop_image_ptr += stride; + full_image_ptr += full_image_stride; + } + delete[] full_image; + } +#endif + + jpeg_destroy_decompress(&cinfo); return dstdata; } diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.h b/tensorflow/core/lib/jpeg/jpeg_mem.h index ac34f29f221..59342d28c0f 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem.h +++ b/tensorflow/core/lib/jpeg/jpeg_mem.h @@ -61,6 +61,17 @@ struct UncompressFlags { // // Setting this has a quality/speed trade-off implication. J_DCT_METHOD dct_method = JDCT_DEFAULT; + + // Settings of crop window before decompression. + bool crop = false; + // Vertical coordinate of the top-left corner of the result in the input. + int crop_x = 0; + // Horizontal coordinate of the top-left corner of the result in the input. + int crop_y = 0; + // Width of the output image. + int crop_width = 0; + // Height of the output image. 
+ int crop_height = 0; }; // Uncompress some raw JPEG data given by the pointer srcdata and the length diff --git a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc index cc8646750e1..15266af1dbd 100644 --- a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc +++ b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc @@ -57,7 +57,7 @@ void ReadFileToStringOrDie(Env* env, const string& filename, string* output) { void TestJPEG(Env* env, const string& jpegfile) { // Read the data from the jpeg file into memory string jpeg; - ReadFileToStringOrDie(Env::Default(), jpegfile, &jpeg); + ReadFileToStringOrDie(env, jpegfile, &jpeg); const int fsize = jpeg.size(); const uint8* const temp = bit_cast(jpeg.data()); @@ -95,6 +95,194 @@ TEST(JpegMemTest, Jpeg) { TestJPEG(env, data_path + "jpeg_merge_test1_cmyk.jpg"); } +void TestCropAndDecodeJpeg(Env* env, const string& jpegfile, + const UncompressFlags& default_flags) { + // Read the data from the jpeg file into memory + string jpeg; + ReadFileToStringOrDie(env, jpegfile, &jpeg); + const int fsize = jpeg.size(); + auto temp = bit_cast(jpeg.data()); + + // Decode the whole image. + std::unique_ptr imgdata1; + int w1, h1, c1; + { + UncompressFlags flags = default_flags; + if (flags.stride == 0) { + imgdata1.reset(Uncompress(temp, fsize, flags, &w1, &h1, &c1, nullptr)); + } else { + // If stride is not zero, the default allocator would fail because it + // allocate w*h*c bytes, but the actual required bytes should be stride*h. + // Therefore, we provide a specialized allocator here. + uint8* buffer = nullptr; + imgdata1.reset(Uncompress(temp, fsize, flags, nullptr, + [&](int width, int height, int components) { + w1 = width; + h1 = height; + c1 = components; + buffer = new uint8[flags.stride * height]; + return buffer; + })); + } + ASSERT_NE(imgdata1, nullptr); + } + + auto check_crop_and_decode_func = [&](int crop_x, int crop_y, int crop_width, + int crop_height) { + std::unique_ptr imgdata2; + int w, h, c; + UncompressFlags flags = default_flags; + flags.crop = true; + flags.crop_x = crop_x; + flags.crop_y = crop_y; + flags.crop_width = crop_width; + flags.crop_height = crop_height; + if (flags.stride == 0) { + imgdata2.reset(Uncompress(temp, fsize, flags, &w, &h, &c, nullptr)); + } else { + uint8* buffer = nullptr; + imgdata2.reset(Uncompress(temp, fsize, flags, nullptr, + [&](int width, int height, int components) { + w = width; + h = height; + c = components; + buffer = new uint8[flags.stride * height]; + return buffer; + })); + } + ASSERT_NE(imgdata2, nullptr); + + ASSERT_EQ(w, crop_width); + ASSERT_EQ(h, crop_height); + ASSERT_EQ(c, c1); + + const int stride1 = (flags.stride != 0) ? flags.stride : w1 * c; + const int stride2 = (flags.stride != 0) ? flags.stride : w * c; + for (int i = 0; i < crop_height; i++) { + const uint8* p1 = &imgdata1[(i + crop_y) * stride1 + crop_x * c]; + const uint8* p2 = &imgdata2[i * stride2]; + + for (int j = 0; j < c * w; j++) { + ASSERT_EQ(p1[j], p2[j]) + << "p1 != p2 in [" << i << "][" << j / 3 << "][" << j % 3 << "]"; + } + } + }; + + // Check different crop windows. 
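The new header fields and the test helper above show the intended call pattern; condensed into a minimal sketch (the function name and the fixed 100×80 window are invented for the example, and the `tensorflow::jpeg` namespace is assumed from the rest of this library):

```cpp
// Decode only a crop window of a JPEG held in memory. Uncompress() returns
// nullptr if the window is invalid or decoding fails.
#include <memory>
#include "tensorflow/core/lib/jpeg/jpeg_mem.h"

namespace tensorflow {

std::unique_ptr<uint8[]> DecodeTopLeftCrop(const string& jpeg_bytes, int* w,
                                           int* h, int* c) {
  jpeg::UncompressFlags flags;
  flags.components = 3;    // decode to RGB
  flags.crop = true;       // enable cropped decoding
  flags.crop_x = 0;        // top-left corner of the window in the source image
  flags.crop_y = 0;
  flags.crop_width = 100;  // the window must lie fully inside the source image
  flags.crop_height = 80;

  std::unique_ptr<uint8[]> out(jpeg::Uncompress(
      jpeg_bytes.data(), jpeg_bytes.size(), flags, w, h, c,
      /*nwarn=*/nullptr));
  // On success, *w == flags.crop_width and *h == flags.crop_height.
  return out;
}

}  // namespace tensorflow
```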
+ check_crop_and_decode_func(0, 0, 5, 5); + check_crop_and_decode_func(0, 0, w1, 5); + check_crop_and_decode_func(0, 0, 5, h1); + check_crop_and_decode_func(0, 0, w1, h1); + check_crop_and_decode_func(w1 - 5, h1 - 6, 5, 6); + check_crop_and_decode_func(5, 6, 10, 15); +} + +TEST(JpegMemTest, CropAndDecodeJpeg) { + Env* env = Env::Default(); + const string data_path = kTestData; + UncompressFlags flags; + + // Test basic flags for jpeg and cmyk jpeg. + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1_cmyk.jpg", flags); +} + +TEST(JpegMemTest, CropAndDecodeJpegWithRatio) { + Env* env = Env::Default(); + const string data_path = kTestData; + UncompressFlags flags; + for (int ratio : {1, 2, 4, 8}) { + flags.ratio = ratio; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + } +} + +TEST(JpegMemTest, CropAndDecodeJpegWithComponents) { + Env* env = Env::Default(); + const string data_path = kTestData; + UncompressFlags flags; + for (const int components : {0, 1, 3}) { + flags.components = components; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + } +} + +TEST(JpegMemTest, CropAndDecodeJpegWithUpScaling) { + Env* env = Env::Default(); + const string data_path = kTestData; + UncompressFlags flags; + flags.fancy_upscaling = true; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); +} + +TEST(JpegMemTest, CropAndDecodeJpegWithStride) { + Env* env = Env::Default(); + const string data_path = kTestData; + + // Read the data from the jpeg file into memory + string jpeg; + ReadFileToStringOrDie(env, data_path + "jpeg_merge_test1.jpg", &jpeg); + const int fsize = jpeg.size(); + auto temp = bit_cast(jpeg.data()); + + int w, h, c; + ASSERT_TRUE(GetImageInfo(temp, fsize, &w, &h, &c)); + + // stride must be either 0 or > w*c; otherwise, uncompress would fail. + UncompressFlags flags; + flags.stride = w * c; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + flags.stride = w * c * 3; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); + flags.stride = w * c + 100; + TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags); +} + +void CheckInvalidCropWindowFailed(const uint8* const temp, int fsize, int x, + int y, int w, int h) { + std::unique_ptr imgdata; + int ww, hh, cc; + UncompressFlags flags; + flags.components = 3; + flags.crop = true; + flags.crop_x = x; + flags.crop_y = y; + flags.crop_width = w; + flags.crop_height = h; + imgdata.reset(Uncompress(temp, fsize, flags, &ww, &hh, &cc, nullptr)); + CHECK(imgdata == nullptr); +} + +TEST(JpegMemTest, CropAndDecodeJpegWithInvalidCropWindow) { + Env* env = Env::Default(); + const string data_path = kTestData; + + // Read the data from the jpeg file into memory + string jpeg; + ReadFileToStringOrDie(env, data_path + "jpeg_merge_test1.jpg", &jpeg); + const int fsize = jpeg.size(); + auto temp = bit_cast(jpeg.data()); + + int w, h, c; + ASSERT_TRUE(GetImageInfo(temp, fsize, &w, &h, &c)); + + // Width and height for the crop window must be non zero. + CheckInvalidCropWindowFailed(temp, fsize, 11, 11, /*w=*/0, 11); + CheckInvalidCropWindowFailed(temp, fsize, 11, 11, 11, /*h=*/0); + + // Crop window must be non negative. 
+ CheckInvalidCropWindowFailed(temp, fsize, /*x=*/-1, 11, 11, 11); + CheckInvalidCropWindowFailed(temp, fsize, 11, /*y=*/-1, 11, 11); + CheckInvalidCropWindowFailed(temp, fsize, 11, 11, /*w=*/-1, 11); + CheckInvalidCropWindowFailed(temp, fsize, 11, 11, 11, /*h=*/-1); + + // Invalid crop window width: x + crop_width = w + 1 > w + CheckInvalidCropWindowFailed(temp, fsize, /*x=*/w - 10, 11, 11, 11); + // Invalid crop window height: y + crop_height= h + 1 > h + CheckInvalidCropWindowFailed(temp, fsize, 11, /*y=*/h - 10, 11, 11); +} + TEST(JpegMemTest, Jpeg2) { // create known data, for size in_w x in_h const int in_w = 256; diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index ad290d123e5..22d4a0056f8 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -10467,8 +10467,8 @@ op { type_list_attr: "Treduce_func_other_arguments" } input_arg { - name: "window_size" - type: DT_INT64 + name: "window_size_func_other_arguments" + type_list_attr: "Twindow_size_func_other_arguments" } output_arg { name: "handle" @@ -10482,6 +10482,10 @@ op { name: "reduce_func" type: "func" } + attr { + name: "window_size_func" + type: "func" + } attr { name: "Tkey_func_other_arguments" type: "list(type)" @@ -10492,6 +10496,11 @@ op { type: "list(type)" has_minimum: true } + attr { + name: "Twindow_size_func_other_arguments" + type: "list(type)" + has_minimum: true + } attr { name: "output_types" type: "list(type)" diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index f6bd5768d7c..37d9a737e29 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -237,12 +237,15 @@ REGISTER_OP("GroupByWindowDataset") .Input("input_dataset: resource") .Input("key_func_other_arguments: Tkey_func_other_arguments") .Input("reduce_func_other_arguments: Treduce_func_other_arguments") - .Input("window_size: int64") + .Input( + "window_size_func_other_arguments: Twindow_size_func_other_arguments") .Output("handle: resource") .Attr("key_func: func") .Attr("reduce_func: func") + .Attr("window_size_func: func") .Attr("Tkey_func_other_arguments: list(type) >= 0") .Attr("Treduce_func_other_arguments: list(type) >= 0") + .Attr("Twindow_size_func_other_arguments: list(type) >= 0") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") .SetShapeFn(shape_inference::ScalarShape) diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 6ff05bd2a62..6eb05874aa5 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -499,7 +499,7 @@ Returns x + y element-wise. )doc"); REGISTER_OP("Sub") - .BINARY_FEWER() + .BINARY_MORE() .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn) .Doc(R"doc( Returns x - y element-wise. diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 0a96258dd1f..8a2d5e8c05a 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1945,7 +1945,7 @@ Computes softsign gradients for a softsign operation. gradients: The backpropagated gradients to the corresponding softsign operation. features: The features passed as input to the corresponding softsign operation. -backprops: The gradients: `gradients / (1 + abs(-features)) ** 2`. +backprops: The gradients: `gradients / (1 + abs(features)) ** 2`. 
)doc"); // -------------------------------------------------------------------------- @@ -2791,7 +2791,9 @@ REGISTER_OP("_MklConv2D") .Input("mkl_input: uint8") .Input("mkl_filter: uint8") .Output("output: T") + .Output("filter_output: T") .Output("mkl_output: uint8") + .Output("mkl_filter_output: uint8") .Attr("T: {half, float, double}") .Attr("strides: list(int)") .Attr("use_cudnn_on_gpu: bool = true") @@ -2813,7 +2815,9 @@ REGISTER_OP("_MklConv2DWithBias") .Input("mkl_filter: uint8") .Input("mkl_bias: uint8") .Output("output: T") + .Output("filter_output: T") .Output("mkl_output: uint8") + .Output("mkl_filter_output: uint8") .Attr("T: {half, float, double}") .Attr("strides: list(int)") .Attr("use_cudnn_on_gpu: bool = true") diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 13356e1d8a6..35c31c6cb81 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -9611,8 +9611,8 @@ op { type_list_attr: "Treduce_func_other_arguments" } input_arg { - name: "window_size" - type: DT_INT64 + name: "window_size_func_other_arguments" + type_list_attr: "Twindow_size_func_other_arguments" } output_arg { name: "handle" @@ -9627,6 +9627,10 @@ op { name: "reduce_func" type: "func" } + attr { + name: "window_size_func" + type: "func" + } attr { name: "Tkey_func_other_arguments" type: "list(type)" @@ -9637,6 +9641,11 @@ op { type: "list(type)" has_minimum: true } + attr { + name: "Twindow_size_func_other_arguments" + type: "list(type)" + has_minimum: true + } attr { name: "output_types" type: "list(type)" @@ -15795,6 +15804,25 @@ op { } summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor." } +op { + name: "SerializeTensor" + input_arg { + name: "tensor" + description: "A Tensor of type `T`." + type: "T" + } + output_arg { + name: "serialized" + description: "A serialized TensorProto proto of the input tensor." + type_attr: DT_STRING + } + attr { + name: "T" + type: "type" + description: "The type of the input tensor." + } + summary: "Transforms a Tensor into a serialized TensorProto proto." +} op { name: "Placeholder" output_arg { @@ -24677,7 +24705,7 @@ op { } output_arg { name: "backprops" - description: "The gradients: `gradients / (1 + abs(-features)) ** 2`." + description: "The gradients: `gradients / (1 + abs(features)) ** 2`." type_attr: "T" } attr { diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc index 2e605fdffcf..1f7ebe91cf0 100644 --- a/tensorflow/core/ops/parsing_ops.cc +++ b/tensorflow/core/ops/parsing_ops.cc @@ -292,6 +292,19 @@ out_type: The type of the serialized tensor. The provided type must match the output: A Tensor of type `out_type`. )doc"); +REGISTER_OP("SerializeTensor") + .Input("tensor: T") + .Output("serialized: string") + .Attr("T: type") + .SetShapeFn(shape_inference::ScalarShape) + .Doc(R"doc( +Transforms a Tensor into a serialized TensorProto proto. + +tensor: A Tensor of type `T`. +T: The type of the input tensor. +serialized: A serialized TensorProto proto of the input tensor. +)doc"); + REGISTER_OP("DecodeJSONExample") .Input("json_examples: string") .Output("binary_examples: string") diff --git a/tensorflow/core/ops/summary_ops.cc b/tensorflow/core/ops/summary_ops.cc new file mode 100644 index 00000000000..f778b487972 --- /dev/null +++ b/tensorflow/core/ops/summary_ops.cc @@ -0,0 +1,218 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); + +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +REGISTER_OP("SummaryWriter") + .Output("writer: resource") + .Attr("shared_name: string = ''") + .Attr("container: string = ''") + .SetShapeFn(shape_inference::ScalarShape) + .Doc(R"doc( +Returns a handle to be used to access a summary writer. + +The summary writer is an in-graph resource which can be used by ops to write +summaries to event files. + +writer: the summary writer resource. Scalar handle. +)doc"); + +REGISTER_OP("CreateSummaryFileWriter") + .Input("writer: resource") + .Input("logdir: string") + .Input("max_queue: int32") + .Input("flush_millis: int32") + .Input("filename_suffix: string") + .Doc(R"doc( +Creates a summary file writer accessible by the given resource handle. + +writer: A handle to the summary writer resource +logdir: Directory where the event file will be written. +max_queue: Size of the queue of pending events and summaries. +flush_millis: How often, in milliseconds, to flush the pending events and + summaries to disk. +filename_suffix: Every event file's name is suffixed with this suffix. +)doc"); + +REGISTER_OP("FlushSummaryWriter") + .Input("writer: resource") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"( +Flushes the writer's unwritten events. + +writer: A handle to the summary writer resource. +)"); + +REGISTER_OP("CloseSummaryWriter") + .Input("writer: resource") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"( +Flushes and closes the summary writer. + +Also removes it from the resource manager. To reopen, use another +CreateSummaryFileWriter op. + +writer: A handle to the summary writer resource. +)"); + +REGISTER_OP("WriteSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tensor: T") + .Input("tag: string") + .Input("summary_metadata: string") + .Attr("T: type") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Outputs a `Summary` protocol buffer with a tensor. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tensor: A tensor to serialize. +tag: The summary's tag. +summary_metadata: Serialized SummaryMetadata protocol buffer containing + plugin-related metadata for this summary. +)doc"); + +REGISTER_OP("WriteScalarSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tag: string") + .Input("value: T") + .Attr("T: realnumbertype") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Writes a `Summary` protocol buffer with scalar values. + +The input `tag` and `value` must have the scalars. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tag: Tag for the summary. +value: Value for the summary. 
+)doc"); + +REGISTER_OP("WriteHistogramSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tag: string") + .Input("values: T") + .Attr("T: realnumbertype = DT_FLOAT") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Writes a `Summary` protocol buffer with a histogram. + +The generated +[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +has one summary value containing a histogram for `values`. + +This op reports an `InvalidArgument` error if any value is not finite. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tag: Scalar. Tag to use for the `Summary.Value`. +values: Any shape. Values to use to build the histogram. +)doc"); + +REGISTER_OP("WriteImageSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tag: string") + .Input("tensor: T") + .Input("bad_color: uint8") + .Attr("max_images: int >= 1 = 3") + .Attr("T: {uint8, float, half} = DT_FLOAT") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Writes a `Summary` protocol buffer with images. + +The summary has up to `max_images` summary values containing images. The +images are built from `tensor` which must be 4-D with shape `[batch_size, +height, width, channels]` and where `channels` can be: + +* 1: `tensor` is interpreted as Grayscale. +* 3: `tensor` is interpreted as RGB. +* 4: `tensor` is interpreted as RGBA. + +The images have the same number of channels as the input tensor. For float +input, the values are normalized one image at a time to fit in the range +`[0, 255]`. `uint8` values are unchanged. The op uses two different +normalization algorithms: + +* If the input values are all positive, they are rescaled so the largest one + is 255. + +* If any input value is negative, the values are shifted so input value 0.0 + is at 127. They are then rescaled so that either the smallest value is 0, + or the largest one is 255. + +The `tag` argument is a scalar `Tensor` of type `string`. It is used to +build the `tag` of the summary values: + +* If `max_images` is 1, the summary value tag is '*tag*/image'. +* If `max_images` is greater than 1, the summary value tags are + generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. + +The `bad_color` argument is the color to use in the generated images for +non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +Each element must be in the range `[0, 255]` (It represents the value of a +pixel in the output image). Non-finite values in the input tensor are +replaced by this tensor in the output image. The default value is the color +red. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tag: Scalar. Used to build the `tag` attribute of the summary values. +tensor: 4-D of shape `[batch_size, height, width, channels]` where + `channels` is 1, 3, or 4. +max_images: Max number of batch elements to generate images for. +bad_color: Color to use for pixels with non-finite values. +)doc"); + +REGISTER_OP("WriteAudioSummary") + .Input("writer: resource") + .Input("global_step: int64") + .Input("tag: string") + .Input("tensor: float") + .Input("sample_rate: float") + .Attr("max_outputs: int >= 1 = 3") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Writes a `Summary` protocol buffer with audio. + +The summary has up to `max_outputs` summary values containing audio. 
The +audio is built from `tensor` which must be 3-D with shape `[batch_size, +frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are +assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. + +The `tag` argument is a scalar `Tensor` of type `string`. It is used to +build the `tag` of the summary values: + +* If `max_outputs` is 1, the summary value tag is '*tag*/audio'. +* If `max_outputs` is greater than 1, the summary value tags are + generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. + +writer: A handle to a summary writer. +global_step: The step to write the summary for. +tag: Scalar. Used to build the `tag` attribute of the summary values. +tensor: 2-D of shape `[batch_size, frames]`. +sample_rate: The sample rate of the signal in hertz. +max_outputs: Max number of batch elements to generate audio for. +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 126558cac38..e1ad66c387a 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -75,6 +75,9 @@ def tf_proto_library_py(name, srcs=[], protodeps=[], deps=[], visibility=[], def tf_jspb_proto_library(**kwargs): pass +def tf_nano_proto_library(**kwargs): + pass + def tf_proto_library(name, srcs = [], has_services = None, protodeps = [], visibility = [], testonly = 0, cc_libs = [], diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc index ac0988e7047..ebdd4b624aa 100644 --- a/tensorflow/core/platform/default/logging.cc +++ b/tensorflow/core/platform/default/logging.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/platform/default/logging.h" -#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/macros.h" @@ -25,12 +24,8 @@ limitations under the License. #endif #include -#include #include -#include -#include - namespace tensorflow { namespace internal { @@ -88,11 +83,11 @@ void LogMessage::GenerateLogMessage() { const size_t time_buffer_size = 30; char time_buffer[time_buffer_size]; strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S", - localtime(&now_seconds)); + localtime(&now_seconds)); // TODO(jeff,sanjay): Replace this with something that logs through the env. fprintf(stderr, "%s.%06d: %c %s:%d] %s\n", time_buffer, micros_remainder, - "IWEF"[severity_], fname_, line_, str().c_str()); + "IWEF"[severity_], fname_, line_, str().c_str()); } #endif @@ -129,48 +124,6 @@ int64 MinVLogLevelFromEnv() { return LogLevelStrToInt(tf_env_var_val); } -using VmoduleMap = std::unordered_map; - -// Returns a mapping from module name to VLOG level, derived from the -// TF_CPP_VMOUDLE environment variable; ownership is transferred to the caller. -VmoduleMap* VmoduleRecordsFromEnv() { - // The value of the env var is supposed to be of the form: - // "foo=1,bar=2,baz=3" - const char* tf_env_var_val = getenv("TF_CPP_VMODULE"); - auto* result = new VmoduleMap(); - if (tf_env_var_val == nullptr) return result; - while (true) { - const char* eq = strchr(tf_env_var_val, '='); - if (eq == nullptr) break; - const char* after_eq = eq + 1; - - // Comma either points at the next comma delimiter, or at a null terminator. - // We check that the integer we parse ends at this delimiter. 
- const char* comma = strchr(after_eq, ','); - const char* new_tf_env_var_val; - if (comma == nullptr) { - comma = strchr(after_eq, '\0'); - new_tf_env_var_val = comma; - } else { - new_tf_env_var_val = comma + 1; - } - - char* endptr = nullptr; - int level = strtol(after_eq, &endptr, 10); - if (endptr != comma) { - fprintf(stderr, - "warning: could not parse integer in vmodule specification in " - "\"%s\".\n", - after_eq); - break; - } - StringPiece module(tf_env_var_val, eq - tf_env_var_val); - tf_env_var_val = new_tf_env_var_val; - (*result)[module] = level; - } - return result; -} - } // namespace LogMessage::~LogMessage() { @@ -184,19 +137,6 @@ int64 LogMessage::MinVLogLevel() { return min_vlog_level; } -bool LogMessage::VmoduleActivated(const char* fname, int lvl) { - static VmoduleMap* vmodule_records = VmoduleRecordsFromEnv(); - const char* last_slash = strrchr(fname, '/'); - const char* module_start = last_slash == nullptr ? fname : last_slash + 1; - const char* dot_after = strchr(module_start, '.'); - const char* module_limit = - dot_after == nullptr ? strchr(fname, '\0') : dot_after; - StringPiece module(module_start, module_limit - module_start); - auto it = vmodule_records->find(module); - if (it == vmodule_records->end()) return false; - return it->second >= lvl; -} - LogMessageFatal::LogMessageFatal(const char* file, int line) : LogMessage(file, line, FATAL) {} LogMessageFatal::~LogMessageFatal() { diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h index c8c9b2da11a..04ff9e12b6f 100644 --- a/tensorflow/core/platform/default/logging.h +++ b/tensorflow/core/platform/default/logging.h @@ -46,16 +46,6 @@ class LogMessage : public std::basic_ostringstream { // but VLOG(3) will not. Defaults to 0. static int64 MinVLogLevel(); - // Returns whether VLOG level lvl is activated for the file fname. - // - // E.g. if the environment variable TF_CPP_VMODULE contains foo=3 and fname is - // foo.cc and lvl is <= 3, this will return true. - // - // It is expected that the result of this query will be cached in the VLOG-ing - // call site to avoid repeated lookups. This routine performs a hash-map - // access against the VLOG-ing specification provided by the env var. - static bool VmoduleActivated(const char* fname, int lvl); - protected: void GenerateLogMessage(); @@ -86,38 +76,18 @@ class LogMessageFatal : public LogMessage { #define LOG(severity) _TF_LOG_##severity -#if defined(IS_MOBILE_PLATFORM) - +#ifdef IS_MOBILE_PLATFORM // Turn VLOG off when under mobile devices for considerations of binary size. -#define _VLOG_IS_ON(lvl, file) ((lvl) <= 0) - -#elif defined(PLATFORM_WINDOWS) - -// TODO(b/64279502) The _VLOG_IS_ON definition below appears to cause MSVC to -// fatal error, so we fall back to the vmodule-less implementation for now. -#define _VLOG_IS_ON(lvl, file) \ - ((lvl) <= ::tensorflow::internal::LogMessage::MinVLogLevel()) - +#define VLOG_IS_ON(lvl) ((lvl) <= 0) #else - -// Otherwise, set TF_CPP_MIN_VLOG_LEVEL environment to update minimum log level -// of VLOG, or TF_CPP_VMODULE to set the minimum log level for individual -// translation units. 
-#define _VLOG_IS_ON(lvl, file) \ - (([](int level, const char* fname) { \ - if (level <= ::tensorflow::internal::LogMessage::MinVLogLevel()) \ - return true; \ - static bool vmodule_activated = \ - ::tensorflow::internal::LogMessage::VmoduleActivated(fname, level); \ - return vmodule_activated; \ - })(lvl, file)) - +// Otherwise, Set TF_CPP_MIN_VLOG_LEVEL environment to update minimum log level +// of VLOG +#define VLOG_IS_ON(lvl) \ + ((lvl) <= ::tensorflow::internal::LogMessage::MinVLogLevel()) #endif -#define VLOG_IS_ON(lvl) _VLOG_IS_ON(lvl, __FILE__) - -#define VLOG(lvl) \ - if (TF_PREDICT_FALSE(_VLOG_IS_ON(lvl, __FILE__))) \ +#define VLOG(lvl) \ + if (TF_PREDICT_FALSE(VLOG_IS_ON(lvl))) \ ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::INFO) // CHECK dies with a fatal error if condition is not true. It is *not* diff --git a/tensorflow/core/platform/vmodule_benchmark_test.cc b/tensorflow/core/platform/vmodule_benchmark_test.cc deleted file mode 100644 index 0f9e75bf9cd..00000000000 --- a/tensorflow/core/platform/vmodule_benchmark_test.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/test_benchmark.h" - -namespace tensorflow { - -static void BM_DisabledVlog(int iters) { - for (int i = 0; i < iters; ++i) { - VLOG(1) << "Testing VLOG(1)!"; - } -} -BENCHMARK(BM_DisabledVlog); - -} // namespace tensorflow diff --git a/tensorflow/core/platform/vmodule_test.cc b/tensorflow/core/platform/vmodule_test.cc deleted file mode 100644 index 47b4b2e0e78..00000000000 --- a/tensorflow/core/platform/vmodule_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Test that popens a child process with the VLOG-ing environment variable set -// for the logging framework, and observes VLOG_IS_ON and VLOG macro output. 
- -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/platform.h" -#include "tensorflow/core/platform/test.h" - -#include - -namespace tensorflow { -namespace { - -int RealMain(const char* argv0, bool do_vlog) { - if (do_vlog) { -#if !defined(PLATFORM_GOOGLE) - // Note, we only test this when !defined(PLATFORM_GOOGLE) because - // VmoduleActivated doesn't exist in that implementation. - // - // Also, we call this internal API to simulate what would happen if - // differently-named translation units attempted to VLOG, so we don't need - // to create dummy translation unit files. - bool ok = internal::LogMessage::VmoduleActivated("vmodule_test.cc", 7) && - internal::LogMessage::VmoduleActivated("shoobadooba.h", 3); - if (!ok) { - fprintf(stderr, "vmodule activated levels not as expected.\n"); - return EXIT_FAILURE; - } -#endif - - // Print info on which VLOG levels are activated. - fprintf(stderr, "VLOG_IS_ON(8)? %d\n", VLOG_IS_ON(8)); - fprintf(stderr, "VLOG_IS_ON(7)? %d\n", VLOG_IS_ON(7)); - fprintf(stderr, "VLOG_IS_ON(6)? %d\n", VLOG_IS_ON(6)); - // Do some VLOG-ing. - VLOG(8) << "VLOG(8)"; - VLOG(7) << "VLOG(7)"; - VLOG(6) << "VLOG(6)"; - LOG(INFO) << "INFO"; - return EXIT_SUCCESS; - } - - // Popen the child process. - std::string command = std::string(argv0); -#if defined(PLATFORM_GOOGLE) - command = command + " do_vlog --vmodule=vmodule_test=7 --alsologtostderr"; -#else - command = - "TF_CPP_VMODULE=vmodule_test=7,shoobadooba=3 " + command + " do_vlog"; -#endif - command += " 2>&1"; - fprintf(stderr, "Running: \"%s\"\n", command.c_str()); - FILE* f = popen(command.c_str(), "r"); - if (f == nullptr) { - fprintf(stderr, "Failed to popen child: %s\n", strerror(errno)); - return EXIT_FAILURE; - } - - // Read data from the child's stdout. - constexpr int kBufferSizeBytes = 4096; - char buffer[kBufferSizeBytes]; - size_t result = fread(buffer, sizeof(buffer[0]), kBufferSizeBytes - 1, f); - if (result == 0) { - fprintf(stderr, "Failed to read from child stdout: %zu %s\n", result, - strerror(errno)); - return EXIT_FAILURE; - } - buffer[result] = '\0'; - int status = pclose(f); - if (status == -1) { - fprintf(stderr, "Failed to close popen child: %s\n", strerror(errno)); - return EXIT_FAILURE; - } - - // Check output is as expected. - const char kExpected[] = - "VLOG_IS_ON(8)? 0\nVLOG_IS_ON(7)? 1\nVLOG_IS_ON(6)? 1\n"; - if (strstr(buffer, kExpected) == nullptr) { - fprintf(stderr, "error: unexpected output from child: \"%.*s\"\n", - kBufferSizeBytes, buffer); - return EXIT_FAILURE; - } - bool ok = strstr(buffer, "VLOG(7)\n") != nullptr && - strstr(buffer, "VLOG(6)\n") != nullptr && - strstr(buffer, "VLOG(8)\n") == nullptr; - if (!ok) { - fprintf(stderr, "error: VLOG output not as expected: \"%.*s\"\n", - kBufferSizeBytes, buffer); - return EXIT_FAILURE; - } - - // Success! - return EXIT_SUCCESS; -} - -} // namespace -} // namespace tensorflow - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - bool do_vlog = argc >= 2 && strcmp(argv[1], "do_vlog") == 0; - return tensorflow::RealMain(argv[0], do_vlog); -} diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md index 5c50a86c88f..f0d4dafd3ea 100644 --- a/tensorflow/core/profiler/README.md +++ b/tensorflow/core/profiler/README.md @@ -56,7 +56,7 @@ with tf.contrib.tfprof.ProfileContext() as pctx: ```shell # Profiling from Python API is not interactive. -# Dump the profiles to files and profile with interactive command line. 
+# Dump the profiles to files and profile with interactive command line or web UI. with tf.contrib.tfprof.ProfileContext() as pctx: pctx.add_auto_profile_dump('/tmp/profiles', [100]) train_loop() @@ -66,7 +66,15 @@ bazel-bin/tensorflow/core/profiler/profiler \ --run_meta_path=/tmp/profiles/run_meta \ --op_log_path=/tmp/profiles/tfprof_log \ tfprof> op -select micros,bytes,occurrence -order_by micros + + +# To be open sourced... +bazel-bin/third_party/tensorflow/python/profiler/profiler_ui \ + --graph_path=/tmp/profiles/graph.pbtxt \ + --run_meta_path=/tmp/profiles/run_meta \ + --op_log_path=/tmp/profiles/tfprof_log \ ``` +![ProfilerUI](g3doc/profiler_ui.jpg) Detail Tutorials @@ -239,5 +247,6 @@ bug fix. `OpLogProto` is a good plus if it is used. #### Teams * Xin Pan (xpan@google.com, github: panyx0718) +* Chris Antaki * Yao Zhang * Jon Shlens diff --git a/tensorflow/core/profiler/g3doc/advise.md b/tensorflow/core/profiler/g3doc/advise.md index d87b0d8603d..d0de8317f69 100644 --- a/tensorflow/core/profiler/g3doc/advise.md +++ b/tensorflow/core/profiler/g3doc/advise.md @@ -86,7 +86,7 @@ For example: * Checks RecvTensor RPC latency and bandwidth. * Checks CPU/Memory utilization of the job. -####AcceleratorUtilization Checker +#### AcceleratorUtilization Checker * Checks what percentage of time the accelerator spends on computation. #### OperationChecker @@ -100,7 +100,7 @@ For example: * Checks the most expensive graph nodes. * Checks the most expensive graph-building Python codes. -####Contribute Your Checker +#### Contribute Your Checker Follow examples of accelerator_utilization_checker.h diff --git a/tensorflow/core/profiler/g3doc/command_line.md b/tensorflow/core/profiler/g3doc/command_line.md index 857b5e64590..fb4207c7841 100644 --- a/tensorflow/core/profiler/g3doc/command_line.md +++ b/tensorflow/core/profiler/g3doc/command_line.md @@ -51,13 +51,13 @@ It defines _checkpoint_variable op type. It also provides checkpointed tensors' Note: this feature is not well maintained now. -###Start `tfprof` +### Start `tfprof` #### Build `tfprof` ```shell # Build the tool. -bazel build --config opt third_party/tensorflow/core/profiler/... +bazel build --config opt tensorflow/core/profiler:profiler # Help information, including detail 'option' instructions. bazel-bin/tensorflow/core/profiler/profiler help @@ -140,9 +140,9 @@ tfprof> -output ``` -###Examples +### Examples -####Profile Python Time +#### Profile Python Time ```shell # Requires --graph_path --op_log_path tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md index 15712d04c25..ddee63ad42a 100644 --- a/tensorflow/core/profiler/g3doc/options.md +++ b/tensorflow/core/profiler/g3doc/options.md @@ -1,6 +1,6 @@ -##Options +## Options -###Overview +### Overview For all tfprof views, the profiles are processed with the following procedures @@ -35,7 +35,7 @@ For all tfprof views, the profiles are processed with the following procedures 4) Finally, the filtered data structure is output in a format depending on the `-output` option. -####Option Semantics In Different View +#### Option Semantics In Different View options usually have the same semantics in different views. However, some can vary. For example `-max_depth` in scope view means the depth of name scope tree. In op view, it means the length of operation list. 
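To make the view-dependent semantics concrete, here is a rough editorial sketch (not part of this change) of exercising `max_depth` from the tfprof Python API; the `ProfileOptionBuilder` helper and the exact option keys are assumptions about the Python interface, not something this patch adds:

```python
import tensorflow as tf

g = tf.Graph()
with g.as_default():
  x = tf.placeholder(tf.float32, [None, 784], name='x')
  w = tf.get_variable('w', [784, 10])
  logits = tf.matmul(x, w, name='logits')

# Start from a stock options dict and trim it; 'max_depth' and 'select' follow
# the option names documented on this page.
opts = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter()
opts['select'] = ['params']
opts['max_depth'] = 3

# Same option, different meaning per view: in scope view it bounds the depth of
# the name-scope tree, in op view it bounds the length of the operation list.
tf.profiler.profile(g, cmd='scope', options=opts)
tf.profiler.profile(g, cmd='op', options=opts)
```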
@@ -68,7 +68,7 @@ output_bytes: The memory output by the operation. It's not necessarily requested by the current operation. For example, it can be a tensor forwarded from input to output, with in-place mutation. -###Docs +### Docs `-max_depth`: Show nodes that are at most this number of hops from starting node in the data structure. diff --git a/tensorflow/core/profiler/g3doc/profile_memory.md b/tensorflow/core/profiler/g3doc/profile_memory.md index a00683d0626..6eda5abdd97 100644 --- a/tensorflow/core/profiler/g3doc/profile_memory.md +++ b/tensorflow/core/profiler/g3doc/profile_memory.md @@ -1,4 +1,4 @@ -##Profile Memory +## Profile Memory It is generally a good idea to visualize the memory usage in timeline. It allows you to see the memory consumption of each GPU over time. diff --git a/tensorflow/core/profiler/g3doc/profile_model_architecture.md b/tensorflow/core/profiler/g3doc/profile_model_architecture.md index a42b2e918da..61bb66bd21b 100644 --- a/tensorflow/core/profiler/g3doc/profile_model_architecture.md +++ b/tensorflow/core/profiler/g3doc/profile_model_architecture.md @@ -1,9 +1,9 @@ -##Profile Model Architecture +## Profile Model Architecture * [Profile Model Parameters](#profile-model-parameters) * [Profile Model Float Operations](#profile-model-float-operations) -###Profile Model Parameters +### Profile Model Parameters Notes: `VariableV2` operation type might contain variables created by TensorFlow @@ -39,9 +39,9 @@ param_stats = tf.profiler.profile( sys.stdout.write('total_params: %d\n' % param_stats.total_parameters) ``` -###Profile Model Float Operations +### Profile Model Float Operations -####Caveats +#### Caveats For an operation to have float operation statistics: diff --git a/tensorflow/core/profiler/g3doc/profile_time.md b/tensorflow/core/profiler/g3doc/profile_time.md index e11a75553b2..4aafc697a9b 100644 --- a/tensorflow/core/profiler/g3doc/profile_time.md +++ b/tensorflow/core/profiler/g3doc/profile_time.md @@ -1,4 +1,4 @@ -##Profile Time +## Profile Time * [Times in TensorFlow and tfprof](#times-in-tensorflow-and-tfprof) * [Profile by Python Code](#profile-by-python-code) @@ -7,7 +7,7 @@ * [Profile by Name Scope](#profile-by-name-scope) -###Times in TensorFlow and tfprof +### Times in TensorFlow and tfprof When we run a model, Tensorflow schedules and runs the nodes (operations) in the graph. An operation can be placed on an accelerator or on CPU. @@ -37,7 +37,7 @@ When an operation is placed on CPU, it will completely run on CPU. Hence, should be 0. -###Profile by Python Code +### Profile by Python Code ```python # In code view, the time of each line of Python code is the aggregated # times of all operations created by that line. @@ -112,7 +112,7 @@ Set ```-output timeline:outfile=``` to generate timeline instead of st -###Profile by Operation Type +### Profile by Operation Type ```python # In op view, you can view the aggregated time of each operation type. tfprof> op -select micros,occurrence -order_by micros @@ -138,7 +138,7 @@ MatMul 618.97ms (63.56%, 16.51%), |/job:worker/replica:0/ ``` -###Profile by Graph +### Profile by Graph Usually, use graph view to generate a timeline to visualize the result. @@ -163,7 +163,7 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file. ****************************************************** ``` -###Profile by Name Scope +### Profile by Name Scope Usually scope view allows you to pin point the problematic places if you have properly named your operations with tf.name_scope or tf.variable_scope. 
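As a small editorial sketch (not taken from this patch) of why naming matters here, the snippet below groups operations under name scopes so that scope view can aggregate time per logical block; the scope and tensor names are illustrative only:

```python
import tensorflow as tf

with tf.name_scope('input'):
  x = tf.placeholder(tf.float32, [None, 784], name='x')

with tf.variable_scope('hidden'):
  w = tf.get_variable('w', [784, 256])
  b = tf.get_variable('b', [256])
  h = tf.nn.relu(tf.matmul(x, w) + b, name='activation')

with tf.name_scope('output'):
  logits = tf.identity(h, name='logits')

# In scope view these operations show up grouped as input/..., hidden/... and
# output/..., so an expensive block can be pinned down by its scope name.
```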
diff --git a/tensorflow/core/profiler/g3doc/profiler_ui.jpg b/tensorflow/core/profiler/g3doc/profiler_ui.jpg new file mode 100644 index 00000000000..36aa94502a8 Binary files /dev/null and b/tensorflow/core/profiler/g3doc/profiler_ui.jpg differ diff --git a/tensorflow/core/util/permutation_input_iterator.h b/tensorflow/core/util/permutation_input_iterator.h new file mode 100644 index 00000000000..f6375b25157 --- /dev/null +++ b/tensorflow/core/util/permutation_input_iterator.h @@ -0,0 +1,134 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_ +#define TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_ + +#include +#include + +namespace tensorflow { + +template +class PermutationInputIterator { + public: + // Required iterator traits + typedef PermutationInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of + ///< subtracting one iterator from another + typedef ValueType + value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the + ///< iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the + ///< iterator can point to + + typedef std::random_access_iterator_tag + iterator_category; ///< The iterator category + + private: + InputIteratorT input_itr; + IndexIteratorT index_itr; + + public: + /// Constructor + __host__ __device__ __forceinline__ PermutationInputIterator( + InputIteratorT input_itr, ///< Input iterator to wrap + IndexIteratorT index_itr) ///< Conversion functor to wrap + : input_itr(input_itr), index_itr(index_itr) {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) { + self_type retval = *this; + index_itr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() { + index_itr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const { + return input_itr[*index_itr]; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const { + self_type retval(input_itr, index_itr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { + index_itr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const { + self_type retval(input_itr, index_itr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { + index_itr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type + operator-(self_type other) const { + return index_itr - other.index_itr; + } + + /// Array subscript 
+ template + __host__ __device__ __forceinline__ reference operator[](Distance n) const { + return input_itr[index_itr[n]]; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() { + return input_itr + *index_itr; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { + return (index_itr == rhs.index_itr && input_itr == rhs.input_itr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { + return !(*this == rhs); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { + return os; + } +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_ diff --git a/tensorflow/core/util/tensor_slice_reader.h b/tensorflow/core/util/tensor_slice_reader.h index eeb31295737..5932d59a159 100644 --- a/tensorflow/core/util/tensor_slice_reader.h +++ b/tensorflow/core/util/tensor_slice_reader.h @@ -165,13 +165,18 @@ bool TensorSliceReader::CopySliceData(const string& name, CHECK_GE(idx, 0) << "Failed to find the index for filename " << fname; // We read a record in the corresponding sstable const string key = EncodeTensorNameSlice(name, slice_s); - CHECK(sss_[idx]->Get(key, &value)) - << "Failed to seek to the record for tensor " << name << ", slice " - << slice_s.DebugString() << ": computed key = " << key; + if (!sss_[idx]->Get(key, &value)) { + VLOG(1) << "Failed to seek to the record for tensor " << name + << ", slice " << slice_s.DebugString() + << ": computed key = " << key; + return false; + } SavedTensorSlices sts; - CHECK(ParseProtoUnlimited(&sts, value)) - << "Failed to parse the record for tensor " << name << ", slice " - << slice_s.DebugString() << ": computed key = " << key; + if (!ParseProtoUnlimited(&sts, value)) { + VLOG(1) << "Failed to parse the record for tensor " << name << ", slice " + << slice_s.DebugString() << ": computed key = " << key; + return false; + } CopyDataFromTensorSliceToTensorSlice( tss->shape(), slice_s, slice, checkpoint::TensorProtoData(sts.data().data()), data); diff --git a/tensorflow/core/util/transform_output_iterator.h b/tensorflow/core/util/transform_output_iterator.h new file mode 100644 index 00000000000..1640791ad17 --- /dev/null +++ b/tensorflow/core/util/transform_output_iterator.h @@ -0,0 +1,149 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_UTIL_TRANSFORM_OUTPUT_ITERATOR_H_ +#define TENSORFLOW_UTIL_TRANSFORM_OUTPUT_ITERATOR_H_ + +#include +#include + +namespace tensorflow { + +template +class TransformOutputIterator { + private: + // Proxy object + struct Reference { + StoreType* ptr; + ConversionOp conversion_op; + + /// Constructor + __host__ __device__ __forceinline__ Reference(StoreType* ptr, + ConversionOp conversion_op) + : ptr(ptr), conversion_op(conversion_op) {} + + /// Assignment + __host__ __device__ __forceinline__ InputType operator=(InputType val) { + *ptr = conversion_op(val); + return val; + } + }; + + public: + // Required iterator traits + typedef TransformOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of + ///< subtracting one iterator from another + typedef void + value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator + ///< can point to + typedef Reference reference; ///< The type of a reference to an element the + ///< iterator can point to + + typedef std::random_access_iterator_tag + iterator_category; ///< The iterator category + + /*private:*/ + + StoreType* ptr; + ConversionOp conversion_op; + + public: + /// Constructor + template + __host__ __device__ __forceinline__ TransformOutputIterator( + QualifiedStoreType* ptr, + ConversionOp conversionOp) ///< Native pointer to wrap + : ptr(ptr), conversion_op(conversionOp) {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) { + self_type retval = *this; + ptr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const { + return Reference(ptr, conversion_op); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const { + self_type retval(ptr + n, conversion_op); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const { + self_type retval(ptr - n, conversion_op); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type + operator-(self_type other) const { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const { + return Reference(ptr + n, conversion_op); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { + return os; + } +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_UTIL_TRANSFORM_OUTPUT_ITERATOR_H_ diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md index 194649a304d..4991783a53a 100644 --- 
a/tensorflow/docs_src/community/welcome.md +++ b/tensorflow/docs_src/community/welcome.md @@ -37,6 +37,7 @@ Asia: * [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_ * [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_ * [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_ +* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/) Europe: diff --git a/tensorflow/docs_src/get_started/estimator.md b/tensorflow/docs_src/get_started/estimator.md index a55454f8af3..4f3a438d17d 100644 --- a/tensorflow/docs_src/get_started/estimator.md +++ b/tensorflow/docs_src/get_started/estimator.md @@ -273,9 +273,7 @@ Then, the code creates a `DNNClassifier` model using the following arguments: containing 10, 20, and 10 neurons, respectively. * `n_classes=3`. Three target classes, representing the three Iris species. * `model_dir=/tmp/iris_model`. The directory in which TensorFlow will save - checkpoint data during model training. For more on logging and monitoring - with TensorFlow, see - @{$monitors$Logging and Monitoring Basics with tf.estimator}. + checkpoint data and TensorBoard summaries during model training. ## Describe the training input pipeline {#train-input} @@ -315,9 +313,7 @@ classifier.train(input_fn=train_input_fn, steps=1000) However, if you're looking to track the model while it trains, you'll likely want to instead use a TensorFlow @{tf.train.SessionRunHook$`SessionRunHook`} -to perform logging operations. See the tutorial -@{$monitors$Logging and Monitoring Basics with tf.estimator} -for more on this topic. +to perform logging operations. ## Evaluate Model Accuracy {#evaluate-accuracy} diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md index 3e700daa304..003fac1a287 100644 --- a/tensorflow/docs_src/get_started/index.md +++ b/tensorflow/docs_src/get_started/index.md @@ -24,8 +24,6 @@ To learn about the high-level API, read the following guides: API. * @{$get_started/input_fn$Building Input Functions}, which takes you into a somewhat more sophisticated use of this API. - * @{$get_started/monitors$Logging and Monitoring Basics with tf.contrib.learn}, - which explains how to audit the progress of model training. TensorBoard is a utility to visualize different aspects of machine learning. 
The following guides explain how to use TensorBoard: diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md index 422f45c586a..7706c07b1d9 100644 --- a/tensorflow/docs_src/get_started/input_fn.md +++ b/tensorflow/docs_src/get_started/input_fn.md @@ -249,7 +249,7 @@ here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/input_fn/bos ### Importing the Housing Data -To start, set up your imports (including `pandas` and `tensorflow`) and @{$monitors#enabling-logging-with-tensorflow$set logging verbosity} to +To start, set up your imports (including `pandas` and `tensorflow`) and set logging verbosity to `INFO` for more detailed log output: ```python diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files index b656033f7e8..bb67eaddda3 100644 --- a/tensorflow/docs_src/get_started/leftnav_files +++ b/tensorflow/docs_src/get_started/leftnav_files @@ -5,7 +5,6 @@ mnist/pros.md mnist/mechanics.md estimator.md input_fn.md -monitors.md summaries_and_tensorboard.md graph_viz.md tensorboard_histograms.md diff --git a/tensorflow/docs_src/get_started/monitors.md b/tensorflow/docs_src/get_started/monitors.md deleted file mode 100644 index 5606e953658..00000000000 --- a/tensorflow/docs_src/get_started/monitors.md +++ /dev/null @@ -1,406 +0,0 @@ -# Logging and Monitoring Basics with tf.contrib.learn - -When training a model, it’s often valuable to track and evaluate progress in -real time. In this tutorial, you’ll learn how to use TensorFlow’s logging -capabilities and the `Monitor` API to audit the in-progress training of a neural -network classifier for categorizing irises. This tutorial builds on the code -developed in @{$estimator$tf.estimator Quickstart} so if you -haven't yet completed that tutorial, you may want to explore it first, -especially if you're looking for an intro/refresher on tf.contrib.learn basics. - -## Setup {#setup} - -For this tutorial, you'll be building upon the following code from -@{$estimator$tf.estimator Quickstart}: - -```python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import numpy as np -import tensorflow as tf - -# Data sets -IRIS_TRAINING = os.path.join(os.path.dirname(__file__), "iris_training.csv") -IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv") - -def main(unused_argv): - # Load datasets. - training_set = tf.contrib.learn.datasets.base.load_csv_with_header( - filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32) - test_set = tf.contrib.learn.datasets.base.load_csv_with_header( - filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32) - - # Specify that all features have real-value data - feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)] - - # Build 3 layer DNN with 10, 20, 10 units respectively. - classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns, - hidden_units=[10, 20, 10], - n_classes=3, - model_dir="/tmp/iris_model") - - # Fit model. - classifier.fit(x=training_set.data, - y=training_set.target, - steps=2000) - - # Evaluate accuracy. - accuracy_score = classifier.evaluate(x=test_set.data, - y=test_set.target)["accuracy"] - print('Accuracy: {0:f}'.format(accuracy_score)) - - # Classify two new flower samples. 
- new_samples = np.array( - [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float) - y = list(classifier.predict(new_samples, as_iterable=True)) - print('Predictions: {}'.format(str(y))) - -if __name__ == "__main__": - tf.app.run() -``` - -Copy the above code into a file, and download the corresponding -[training](http://download.tensorflow.org/data/iris_training.csv) and -[test](http://download.tensorflow.org/data/iris_test.csv) data sets to the same -directory. - -In the following sections, you'll progressively make updates to the above code -to add logging and monitoring capabilities. Final code incorporating all updates -is [available for download -here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/monitors/iris_monitors.py). - -## Overview - -The @{$estimator$tf.estimator Quickstart tutorial} walked through -how to implement a neural net classifier to categorize iris examples into one of -three species. - -But when [the code](#setup) from this tutorial is run, the output contains no -logging tracking how model training is progressing—only the results of the -`print` statements that were included: - -```none -Accuracy: 0.933333 -Predictions: [1 2] -``` - -Without any logging, model training feels like a bit of a black box; you can't -see what's happening as TensorFlow steps through gradient descent, get a sense -of whether the model is converging appropriately, or audit to determine whether -[early stopping](https://en.wikipedia.org/wiki/Early_stopping) might be -appropriate. - -One way to address this problem would be to split model training into multiple -`fit` calls with smaller numbers of steps in order to evaluate accuracy more -progressively. However, this is not recommended practice, as it greatly slows -down model training. Fortunately, tf.contrib.learn offers another solution: a -@{tf.contrib.learn.monitors$Monitor API} designed to help -you log metrics and evaluate your model while training is in progress. In the -following sections, you'll learn how to enable logging in TensorFlow, set up a -ValidationMonitor to do streaming evaluations, and visualize your metrics using -TensorBoard. - -## Enabling Logging with TensorFlow - -TensorFlow uses five different levels for log messages. In order of ascending -severity, they are `DEBUG`, `INFO`, `WARN`, `ERROR`, and `FATAL`. When you -configure logging at any of these levels, TensorFlow will output all log -messages corresponding to that level and all levels of higher severity. For -example, if you set a logging level of `ERROR`, you'll get log output containing -`ERROR` and `FATAL` messages, and if you set a level of `DEBUG`, you'll get log -messages from all five levels. - -By default, TensorFlow is configured at a logging level of `WARN`, but when -tracking model training, you'll want to adjust the level to `INFO`, which will -provide additional feedback as `fit` operations are in progress. - -Add the following line to the beginning of your code (right after your -`import`s): - -```python -tf.logging.set_verbosity(tf.logging.INFO) -``` - -Now when you run the code, you'll see additional log output like the following: - -```none -INFO:tensorflow:loss = 1.18812, step = 1 -INFO:tensorflow:loss = 0.210323, step = 101 -INFO:tensorflow:loss = 0.109025, step = 201 -``` - -With `INFO`-level logging, tf.contrib.learn automatically outputs [training-loss -metrics](https://en.wikipedia.org/wiki/Loss_function) to stderr after every 100 -steps. 
- -## Configuring a ValidationMonitor for Streaming Evaluation - -Logging training loss is helpful to get a sense whether your model is -converging, but what if you want further insight into what's happening during -training? tf.contrib.learn provides several high-level `Monitor`s you can attach -to your `fit` operations to further track metrics and/or debug lower-level -TensorFlow operations during model training, including: - -Monitor | Description -------------------- | ----------- -`CaptureVariable` | Saves a specified variable's values into a collection at every _n_ steps of training -`PrintTensor` | Logs a specified tensor's values at every _n_ steps of training -`SummarySaver` | Saves @{tf.Summary} [protocol buffers](https://developers.google.com/protocol-buffers/) for a given tensor using a @{tf.summary.FileWriter} at every _n_ steps of training -`ValidationMonitor` | Logs a specified set of evaluation metrics at every _n_ steps of training, and, if desired, implements early stopping under certain conditions - -### Evaluating Every *N* Steps - -For the iris neural network classifier, while logging training loss, you might -also want to simultaneously evaluate against test data to see how well the model -is generalizing. You can accomplish this by configuring a `ValidationMonitor` -with the test data (`test_set.data` and `test_set.target`), and setting how -often to evaluate with `every_n_steps`. The default value of `every_n_steps` is -`100`; here, set `every_n_steps` to `50` to evaluate after every 50 steps of -model training: - -```python -validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( - test_set.data, - test_set.target, - every_n_steps=50) -``` - -Place this code right before the line instantiating the `classifier`. - -`ValidationMonitor`s rely on saved checkpoints to perform evaluation operations, -so you'll want to modify instantiation of the `classifier` to add a -@{tf.contrib.learn.RunConfig} that includes -`save_checkpoints_secs`, which specifies how many seconds should elapse between -checkpoint saves during training. Because the iris data set is quite small, and -thus trains quickly, it makes sense to set `save_checkpoints_secs` to 1 (saving -a checkpoint every second) to ensure a sufficient number of checkpoints: - -```python -classifier = tf.contrib.learn.DNNClassifier( - feature_columns=feature_columns, - hidden_units=[10, 20, 10], - n_classes=3, - model_dir="/tmp/iris_model", - config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1)) -``` - -NOTE: The `model_dir` parameter specifies an explicit directory -(`/tmp/iris_model`) for model data to be stored; this directory path will be -easier to reference later on than an autogenerated one. Each time you run the -code, any existing data in `/tmp/iris_model` will be loaded, and model training -will continue where it left off in the last run (e.g., running the script twice -in succession will execute 4000 steps during training—2000 during each -`fit` operation). To start over model training from scratch, delete -`/tmp/iris_model` before running the code. 
- -Finally, to attach your `validation_monitor`, update the `fit` call to include a -`monitors` param, which takes a list of all monitors to run during model -training: - -```python -classifier.fit(x=training_set.data, - y=training_set.target, - steps=2000, - monitors=[validation_monitor]) -``` - -Now, when you rerun the code, you should see validation metrics in your log -output, e.g.: - -```none -INFO:tensorflow:Validation (step 50): loss = 1.71139, global_step = 0, accuracy = 0.266667 -... -INFO:tensorflow:Validation (step 300): loss = 0.0714158, global_step = 268, accuracy = 0.966667 -... -INFO:tensorflow:Validation (step 1750): loss = 0.0574449, global_step = 1729, accuracy = 0.966667 -``` - -### Customizing the Evaluation Metrics with MetricSpec - -By default, if no evaluation metrics are specified, `ValidationMonitor` will log -both [loss](https://en.wikipedia.org/wiki/Loss_function) and accuracy, but you -can customize the list of metrics that will be run every 50 steps. To specify -the exact metrics you'd like to run in each evaluation pass, you can add a -`metrics` param to the `ValidationMonitor` constructor. `metrics` takes a dict -of key/value pairs, where each key is the name you'd like logged for the metric, -and the corresponding value is a -[`MetricSpec`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/metric_spec.py) -object. - -The `MetricSpec` constructor accepts four parameters: - -* `metric_fn`. The function that calculates and returns the value of a metric. - This can be a predefined function available in the - @{tf.contrib.metrics} module, such as - @{tf.contrib.metrics.streaming_precision} or - @{tf.contrib.metrics.streaming_recall}. - - Alternatively, you can define your own custom metric function, which must - take `predictions` and `labels` tensors as arguments (a `weights` argument - can also optionally be supplied). The function must return the value of the - metric in one of two formats: - - * A single tensor - * A pair of ops `(value_op, update_op)`, where `value_op` returns the - metric value and `update_op` performs a corresponding operation to - update internal model state. - -* `prediction_key`. The key of the tensor containing the predictions returned - by the model. This argument may be omitted if the model returns either a - single tensor or a dict with a single entry. For a `DNNClassifier` model, - class predictions will be returned in a tensor with the key - @{tf.contrib.learn.PredictionKey.CLASSES}. - -* `label_key`. The key of the tensor containing the labels returned by the - model, as specified by the model's @{$input_fn$`input_fn`}. As - with `prediction_key`, this argument may be omitted if the `input_fn` - returns either a single tensor or a dict with a single entry. In the iris - example in this tutorial, the `DNNClassifier` does not have an `input_fn` - (`x`,`y` data is passed directly to `fit`), so it's not necessary to provide - a `label_key`. - -* `weights_key`. *Optional*. The key of the tensor (returned by the - @{$input_fn$`input_fn`}) containing weights inputs for the - `metric_fn`. 
- -The following code creates a `validation_metrics` dict that defines three -metrics to log during model evaluation: - -* `"accuracy"`, using @{tf.contrib.metrics.streaming_accuracy} - as the `metric_fn` -* `"precision"`, using @{tf.contrib.metrics.streaming_precision} - as the `metric_fn` -* `"recall"`, using @{tf.contrib.metrics.streaming_recall} - as the `metric_fn` - -```python -validation_metrics = { - "accuracy": - tf.contrib.learn.MetricSpec( - metric_fn=tf.contrib.metrics.streaming_accuracy, - prediction_key=tf.contrib.learn.PredictionKey.CLASSES), - "precision": - tf.contrib.learn.MetricSpec( - metric_fn=tf.contrib.metrics.streaming_precision, - prediction_key=tf.contrib.learn.PredictionKey.CLASSES), - "recall": - tf.contrib.learn.MetricSpec( - metric_fn=tf.contrib.metrics.streaming_recall, - prediction_key=tf.contrib.learn.PredictionKey.CLASSES) -} -``` - -Add the above code before the `ValidationMonitor` constructor. Then revise the -`ValidationMonitor` constructor as follows to add a `metrics` parameter to log -the accuracy, precision, and recall metrics specified in `validation_metrics` -(loss is always logged, and doesn't need to be explicitly specified): - -```python -validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( - test_set.data, - test_set.target, - every_n_steps=50, - metrics=validation_metrics) -``` - -Rerun the code, and you should see precision and recall included in your log -output, e.g.: - -```none -INFO:tensorflow:Validation (step 50): recall = 0.0, loss = 1.20626, global_step = 1, precision = 0.0, accuracy = 0.266667 -... -INFO:tensorflow:Validation (step 600): recall = 1.0, loss = 0.0530696, global_step = 571, precision = 1.0, accuracy = 0.966667 -... -INFO:tensorflow:Validation (step 1500): recall = 1.0, loss = 0.0617403, global_step = 1452, precision = 1.0, accuracy = 0.966667 -``` - -### Early Stopping with ValidationMonitor - -Note that in the above log output, by step 600, the model has already achieved -precision and recall rates of 1.0. This raises the question as to whether model -training could benefit from -[early stopping](https://en.wikipedia.org/wiki/Early_stopping). - -In addition to logging eval metrics, `ValidationMonitor`s make it easy to -implement early stopping when specified conditions are met, via three params: - -| Param | Description | -| -------------------------------- | ----------------------------------------- | -| `early_stopping_metric` | Metric that triggers early stopping | -: : (e.g., loss or accuracy) under conditions : -: : specified in `early_stopping_rounds` and : -: : `early_stopping_metric_minimize`. Default : -: : is `"loss"`. : -| `early_stopping_metric_minimize` | `True` if desired model behavior is to | -: : minimize the value of : -: : `early_stopping_metric`; `False` if : -: : desired model behavior is to maximize the : -: : value of `early_stopping_metric`. Default : -: : is `True`. : -| `early_stopping_rounds` | Sets a number of steps during which if | -: : the `early_stopping_metric` does not : -: : decrease (if : -: : `early_stopping_metric_minimize` is : -: : `True`) or increase (if : -: : `early_stopping_metric_minimize` is : -: : `False`), training will be stopped. : -: : Default is `None`, which means early : -: : stopping will never occur. 
: - -Make the following revision to the `ValidationMonitor` constructor, which -specifies that if loss (`early_stopping_metric="loss"`) does not decrease -(`early_stopping_metric_minimize=True`) over a period of 200 steps -(`early_stopping_rounds=200`), model training will stop immediately at that -point, and not complete the full 2000 steps specified in `fit`: - -```python -validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( - test_set.data, - test_set.target, - every_n_steps=50, - metrics=validation_metrics, - early_stopping_metric="loss", - early_stopping_metric_minimize=True, - early_stopping_rounds=200) -``` - -Rerun the code to see if model training stops early: - -```none -... -INFO:tensorflow:Validation (step 1150): recall = 1.0, loss = 0.056436, global_step = 1119, precision = 1.0, accuracy = 0.966667 -INFO:tensorflow:Stopping. Best step: 800 with loss = 0.048313818872. -``` - -Indeed, here training stops at step 1150, indicating that for the past 200 -steps, loss did not decrease, and that overall, step 800 produced the smallest -loss value against the test data set. This suggests that additional calibration -of hyperparameters by decreasing the step count might further improve the model. - -## Visualizing Log Data with TensorBoard - -Reading through the log produced by `ValidationMonitor` provides plenty of raw -data on model performance during training, but it may also be helpful to see -visualizations of this data to get further insight into trends—for -example, how accuracy is changing over step count. You can use TensorBoard (a -separate program packaged with TensorFlow) to plot graphs like this by setting -the `logdir` command-line argument to the directory where you saved your model -training data (here, `/tmp/iris_model`). Run the following on your command line: - -
-$ tensorboard --logdir=/tmp/iris_model/
-Starting TensorBoard 39 on port 6006
- -Then navigate to `http://0.0.0.0:`*``* in your browser, where -*``* is the port specified in the command-line output (here, -`6006`). - -If you click on the accuracy field, you'll see an image like the following, -which shows accuracy plotted against step count: - -![Accuracy over step count in TensorBoard](https://www.tensorflow.org/images/validation_monitor_tensorboard_accuracy.png "Accuracy over step count in TensorBoard") - -For more on using TensorBoard, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning} and @{$graph_viz$TensorBoard: Graph Visualization}. diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md index 43e09906f73..d5e481520c4 100644 --- a/tensorflow/docs_src/install/install_linux.md +++ b/tensorflow/docs_src/install/install_linux.md @@ -151,10 +151,10 @@ Take the following steps to install TensorFlow with Virtualenv: (tensorflow)$ pip install --upgrade tensorflow-gpu # for Python 2.7 and GPU (tensorflow)$ pip3 install --upgrade tensorflow-gpu # for Python 3.n and GPU - If the preceding command succeeds, skip Step 5. If the preceding - command fails, perform Step 5. + If the preceding command succeeds, skip Step 6. If the preceding + command fails, perform Step 6. - 5. (Optional) If Step 4 failed (typically because you invoked a pip version + 6. (Optional) If Step 5 failed (typically because you invoked a pip version lower than 8.1), install TensorFlow in the active virtualenv environment by issuing a command of the following format: diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md index bf3cb5bf196..ba26bd5e941 100644 --- a/tensorflow/docs_src/programmers_guide/datasets.md +++ b/tensorflow/docs_src/programmers_guide/datasets.md @@ -1,4 +1,4 @@ -# Using the `Dataset` API for TensorFlow Input Pipelines +# Importing Data The `Dataset` API enables you to build complex input pipelines from simple, reusable pieces. For example, the pipeline for an image model might @@ -735,7 +735,7 @@ def dataset_input_fn(): return {"image_data": image, "date_time": parsed["date_time"]}, label - # Use `Dataset.map()` to build a pair of a feature dictionary and a label + # Use `Dataset.map()` to build a pair of a feature dictionary and a label # tensor for each example. dataset = dataset.map(parser) dataset = dataset.shuffle(buffer_size=10000) diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md index a5724ea294e..755bb049c99 100644 --- a/tensorflow/docs_src/programmers_guide/estimators.md +++ b/tensorflow/docs_src/programmers_guide/estimators.md @@ -134,7 +134,7 @@ The heart of every Estimator--whether pre-made or custom--is its evaluation, and prediction. When you are using a pre-made Estimator, someone else has already implemented the model function. When relying on a custom Estimator, you must write the model function yourself. A -${$extend/estimators$companion document) +@{$extend/estimators$companion document} explains how to write the model function. diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md index 22fe229422a..eef35d6dcc7 100644 --- a/tensorflow/docs_src/programmers_guide/index.md +++ b/tensorflow/docs_src/programmers_guide/index.md @@ -4,6 +4,8 @@ The documents in this unit dive into the details of writing TensorFlow code. For TensorFlow 1.3, we revised this document extensively. 
The units are now as follows: + * @{$programmers_guide/estimators$Estimators}, which introduces a high-level + TensorFlow API that greatly simplifies ML programming. * @{$programmers_guide/tensors$Tensors}, which explains how to create, manipulate, and access Tensors--the fundamental object in TensorFlow. * @{$programmers_guide/variables$Variables}, which details how @@ -18,8 +20,6 @@ The units are now as follows: such as Estimators or Keras, the high-level API creates and manages graphs and sessions for you, but understanding graphs and sessions can still be helpful. - * @{$programmers_guide/estimators$Estimators}, which introduces a high-level - TensorFlow API that greatly simplifies ML programming. * @{$programmers_guide/saved_model$Saving and Restoring}, which explains how to save and restore variables and models. * @{$programmers_guide/datasets$Input Pipelines}, which explains how to diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files index 5082e7f36c8..0c42f119c95 100644 --- a/tensorflow/docs_src/programmers_guide/leftnav_files +++ b/tensorflow/docs_src/programmers_guide/leftnav_files @@ -1,8 +1,8 @@ index.md +estimators.md tensors.md variables.md graphs.md -estimators.md saved_model.md datasets.md threading_and_queues.md diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 0781347fd6e..dda707aea26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -209,6 +209,95 @@ func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...Va return op.Output(0) } +// Writes a `Summary` protocol buffer with scalar values. +// +// The input `tag` and `value` must have the scalars. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tag: Tag for the summary. +// value: Value for the summary. +// +// Returns the created operation. +func WriteScalarSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "WriteScalarSummary", + Input: []tf.Input{ + writer, global_step, tag, value, + }, + } + return scope.AddOperation(opspec) +} + +// Outputs a `Summary` protocol buffer with a tensor. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tensor: A tensor to serialize. +// tag: The summary's tag. +// summary_metadata: Serialized SummaryMetadata protocol buffer containing +// plugin-related metadata for this summary. +// +// Returns the created operation. +func WriteSummary(scope *Scope, writer tf.Output, global_step tf.Output, tensor tf.Output, tag tf.Output, summary_metadata tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "WriteSummary", + Input: []tf.Input{ + writer, global_step, tensor, tag, summary_metadata, + }, + } + return scope.AddOperation(opspec) +} + +// Flushes and closes the summary writer. +// +// Also removes it from the resource manager. To reopen, use another +// CreateSummaryFileWriter op. +// +// Arguments: +// writer: A handle to the summary writer resource. +// +// Returns the created operation. 
+func CloseSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CloseSummaryWriter", + Input: []tf.Input{ + writer, + }, + } + return scope.AddOperation(opspec) +} + +// Flushes the writer's unwritten events. +// +// Arguments: +// writer: A handle to the summary writer resource. +// +// Returns the created operation. +func FlushSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "FlushSummaryWriter", + Input: []tf.Input{ + writer, + }, + } + return scope.AddOperation(opspec) +} + // FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient. type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr) @@ -2149,6 +2238,34 @@ func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset return offset } +// Writes a `Summary` protocol buffer with a histogram. +// +// The generated +// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +// has one summary value containing a histogram for `values`. +// +// This op reports an `InvalidArgument` error if any value is not finite. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tag: Scalar. Tag to use for the `Summary.Value`. +// values: Any shape. Values to use to build the histogram. +// +// Returns the created operation. +func WriteHistogramSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "WriteHistogramSummary", + Input: []tf.Input{ + writer, global_step, tag, values, + }, + } + return scope.AddOperation(opspec) +} + // Concatenates tensors along one dimension. // // Arguments: @@ -7087,6 +7204,48 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio return op.Output(0) } +// SummaryWriterAttr is an optional argument to SummaryWriter. +type SummaryWriterAttr func(optionalAttr) + +// SummaryWriterSharedName sets the optional shared_name attribute to value. +// If not specified, defaults to "" +func SummaryWriterSharedName(value string) SummaryWriterAttr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// SummaryWriterContainer sets the optional container attribute to value. +// If not specified, defaults to "" +func SummaryWriterContainer(value string) SummaryWriterAttr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// Returns a handle to be used to access a summary writer. +// +// The summary writer is an in-graph resource which can be used by ops to write +// summaries to event files. +// +// Returns the summary writer resource. Scalar handle. +func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SummaryWriter", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Returns the set of files matching one or more glob patterns. // // Note that this routine only supports wildcard characters in the @@ -10570,6 +10729,61 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf. 
return op.Output(0) } +// WriteAudioSummaryAttr is an optional argument to WriteAudioSummary. +type WriteAudioSummaryAttr func(optionalAttr) + +// WriteAudioSummaryMaxOutputs sets the optional max_outputs attribute to value. +// +// value: Max number of batch elements to generate audio for. +// If not specified, defaults to 3 +// +// REQUIRES: value >= 1 +func WriteAudioSummaryMaxOutputs(value int64) WriteAudioSummaryAttr { + return func(m optionalAttr) { + m["max_outputs"] = value + } +} + +// Writes a `Summary` protocol buffer with audio. +// +// The summary has up to `max_outputs` summary values containing audio. The +// audio is built from `tensor` which must be 3-D with shape `[batch_size, +// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are +// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. +// +// The `tag` argument is a scalar `Tensor` of type `string`. It is used to +// build the `tag` of the summary values: +// +// * If `max_outputs` is 1, the summary value tag is '*tag*/audio'. +// * If `max_outputs` is greater than 1, the summary value tags are +// generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tag: Scalar. Used to build the `tag` attribute of the summary values. +// tensor: 2-D of shape `[batch_size, frames]`. +// sample_rate: The sample rate of the signal in hertz. +// +// Returns the created operation. +func WriteAudioSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "WriteAudioSummary", + Input: []tf.Input{ + writer, global_step, tag, tensor, sample_rate, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + // FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D. type FusedResizeAndPadConv2DAttr func(optionalAttr) @@ -15797,6 +16011,30 @@ func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64 return op.Output(0) } +// Creates a summary file writer accessible by the given resource handle. +// +// Arguments: +// writer: A handle to the summary writer resource +// logdir: Directory where the event file will be written. +// max_queue: Size of the queue of pending events and summaries. +// flush_millis: How often, in milliseconds, to flush the pending events and +// summaries to disk. +// filename_suffix: Every event file's name is suffixed with this suffix. +// +// Returns the created operation. +func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, max_queue tf.Output, flush_millis tf.Output, filename_suffix tf.Output) (o *tf.Operation) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CreateSummaryFileWriter", + Input: []tf.Input{ + writer, logdir, max_queue, flush_millis, filename_suffix, + }, + } + return scope.AddOperation(opspec) +} + // EncodeBase64Attr is an optional argument to EncodeBase64. type EncodeBase64Attr func(optionalAttr) @@ -17172,6 +17410,84 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) ( return op.Output(0) } +// WriteImageSummaryAttr is an optional argument to WriteImageSummary. 
+type WriteImageSummaryAttr func(optionalAttr) + +// WriteImageSummaryMaxImages sets the optional max_images attribute to value. +// +// value: Max number of batch elements to generate images for. +// If not specified, defaults to 3 +// +// REQUIRES: value >= 1 +func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr { + return func(m optionalAttr) { + m["max_images"] = value + } +} + +// Writes a `Summary` protocol buffer with images. +// +// The summary has up to `max_images` summary values containing images. The +// images are built from `tensor` which must be 4-D with shape `[batch_size, +// height, width, channels]` and where `channels` can be: +// +// * 1: `tensor` is interpreted as Grayscale. +// * 3: `tensor` is interpreted as RGB. +// * 4: `tensor` is interpreted as RGBA. +// +// The images have the same number of channels as the input tensor. For float +// input, the values are normalized one image at a time to fit in the range +// `[0, 255]`. `uint8` values are unchanged. The op uses two different +// normalization algorithms: +// +// * If the input values are all positive, they are rescaled so the largest one +// is 255. +// +// * If any input value is negative, the values are shifted so input value 0.0 +// is at 127. They are then rescaled so that either the smallest value is 0, +// or the largest one is 255. +// +// The `tag` argument is a scalar `Tensor` of type `string`. It is used to +// build the `tag` of the summary values: +// +// * If `max_images` is 1, the summary value tag is '*tag*/image'. +// * If `max_images` is greater than 1, the summary value tags are +// generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. +// +// The `bad_color` argument is the color to use in the generated images for +// non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +// Each element must be in the range `[0, 255]` (It represents the value of a +// pixel in the output image). Non-finite values in the input tensor are +// replaced by this tensor in the output image. The default value is the color +// red. +// +// Arguments: +// writer: A handle to a summary writer. +// global_step: The step to write the summary for. +// tag: Scalar. Used to build the `tag` attribute of the summary values. +// tensor: 4-D of shape `[batch_size, height, width, channels]` where +// `channels` is 1, 3, or 4. +// bad_color: Color to use for pixels with non-finite values. +// +// Returns the created operation. +func WriteImageSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "WriteImageSummary", + Input: []tf.Input{ + writer, global_step, tag, tensor, bad_color, + }, + Attrs: attrs, + } + return scope.AddOperation(opspec) +} + // Pads a tensor with zeros. // // This operation pads a `input` with zeros according to the `paddings` you @@ -20306,7 +20622,7 @@ func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional . // gradients: The backpropagated gradients to the corresponding softsign operation. // features: The features passed as input to the corresponding softsign operation. // -// Returns The gradients: `gradients / (1 + abs(-features)) ** 2`. +// Returns The gradients: `gradients / (1 + abs(features)) ** 2`. 
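+//
+// Softsign is defined as softsign(x) = x / (1 + |x|), whose derivative is
+// 1 / (1 + |x|)^2; note that |-x| == |x|, so `abs(-features)` and
+// `abs(features)` describe the same quantity.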
func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) { if scope.Err() != nil { return diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD index 64b37677357..ee07fc48132 100644 --- a/tensorflow/java/BUILD +++ b/tensorflow/java/BUILD @@ -5,7 +5,9 @@ package(default_visibility = ["//visibility:private"]) licenses(["notice"]) # Apache 2.0 -load("build_defs", "JAVACOPTS") +load(":build_defs.bzl", "JAVACOPTS") +load(":src/gen/gen_ops.bzl", "tf_java_op_gen_srcjar") +load("//tensorflow:tensorflow.bzl", "tf_copts") java_library( name = "tensorflow", @@ -34,12 +36,58 @@ filegroup( filegroup( name = "java_op_sources", - srcs = glob(["src/main/java/org/tensorflow/op/**/*.java"]), + srcs = glob(["src/main/java/org/tensorflow/op/**/*.java"]) + [ + ":java_op_gen_sources", + ], visibility = [ "//tensorflow/java:__pkg__", ], ) +tf_java_op_gen_srcjar( + name = "java_op_gen_sources", + gen_base_package = "org.tensorflow.op", + gen_tool = "java_op_gen_tool", + ops_libs = [ + "array_ops", + "candidate_sampling_ops", + "control_flow_ops", + "data_flow_ops", + "image_ops", + "io_ops", + "linalg_ops", + "logging_ops", + "math_ops", + "nn_ops", + "no_op", + "parsing_ops", + "random_ops", + "sparse_ops", + "state_ops", + "string_ops", + "training_ops", + "user_ops", + ], +) + +# Build the gen tool as a library, as it will be linked to a core/ops binary +# file before making it an executable. See tf_java_op_gen_srcjar(). +cc_library( + name = "java_op_gen_tool", + srcs = glob([ + "src/gen/cc/*.h", + "src/gen/cc/*.cc", + ]), + copts = tf_copts(), + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + ], +) + java_library( name = "testutil", testonly = 1, diff --git a/tensorflow/java/maven/README.md b/tensorflow/java/maven/README.md index 17bb799961d..62277753618 100644 --- a/tensorflow/java/maven/README.md +++ b/tensorflow/java/maven/README.md @@ -1,11 +1,13 @@ # TensorFlow for Java using Maven -The [TensorFlow Java -API](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary) -is available through artifacts uploaded to [Maven -Central](https://oss.sonatype.org/content/repositories/snapshots/org/tensorflow/). -This document describes the process of updating the release artifacts. It does -_not_ describe how to use the artifacts, for which the reader is referred to the +The +[TensorFlow Java API](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary) +is available on Maven Central and JCenter through artifacts uploaded to +[OSS Sonatype](https://oss.sonatype.org/content/repositories/releases/org/tensorflow/) and +[Bintray](https://bintray.com/google/tensorflow/tensorflow) respectively. This +document describes the process of updating the release artifacts. It does _not_ +describe how to use the artifacts, for which the reader is referred to +the [TensorFlow for Java installation instructions](https://www.tensorflow.org/code/tensorflow/java/README.md). ## Background @@ -20,7 +22,7 @@ Hence, the process for building and uploading release artifacts is not a single ## Artifact Structure -There are five artifacts and thus `pom.xml`s involved in this release: +There are six artifacts and thus `pom.xml`s involved in this release: 1. `tensorflow`: The single dependency for projects requiring TensorFlow for Java. 
This convenience package depends on the two below, and is the one that @@ -37,16 +39,22 @@ There are five artifacts and thus `pom.xml`s involved in this release: 4. `proto`: Generated Java code for TensorFlow protocol buffers (e.g., `MetaGraphDef`, `ConfigProto` etc.) -5. [`parentpom`](https://maven.apache.org/pom/index.html): Common settings +5. `tensorflow-android`: A package geared towards + supporting [TensorFlow on Android](../../contrib/android/README.md), and is + a self-contained Android AAR library containing all necessary native and + Java code. + +6. [`parentpom`](https://maven.apache.org/pom/index.html): Common settings shared by all of the above. + ## Updating the release -The TensorFlow artifacts at Maven Central are created from files built as part -of the TensorFlow release process (which uses `bazel`). The author's lack of -familiarity with Maven best practices combined with the use of a different build -system means that this process is possibly not ideal, but it's what we've got. -Suggestions are welcome. +The Maven artifacts are created from files built as part of the TensorFlow +release process (which uses `bazel`). The author's lack of familiarity with +Maven best practices combined with the use of a different build system means +that this process is possibly not ideal, but it's what we've got. Suggestions +are welcome. In order to isolate the environment used for building, all release processes are conducted in a [Docker](https://www.docker.com) container. @@ -59,16 +67,28 @@ conducted in a [Docker](https://www.docker.com) container. account does not have permissions, then you'll need to ask someone who does to [file a ticket](https://issues.sonatype.org/) to add to the permissions ([sample ticket](https://issues.sonatype.org/browse/MVNCENTRAL-1637)). -- A GPG signing key, required [to sign the release artifacts](http://central.sonatype.org/pages/apache-maven.html#gpg-signed-components). +- An account at [bintray.com](https://bintray.com) that has permissions to + update the [tensorflow repository](https://bintray.com/google/tensorflow). + If your account does not have permissions, then you'll need to ask one of + the [organization administrators](https://bintray.com/google) to give you + permissions to update the `tensorflow` repository. Please keep the + [repository option](https://bintray.com/google/tensorflow/edit?tab=general) + to *"GPG sign uploaded files using Bintray's public/private key pair"* + **unchecked**, otherwise it will conflict with locally signed artifacts. +- A GPG signing key, required + [to sign the release artifacts](http://central.sonatype.org/pages/apache-maven.html#gpg-signed-components). -### Deploying to Maven Central +### Deploying to Sonatype and Bintray -1. Create a file with your OSSRH credentials (or perhaps you use `mvn` and have - it in `~/.m2/settings.xml`): +1. Create a file with your OSSRH credentials and + [Bintray API key](https://bintray.com/docs/usermanual/interacting/interacting_interacting.html#anchorAPIKEY) + (or perhaps you use `mvn` and have it in `~/.m2/settings.xml`): ```sh SONATYPE_USERNAME="your_sonatype.org_username_here" SONATYPE_PASSWORD="your_sonatype.org_password_here" + BINTRAY_USERNAME="your_bintray_username_here" + BINTRAY_API_KEY="your_bintray_api_key_here" GPG_PASSPHRASE="your_gpg_passphrase_here" cat >/tmp/settings.xml < @@ -78,19 +98,16 @@ conducted in a [Docker](https://www.docker.com) container. 
${SONATYPE_USERNAME} ${SONATYPE_PASSWORD} + + bintray + ${BINTRAY_USERNAME} + ${BINTRAY_API_KEY} + - - - ossrh - - true - - - gpg2 - ${GPG_PASSPHRASE} - - - + + gpg2 + ${GPG_PASSPHRASE} + EOF ``` @@ -98,30 +115,83 @@ conducted in a [Docker](https://www.docker.com) container. 2. Run the `release.sh` script. 3. If the script above succeeds then the artifacts would have been uploaded to - the private staging repository. After verifying the release, visit - https://oss.sonatype.org/#stagingRepositories, find the `org.tensorflow` - release and click on either `Release` to finalize the release, or `Drop` to - abort. Some things of note: + the private staging repository in Sonatype, and as unpublished artifacts in + Bintray. After verifying the release, you should finalize or abort the + release on both sites. +4. Visit https://oss.sonatype.org/#stagingRepositories, find the `org.tensorflow` + release and click on either `Release` to finalize the release, or `Drop` to + abort. + +5. Visit https://bintray.com/google/tensorflow/tensorflow, and select the + version you just uploaded. Notice there's a message about unpublished + artifacts. Click on either `Publish` to finalize the release, or `Discard` + to abort. + +6. Some things of note: - For details, look at the [Sonatype guide](http://central.sonatype.org/pages/releasing-the-deployment.html). - Syncing with [Maven Central](http://repo1.maven.org/maven2/org/tensorflow/) can take 10 minutes to 2 hours (as per the [OSSRH guide](http://central.sonatype.org/pages/ossrh-guide.html#releasing-to-central)). + - For Bintray details, refer to their guide on + [managing uploaded content](https://bintray.com/docs/usermanual/uploads/uploads_managinguploadedcontent.html#_publishing). -4. Upon successful release, commit changes to all the `pom.xml` files +7. Upon successful release, commit changes to all the `pom.xml` files (which should have the updated version number). ### Snapshots If the `TF_VERSION` provided to the `release.sh` script ends in `-SNAPSHOT`, then instead of using official release files, the nightly build artifacts from -https://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/ and -https://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow-windows/ will -be used to upload to the Maven Central snapshots repository. +https://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/, +https://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow-windows/ and +https://ci.tensorflow.org/view/Nightly/job/nightly-android +will be used to upload to the Maven Central snapshots repository. (Note that +snapshots are only uploaded to Maven Central, not Bintray.) +### Skip deploying to a repository + +Should you need, setting environment variables `DEPLOY_OSSRH=0` or +`DEPLOY_BINTRAY=0` when calling `release.sh` will skip deploying to OSSRH or +Bintray respectively. Note that snapshots are only uploaded to OSSRH, so you +cannot skip deploying to OSSRH for a `-SNAPSHOT` version. + +## The overall flow + +This section provides some pointers around how artifacts are currently +assembled. + +All native and java code is first built and tested on +a [Tensorflow Jenkins server](https://ci.tensorflow.org/) which run various +scripts under the [`tools/ci_build`](../../tools/ci_build/) directory. Of +particular interest may be `tools/ci_build/builds/libtensorflow.sh` which +bundles Java-related build sources and outputs into archives, and +`tools/ci_build/builds/android_full.sh` which produces an Android AAR package. 
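As noted in the *Skip deploying to a repository* section above, the `DEPLOY_OSSRH` and `DEPLOY_BINTRAY` switches compose with a normal invocation of `release.sh`. A minimal sketch, run from `tensorflow/java/maven` (the version number and settings path below are placeholders):

```sh
# Stage a release to OSSRH only, skipping the Bintray upload.
DEPLOY_BINTRAY=0 ./release.sh 1.4.0 /tmp/settings.xml
```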
+ +Maven artifacts however are not created in Jenkins. Instead, artifacts are +created and deployed externally on-demand, when a maintainer runs the +`release.sh` script. + +This script spins up a Docker instance which downloads the archives created by +successful runs of various `tools/ci_build` scripts on the Tensorflow Jenkins +server. + +It organizes these archives locally into a maven-friendly layout, and runs `mvn +deploy` to create maven artifacts within the container. Native libraries built +in Jenkins are used as-is, but srcjars for java code are used to compile class +files and generate javadocs.) It also downloads the Android AAR from the Jenkins +server and directly deploys it via `mvn gpg:sign-and-deploy-file`. + +`release.sh` then stages these artifacts to OSSRH and Bintray, and if all goes +well a maintainer can log into both sites to promote them as a new release. + +There is a small change to the flow for a standard (rather than a `-SNAPSHOT`) +release. Rather than downloading archives directly from jobs on the Jenkins +server, the script uses a static repository of QA-blessed archives. ## References - [Sonatype guide](http://central.sonatype.org/pages/ossrh-guide.html) for hosting releases. - [Ticket that created the `org/tensorflow` configuration](https://issues.sonatype.org/browse/OSSRH-28072) on OSSRH. +- The [Bintray User Manual](https://bintray.com/docs/usermanual/index.html) diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml index cc4fbc4a750..0a3552d7563 100644 --- a/tensorflow/java/maven/pom.xml +++ b/tensorflow/java/maven/pom.xml @@ -33,18 +33,35 @@ proto - - - - + + + ossrh - https://oss.sonatype.org/content/repositories/snapshots - - - ossrh - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + bintray + + + + bintray + https://api.bintray.com/maven/google/tensorflow/tensorflow/;publish=0 + + + + @@ -55,19 +72,6 @@ - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.7 - true - - ossrh - https://oss.sonatype.org/ - - false - - org.apache.maven.plugins diff --git a/tensorflow/java/maven/release.sh b/tensorflow/java/maven/release.sh index b95a4d4674e..9012ea14ea6 100755 --- a/tensorflow/java/maven/release.sh +++ b/tensorflow/java/maven/release.sh @@ -49,6 +49,8 @@ fi set -ex docker run \ -e TF_VERSION="${TF_VERSION}" \ + -e DEPLOY_OSSRH="${DEPLOY_OSSRH:-true}" \ + -e DEPLOY_BINTRAY="${DEPLOY_BINTRAY:-true}" \ -v ${PWD}:/tensorflow \ -v "${SETTINGS_XML}":/root/.m2/settings.xml \ -v ${HOME}/.gnupg:/root/.gnupg \ diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh index 6b4d5d70327..a2ce0971954 100644 --- a/tensorflow/java/maven/run_inside_container.sh +++ b/tensorflow/java/maven/run_inside_container.sh @@ -19,11 +19,23 @@ RELEASE_URL_PREFIX="https://storage.googleapis.com/tensorflow/libtensorflow" + +# By default we deploy to both ossrh and bintray. These two +# environment variables can be set to skip either repository. +DEPLOY_BINTRAY="${DEPLOY_BINTRAY:-true}" +DEPLOY_OSSRH="${DEPLOY_OSSRH:-true}" + IS_SNAPSHOT="false" if [[ "${TF_VERSION}" == *"-SNAPSHOT" ]]; then IS_SNAPSHOT="true" + # Bintray does not allow snapshots. 
+ DEPLOY_BINTRAY="false" fi PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip" +if [[ "${DEPLOY_BINTRAY}" != "true" && "${DEPLOY_OSSRH}" != "true" ]]; then + echo "Must deploy to at least one of Bintray or OSSRH" >&2 + exit 2 +fi set -ex @@ -39,6 +51,20 @@ update_version_in_pom() { mvn versions:set -DnewVersion="${TF_VERSION}" } +# Fetch a property from pom files for a given profile. +# Arguments: +# profile - name of the selected profile. +# property - name of the property to be retrieved. +# Output: +# Echo property value to stdout +mvn_property() { + local profile="$1" + local prop="$2" + mvn -q --non-recursive exec:exec -P "${profile}" \ + -Dexec.executable='echo' \ + -Dexec.args="\${${prop}}" +} + download_libtensorflow() { if [[ "${IS_SNAPSHOT}" == "true" ]]; then URL="http://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/TYPE=cpu-slave/lastSuccessfulBuild/artifact/lib_package/libtensorflow-src.jar" @@ -137,29 +163,50 @@ generate_java_protos() { rm -rf "${DIR}/proto/tmp" } +# Deploy artifacts using a specific profile. +# Arguments: +# profile - name of selected profile. +# Outputs: +# n/a +deploy_profile() { + local profile="$1" + # Deploy the non-android pieces. + mvn deploy -P"${profile}" + # Determine the correct pom file property to use + # for the repository url. + local rtype + if [[ "${IS_SNAPSHOT}" == "true" ]]; then + rtype='snapshotRepository' + else + rtype='repository' + fi + local url=$(mvn_property "${profile}" "project.distributionManagement.${rtype}.url") + local repositoryId=$(mvn_property "${profile}" "project.distributionManagement.${rtype}.id") + mvn gpg:sign-and-deploy-file \ + -Dfile="${DIR}/tensorflow-android/target/tensorflow.aar" \ + -DpomFile="${DIR}/tensorflow-android/target/pom-android.xml" \ + -Durl="${url}" \ + -DrepositoryId="${repositoryId}" +} + # If successfully built, try to deploy. # If successfully deployed, clean. # If deployment fails, debug with # ./release.sh ${TF_VERSION} ${SETTINGS_XML} bash # To get a shell to poke around the maven artifacts with. deploy_artifacts() { - # This deploys the non-android pieces - mvn deploy - - # Sign and deploy the previously downloaded aar file as a single - # maven artifact. - if [[ "${IS_SNAPSHOT}" == "true" ]]; then - REPO="https://oss.sonatype.org/content/repositories/snapshots" - else - REPO="https://oss.sonatype.org/service/local/staging/deploy/maven2/" + # Deploy artifacts to ossrh if requested. + if [[ "${DEPLOY_OSSRH}" == "true" ]]; then + deploy_profile 'ossrh' + fi + # Deploy artifacts to bintray if requested. + if [[ "${DEPLOY_BINTRAY}" == "true" ]]; then + deploy_profile 'bintray' fi - mvn gpg:sign-and-deploy-file -Dfile="${DIR}/tensorflow-android/target/tensorflow.aar" -DpomFile="${DIR}/tensorflow-android/target/pom-android.xml" -Durl=${REPO} -DrepositoryId=ossrh - # Clean up when everything works clean } - if [ -z "${TF_VERSION}" ] then echo "Must set the TF_VERSION environment variable" @@ -189,8 +236,14 @@ set +ex if [[ "${IS_SNAPSHOT}" == "false" ]]; then echo "Uploaded to the staging repository" echo "After validating the release: " - echo "1. Login to https://oss.sonatype.org/#stagingRepositories" - echo "2. 
Find the 'org.tensorflow' staging release and click either 'Release' to release or 'Drop' to abort" + if [[ "${DEPLOY_OSSRH}" == "true" ]]; then + echo "* Login to https://oss.sonatype.org/#stagingRepositories" + echo "* Find the 'org.tensorflow' staging release and click either 'Release' to release or 'Drop' to abort" + fi + if [[ "${DEPLOY_BINTRAY}" == "true" ]]; then + echo "* Login to https://bintray.com/google/tensorflow/tensorflow" + echo "* Either 'Publish' unpublished items to release, or 'Discard' to abort" + fi else echo "Uploaded to the snapshot repository" fi diff --git a/tensorflow/java/src/gen/cc/op_gen_main.cc b/tensorflow/java/src/gen/cc/op_gen_main.cc new file mode 100644 index 00000000000..bc698124bf9 --- /dev/null +++ b/tensorflow/java/src/gen/cc/op_gen_main.cc @@ -0,0 +1,82 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/util/command_line_flags.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/java/src/gen/cc/op_generator.h" + +namespace tensorflow { +namespace op_gen { + +const char kUsageHeader[] = + "\n\nGenerator of operation wrappers in Java.\n\n" + "This executable generates wrappers for all registered operations it has " + "been compiled with. A wrapper exposes an intuitive and strongly-typed\n" + "interface for building its underlying operation and linking it into a " + "graph.\n\n" + "Operation wrappers are generated under the path specified by the " + "'--output_dir' argument. This path can be absolute or relative to the\n" + "current working directory and will be created if it does not exists.\n\n" + "The '--lib_name' argument is used to classify the set of operations. If " + "the chosen name contains more than one word, it must be provided in \n" + "snake_case. This value is declined into other meaningful names, such as " + "the group and package of the generated operations. For example,\n" + "'--lib_name=my_lib' generates the operations under the " + "'org.tensorflow.op.mylib' package and add them to the 'myLib()' operator\n" + "group.\n\n" + "Note that the operator group assigned to the generated wrappers is just " + "an annotation tag at this stage. 
Operations will not be available through\n" + "the 'org.tensorflow.op.Ops' API as a group until the generated classes " + "are compiled using an appropriate annotation processor.\n\n" + "Finally, the '--base_package' overrides the default parent package " + "under which the generated subpackage and classes are to be located.\n\n"; + +} // namespace op_gen +} // namespace tensorflow + +int main(int argc, char* argv[]) { + tensorflow::string lib_name; + tensorflow::string output_dir; + tensorflow::string base_package = "org.tensorflow.op"; + std::vector flag_list = { + tensorflow::Flag("output_dir", &output_dir, + "Root directory into which output files are generated"), + tensorflow::Flag("lib_name", &lib_name, + "A name, in snake_case, used to classify this set of operations"), + tensorflow::Flag("base_package", &base_package, + "Package parent to the generated subpackage and classes") + }; + tensorflow::string usage = tensorflow::op_gen::kUsageHeader; + usage += tensorflow::Flags::Usage(argv[0], flag_list); + bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); + tensorflow::port::InitMain(usage.c_str(), &argc, &argv); + QCHECK(parsed_flags_ok && !lib_name.empty() && !output_dir.empty()) << usage; + + tensorflow::OpGenerator generator; + tensorflow::OpList ops; + tensorflow::OpRegistry::Global()->Export(true, &ops); + tensorflow::Status status = + generator.Run(ops, lib_name, base_package, output_dir); + TF_QCHECK_OK(status); + + return 0; +} diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc new file mode 100644 index 00000000000..814a08c6ccf --- /dev/null +++ b/tensorflow/java/src/gen/cc/op_generator.cc @@ -0,0 +1,68 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/java/src/gen/cc/op_generator.h" + +namespace tensorflow { +namespace { + +string CamelCase(const string& str, char delimiter, bool upper) { + string result; + bool cap = upper; + for (string::const_iterator it = str.begin(); it != str.end(); ++it) { + const char c = *it; + if (c == delimiter) { + cap = true; + } else if (cap) { + result += toupper(c); + cap = false; + } else { + result += c; + } + } + return result; +} + +} // namespace + +OpGenerator::OpGenerator() + : env(Env::Default()) { +} + +OpGenerator::~OpGenerator() {} + +Status OpGenerator::Run(const OpList& ops, const string& lib_name, + const string& base_package, const string& output_dir) { + const string package = + base_package + '.' 
+ str_util::StringReplace(lib_name, "_", "", true); + const string package_path = + output_dir + '/' + str_util::StringReplace(package, ".", "/", true); + const string group = CamelCase(lib_name, '_', false); + + if (!env->FileExists(package_path).ok()) { + TF_CHECK_OK(env->RecursivelyCreateDir(package_path)); + } + + LOG(INFO) << "Generating Java wrappers for '" << lib_name << "' operations"; + // TODO(karllessard) generate wrappers from list of ops + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/java/src/gen/cc/op_generator.h b/tensorflow/java/src/gen/cc/op_generator.h new file mode 100644 index 00000000000..98a1f8d5346 --- /dev/null +++ b/tensorflow/java/src/gen/cc/op_generator.h @@ -0,0 +1,51 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_ +#define TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_ + +#include <string> + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +/// \brief A generator of Java operation wrappers. +/// +/// Such a generator is normally run only once per executable, outputting +/// wrappers for all the registered operations it has been compiled with. +/// Nonetheless, it is designed to support multiple runs, giving a different +/// list of operations on each cycle. +class OpGenerator { + public: + OpGenerator(); + virtual ~OpGenerator(); + + /// \brief Generates wrappers for the given list of 'ops'. + /// + /// Output files are generated under 'output_dir'/'base_package'/'lib_package', + /// where 'lib_package' is derived from 'lib_name'. + Status Run(const OpList& ops, const string& lib_name, + const string& base_package, const string& output_dir); + + private: + Env* env; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_ diff --git a/tensorflow/java/src/gen/gen_ops.bzl b/tensorflow/java/src/gen/gen_ops.bzl new file mode 100644 index 00000000000..e0d5556122b --- /dev/null +++ b/tensorflow/java/src/gen/gen_ops.bzl @@ -0,0 +1,59 @@ +# -*- Python -*- + +load("//tensorflow:tensorflow.bzl", "tf_copts") + +# Given a list of "ops_libs" (a list of files in the core/ops directory +# without their .cc extensions), generate Java wrapper code for all operations +# found in the ops files. +# Then, combine all those source files into a single archive (.srcjar).
+# +# For example: +# tf_java_op_gen_srcjar("gen_sources", "gen_tool", "my.package", [ "array_ops", "math_ops" ]) +# +# will create a genrule named "gen_sources" that first generate source files: +# ops/src/main/java/my/package/array/*.java +# ops/src/main/java/my/package/math/*.java +# +# and then archive those source files in: +# ops/gen_sources.srcjar +# +def tf_java_op_gen_srcjar(name, + gen_tool, + gen_base_package, + ops_libs=[], + ops_libs_pkg="//tensorflow/core", + out_dir="ops/", + out_src_dir="src/main/java/", + visibility=["//tensorflow/java:__pkg__"]): + + gen_tools = [] + gen_cmds = ["rm -rf $(@D)"] # Always start from fresh when generating source files + + # Construct an op generator binary for each ops library. + for ops_lib in ops_libs: + gen_lib = ops_lib[:ops_lib.rfind('_')] + out_gen_tool = out_dir + ops_lib + "_gen_tool" + + native.cc_binary( + name=out_gen_tool, + copts=tf_copts(), + linkopts=["-lm"], + linkstatic=1, # Faster to link this one-time-use binary dynamically + deps=[gen_tool, ops_libs_pkg + ":" + ops_lib + "_op_lib"]) + + gen_tools += [":" + out_gen_tool] + gen_cmds += ["$(location :" + out_gen_tool + ")" + + " --output_dir=$(@D)/" + out_src_dir + + " --lib_name=" + gen_lib + + " --base_package=" + gen_base_package] + + # Generate a source archive containing generated code for these ops. + gen_srcjar = out_dir + name + ".srcjar" + gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) ."] + + native.genrule( + name=name, + srcs=["@local_jdk//:jar"] + ["@local_jdk//:jdk"], + outs=[gen_srcjar], + tools=gen_tools, + cmd='&&'.join(gen_cmds)) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 6597889fbcb..26e0f86c37b 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1766,6 +1766,8 @@ py_library( srcs_version = "PY2AND3", deps = [ ":array_ops", + ":array_ops_gen", + ":dtypes", ":framework_ops", ":resource_variable_ops_gen", ":tensor_shape", @@ -1775,7 +1777,7 @@ py_library( "//tensorflow/python/eager:context", "//tensorflow/python/eager:custom_gradient", "//tensorflow/python/eager:tape", - "//tensorflow/python/eager:tensor", + "//tensorflow/python/eager:tensor_node", ], ) @@ -2347,7 +2349,7 @@ cuda_py_test( cuda_py_test( name = "gradients_test", - size = "small", + size = "medium", srcs = ["ops/gradients_test.py"], additional_deps = [ ":array_grad", @@ -2550,6 +2552,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":array_ops", + ":checkpoint_ops_gen", ":client", ":control_flow_ops", ":data_flow_ops", @@ -3573,6 +3576,28 @@ py_test( ], ) +py_test( + name = "checkpoint_ops_test", + size = "small", + srcs = ["training/checkpoint_ops_test.py"], + srcs_version = "PY2AND3", + tags = ["no_windows"], + deps = [ + ":checkpoint_ops_gen", + ":client", + ":client_testlib", + ":framework_for_generated_wrappers", + ":io_ops", + ":partitioned_variables", + ":platform", + ":pywrap_tensorflow", + ":state_ops", + ":training", + ":variable_scope", + ":variables", + ], +) + py_test( name = "monitored_session_test", size = "small", diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i index 08dd3922dbe..fa49e66e87b 100644 --- a/tensorflow/python/client/tf_session.i +++ b/tensorflow/python/client/tf_session.i @@ -373,6 +373,33 @@ def TF_Reset(target, containers=None, config=None): TF_DeleteSessionOptions(opts) %} +// We use TF_GraphToFunction_wrapper instead of TF_GraphToFunction +%ignore TF_GraphToFunction; +// TF_GraphToFunction_wrapper does not use any Python methods and 
+// does not require GIL to be held. +%unignore TF_GraphToFunction_wrapper; + +// $input is a Python list of wrapped TF_Operations +%typemap(in) (const std::vector<TF_Operation*>* opers) + (std::vector<TF_Operation*> opers) { + if ($input != Py_None) { + if (!PyList_Check($input)) { + SWIG_exception_fail(SWIG_TypeError, "$symname: expected list"); + } + size_t size = PyList_Size($input); + for (int i = 0; i < size; ++i) { + PyObject* item = PyList_GetItem($input, i); + TF_Operation* oper_ptr; + SWIG_ConvertPtr(item, reinterpret_cast<void**>(&oper_ptr), + $descriptor(TF_Operation*), 0); + opers.push_back(oper_ptr); + } + $1 = &opers; + } else { + $1 = nullptr; + } +} + %include "tensorflow/python/client/tf_session_helper.h" %unignoreall diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc index 60a589fa8bb..72f560fa878 100644 --- a/tensorflow/python/client/tf_session_helper.cc +++ b/tensorflow/python/client/tf_session_helper.cc @@ -337,4 +337,38 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper( return control_inputs; } +TF_Function* TF_GraphToFunction_wrapper(const TF_Graph* fn_body, + const char* fn_name, + const std::vector<TF_Operation*>* opers, + const std::vector<TF_Output>& inputs, + const std::vector<TF_Output>& outputs, + const NameVector& output_names, + const TF_FunctionOptions* opts, + TF_Status* out_status) { + if (!output_names.empty() && output_names.size() != outputs.size()) { + Set_TF_Status_from_Status( + out_status, + errors::InvalidArgument( + "output names must be either empty or equal in size to outputs. ", + "output names size = ", output_names.size(), + " outputs size = ", outputs.size())); + return nullptr; + } + + int nopers = -1; + const TF_Operation* const* opers_array = nullptr; + if (opers != nullptr) { + nopers = opers->size(); + opers_array = opers->data(); + } + + const char** output_names_ptr = + output_names.empty() ? nullptr + : const_cast<const char**>(output_names.data()); + + return TF_GraphToFunction(fn_body, fn_name, nopers, opers_array, + inputs.size(), inputs.data(), outputs.size(), + outputs.data(), output_names_ptr, opts, out_status); + } + } // namespace tensorflow diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h index 3bc63f822fe..8fae6206c07 100644 --- a/tensorflow/python/client/tf_session_helper.h +++ b/tensorflow/python/client/tf_session_helper.h @@ -148,6 +148,16 @@ void TF_SessionPRun_wrapper(TF_Session* session, const char* handle, std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper( TF_Operation* oper); +// `opers` equaling NULL are converted to `nopers = -1`. +// `output_names` must be empty or have the same length as `outputs`.
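+// If `output_names` is non-empty but its size does not match `outputs`, NULL is returned and `out_status` is set to an InvalidArgument error.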
+TF_Function* TF_GraphToFunction_wrapper(const TF_Graph* fn_body, + const char* fn_name, + const std::vector<TF_Operation*>* opers, + const std::vector<TF_Output>& inputs, + const std::vector<TF_Output>& outputs, + const NameVector& output_names, + const TF_FunctionOptions* opts, + TF_Status* out_status); } // namespace tensorflow #endif // TENSORFLOW_PYTHON_CLIENT_TF_SESSION_HELPER_H_ diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index ca3ad1a2c33..326f56ebf9b 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -169,10 +169,6 @@ def _record_gradient(op_name, inputs, attrs, results, name): execute.record_gradient = _record_gradient -def _ones(shape, dtype): - return array_ops.fill(shape, tensor.Tensor(1, dtype=dtype)) - - def _aggregate_grads(gradients): """Aggregate gradients of the same tensor.""" grad_lists = dict() @@ -225,7 +221,7 @@ def implicit_val_and_grad(f): (end_node.progenitors, repr(start_node))) output_gradients = kwds.get("output_gradients", None) if output_gradients is None: - output_gradients = _ones(end_node.shape, end_node.dtype) + output_gradients = array_ops.ones_like(end_node.value) grad = ag_core.backward_pass(output_gradients, end_node, start_node) return end_node.value, _aggregate_grads(grad.gradients) diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py index 010124ed56a..b4379055096 100644 --- a/tensorflow/python/eager/backprop_test.py +++ b/tensorflow/python/eager/backprop_test.py @@ -85,7 +85,7 @@ class BackpropTest(test.TestCase): initial_value=tensor.Tensor(1.0), name='x') def fn(): - tape.watch(x.handle) + tape.watch_variable(x) b = tensor.Tensor(2.0) c = math_ops.add(x.value(), b) return math_ops.add(c, tensor.Tensor(3.0)) @@ -307,6 +307,20 @@ class BackpropTest(test.TestCase): [tensor_shape.TensorShape(s).as_proto() for s in shape_list], backprop.make_attr([pywrap_tensorflow.TF_ATTR_SHAPE], shape_list)) + def testMultiValueConvertToTensor(self): + x = resource_variable_ops.ResourceVariable( + initial_value=array_ops.constant([1.0]), name='x') + + def fn(): + tape.watch_variable(x) + a = math_ops.add(x.value(), 1.0) + # Make sure convert_to_tensor works correctly with list of TensorNodes.
+ b = array_ops.stack([a, a], axis=0) + return math_ops.reduce_mean(b) + + grad = backprop.implicit_grad(fn)()[0][1] + self.assertAllEqual([1.0], grad.numpy()) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 27ffdd98105..a5a93b7bbe0 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -171,16 +171,6 @@ class Context(object): """Sets summary writer resource.""" self._summary_writer_resource = resource - @property - def recording_summaries(self): - """Returns True if recording summaries is enabled in current thread..""" - return self._eager_context.recording_summaries - - @recording_summaries.setter - def recording_summaries(self, val): - """Enables recording summaries is enabled in current thread..""" - self._eager_context.recording_summaries = val - @property def device_name(self): """Returns the device name for the current thread.""" @@ -360,24 +350,6 @@ def device(name): return context().device(name) -@contextlib.contextmanager -def record_summaries(): - """Context-manager to enable recording of summaries.""" - ctx = context() - old = ctx.recording_summaries - ctx.recording_summaries = True - try: - yield - finally: - ctx.recording_summaries = old - - -def should_record_summary(): - """True if a summary should be recorded now.""" - c = context() - return c.recording_summaries and c.summary_writer_resource is not None - - def run(main=None, argv=None): """Runs the program with an optional 'main' function and 'argv' list. diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py index 7ae80aa156a..5de396f62c3 100644 --- a/tensorflow/python/eager/core_test.py +++ b/tensorflow/python/eager/core_test.py @@ -55,10 +55,6 @@ class TFETest(test_util.TensorFlowTestCase): ctx.summary_writer_resource = 'mock' self.assertEqual('mock', ctx.summary_writer_resource) - self.assertFalse(ctx.recording_summaries) - ctx.recording_summaries = True - self.assertTrue(ctx.recording_summaries) - self.assertEqual('', ctx.device_name) self.assertEqual(ctx.device_name, ctx.device_spec.to_string()) with ctx.device('GPU:0'): @@ -95,8 +91,7 @@ class TFETest(test_util.TensorFlowTestCase): return [ ctx.in_graph_mode(), ctx.in_eager_mode(), ctx.scope_name, ctx.summary_writer_resource, - ctx.recording_summaries, ctx.device_name, - ctx.num_gpus() + ctx.device_name, ctx.num_gpus() ] def get_values(ctx, values): diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 18b722e7923..c15dde9e487 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -29,6 +29,7 @@ from tensorflow.python.framework import function as tf_function from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops class FunctionTest(test.TestCase): @@ -52,6 +53,19 @@ class FunctionTest(test.TestCase): out = sq(t) self.assertAllEqual(out.numpy(), math_ops.matmul(t, t).numpy()) + def testGraphModeWithGradients(self): + v = resource_variable_ops.ResourceVariable(1.0) + + @function.defun + def step(): + def inner(): + tape.watch(v.handle) + return v * v + + return backprop.implicit_grad(inner)()[0][1] + + self.assertAllEqual(step().numpy(), 2.0) + def testTensorConversionWithDefun(self): @function.defun diff --git a/tensorflow/python/eager/python_eager_op_gen.cc 
b/tensorflow/python/eager/python_eager_op_gen.cc index c46a3d8db37..62579bd23ae 100644 --- a/tensorflow/python/eager/python_eager_op_gen.cc +++ b/tensorflow/python/eager/python_eager_op_gen.cc @@ -659,14 +659,26 @@ void GenEagerPythonOp::AddEagerExecute(const string& num_outputs_expr) { string GetEagerPythonOps(const OpList& ops, const std::vector& hidden_ops, - bool require_shapes) { + bool require_shapes, + const string& source_file_name = "") { + string result; // Header // TODO(josh11b): Mention the library for which wrappers are being generated. - strings::StrAppend(&result, R"("""Python wrappers for TensorFlow ops. + strings::StrAppend(&result, R"("""Python wrappers around TensorFlow ops. This file is MACHINE GENERATED! Do not edit. -""" +)"); + + // Mention the original source file so someone tracing back through generated + // Python code will know where to look next. + if (!source_file_name.empty()) { + strings::StrAppend(&result, "Original C++ source file: "); + strings::StrAppend(&result, source_file_name); + strings::StrAppend(&result, "\n"); + } + + strings::StrAppend(&result, R"(""" import collections as _collections @@ -747,8 +759,11 @@ from tensorflow.python.framework import op_def_library as _op_def_library void PrintEagerPythonOps(const OpList& ops, const std::vector& hidden_ops, - bool require_shapes) { - printf("%s", GetEagerPythonOps(ops, hidden_ops, require_shapes).c_str()); + bool require_shapes, + const string& source_file_name) +{ + printf("%s", GetEagerPythonOps(ops, hidden_ops, require_shapes, + source_file_name).c_str()); } string GetEagerPythonWrappers(const char* op_list_buf, size_t op_list_len) { diff --git a/tensorflow/python/eager/python_eager_op_gen.h b/tensorflow/python/eager/python_eager_op_gen.h index 9a7ed28cf94..250623850f2 100644 --- a/tensorflow/python/eager/python_eager_op_gen.h +++ b/tensorflow/python/eager/python_eager_op_gen.h @@ -24,9 +24,12 @@ namespace tensorflow { // hidden_ops should be a list of Op names that should get a leading _ // in the output. Prints the output to stdout. +// Optional fourth argument is the name of the original C++ source file +// where the ops' REGISTER_OP() calls reside. void PrintEagerPythonOps(const OpList& ops, const std::vector& hidden_ops, - bool require_shapes); + bool require_shapes, + const string& source_file_name = ""); // Get the python wrappers for a list of ops in a OpList. // `op_list_buf` should be a pointer to a buffer containing diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py index 4d09db73c97..9cd29f630df 100644 --- a/tensorflow/python/eager/tape.py +++ b/tensorflow/python/eager/tape.py @@ -151,6 +151,15 @@ def watch(tensor): return tensor +def watch_variable(resource_variable): + """Marks this ResourceVariable to be watched by all tapes in the stack. + + Args: + resource_variable: A ResourceVariable to be watched. 
+ """ + watch(resource_variable.handle) # py-lint: disable=protected-access + + def pop_tape(): """Pops the top tape in the stack, if any.""" if _tape_stack.stack: diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py index 8d0f639ddcb..bd8e653b976 100644 --- a/tensorflow/python/eager/tensor_test.py +++ b/tensorflow/python/eager/tensor_test.py @@ -77,8 +77,8 @@ class TFETensorTest(test_util.TensorFlowTestCase): def testMultiLineTensorStr(self): t = tensor.Tensor(np.eye(3)) tensor_str = str(t) - self.assertIn("shape=%s, dtype=%s, " % (t.shape, t.dtype.name), tensor_str) - self.assertIn("numpy=\n%s" % t.numpy(), tensor_str) + self.assertIn("shape=%s, dtype=%s" % (t.shape, t.dtype.name), tensor_str) + self.assertIn(str(t.numpy()), tensor_str) def testMultiLineTensorRepr(self): t = tensor.Tensor(np.eye(3)) @@ -95,7 +95,7 @@ class TFETensorTest(test_util.TensorFlowTestCase): np.set_printoptions(threshold=2, edgeitems=1) t = tensor.Tensor(np.arange(10, dtype=np.int32)) - self.assertIn("numpy=[0 ..., 9]", str(t)) + self.assertIn("[0 ..., 9]", str(t)) self.assertIn("[0, ..., 9]", repr(t)) # Clean up: reset to previous printoptions. @@ -103,7 +103,7 @@ class TFETensorTest(test_util.TensorFlowTestCase): def testZeroDimTensorStr(self): t = tensor.Tensor(42) - self.assertIn("shape=(), dtype=int32, numpy=42", str(t)) + self.assertIn("42, shape=(), dtype=int32", str(t)) def testZeroDimTensorRepr(self): t = tensor.Tensor(42) @@ -113,7 +113,7 @@ class TFETensorTest(test_util.TensorFlowTestCase): def testZeroSizeTensorStr(self): t = tensor.Tensor(np.zeros(0, dtype=np.float32)) - self.assertIn("shape=(0,), dtype=float32, numpy=[]", str(t)) + self.assertIn("[], shape=(0,), dtype=float32", str(t)) def testZeroSizeTensorRepr(self): t = tensor.Tensor(np.zeros(0, dtype=np.float32)) @@ -127,8 +127,8 @@ class TFETensorTest(test_util.TensorFlowTestCase): t = tensor.Tensor(42) # Force change dtype to a numpy-unprintable type. 
t._dtype = dtypes.resource - self.assertIn("numpy=", str(t)) - self.assertIn("numpy=", repr(t)) + self.assertIn("", str(t)) + self.assertIn("", repr(t)) def testStringTensor(self): t_np_orig = np.array([[b"a", b"ab"], [b"abc", b"abcd"]]) diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD index 83eeeb35b67..167f9b10543 100644 --- a/tensorflow/python/estimator/BUILD +++ b/tensorflow/python/estimator/BUILD @@ -148,6 +148,7 @@ py_test( name = "dnn_test", size = "medium", srcs = ["canned/dnn_test.py"], + shard_count = 4, srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -201,7 +202,7 @@ py_test( name = "dnn_linear_combined_test", size = "medium", srcs = ["canned/dnn_linear_combined_test.py"], - shard_count = 4, + shard_count = 8, srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -552,11 +553,9 @@ py_test( name = "linear_test", size = "medium", srcs = ["canned/linear_test.py"], + shard_count = 4, srcs_version = "PY2AND3", - tags = [ - "no_pip", - "noasan", # times out b/63680444 - ], + tags = ["no_pip"], deps = [ ":linear", ":linear_testing_utils", diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py index d2c5772483b..80d109d927a 100644 --- a/tensorflow/python/estimator/canned/head.py +++ b/tensorflow/python/estimator/canned/head.py @@ -200,8 +200,11 @@ def _check_labels(labels, expected_labels_dimension): dim1 = static_shape[1] if (dim1 is not None) and (dim1 != expected_labels_dimension): raise ValueError( - 'labels shape must be [batch_size, labels_dimension], got %s.' % - (static_shape,)) + 'Mismatched label shape. ' + 'Classifier configured with n_classes=%s. Received %s. ' + 'Suggested Fix: check your n_classes argument to the estimator ' + 'and/or the shape of your label.' % + (expected_labels_dimension, dim1)) assert_dimension = check_ops.assert_equal( expected_labels_dimension, labels_shape[1], message=err_msg) with ops.control_dependencies([assert_dimension]): diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py index 23678013c66..fa3d5b44eb6 100644 --- a/tensorflow/python/estimator/canned/head_test.py +++ b/tensorflow/python/estimator/canned/head_test.py @@ -139,7 +139,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase): features = {'x': np.array(((42.,),))} # Static shape. - with self.assertRaisesRegexp(ValueError, 'labels shape'): + with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'): head.create_loss( features=features, mode=model_fn.ModeKeys.EVAL, @@ -889,7 +889,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase): logits_2x1 = np.array(((45.,), (41.,),)) # Static shape. - with self.assertRaisesRegexp(ValueError, 'labels shape'): + with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'): head.create_loss( features={'x': np.array(((42.,),))}, mode=model_fn.ModeKeys.EVAL, @@ -1692,7 +1692,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase): values_1d = np.array(((43.,), (44.,),)) # Static shape. - with self.assertRaisesRegexp(ValueError, 'labels shape'): + with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'): head.create_loss( features={'x': values_1d}, mode=model_fn.ModeKeys.EVAL, @@ -1737,7 +1737,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase): values_1d = np.array(((43.,), (44.,),)) # Static shape. 
- with self.assertRaisesRegexp(ValueError, 'labels shape'): + with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'): head.create_loss( features={'x': values_1d}, mode=model_fn.ModeKeys.TRAIN, diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index a8434d0c991..f64235d70b0 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2474,6 +2474,11 @@ class _IndicatorColumn(_DenseColumn, sp_ids=id_tensor, sp_values=weight_tensor, vocab_size=int(self._variable_shape[-1])) + # Remove (?, -1) index + weighted_column = sparse_ops.sparse_slice( + weighted_column, + [0, 0], + weighted_column.dense_shape) return sparse_ops.sparse_tensor_to_dense(weighted_column) dense_id_tensor = sparse_ops.sparse_tensor_to_dense( diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 5138f31e981..e707770f8a3 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -3213,13 +3213,39 @@ class IndicatorColumnTest(test.TestCase): weights = fc.weighted_categorical_column(ids, 'weights') indicator = fc.indicator_column(weights) features = { - 'ids': constant_op.constant(['c', 'b', 'a'], shape=(1, 3)), - 'weights': constant_op.constant([2., 4., 6.], shape=(1, 3)) + 'ids': constant_op.constant([['c', 'b', 'a']]), + 'weights': constant_op.constant([[2., 4., 6.]]) } indicator_tensor = _transform_features(features, [indicator])[indicator] with _initialized_session(): self.assertAllEqual([[6., 4., 2.]], indicator_tensor.eval()) + def test_transform_with_missing_value_in_weighted_column(self): + # Github issue 12583 + ids = fc.categorical_column_with_vocabulary_list( + key='ids', vocabulary_list=('a', 'b', 'c')) + weights = fc.weighted_categorical_column(ids, 'weights') + indicator = fc.indicator_column(weights) + features = { + 'ids': constant_op.constant([['c', 'b', 'unknown']]), + 'weights': constant_op.constant([[2., 4., 6.]]) + } + indicator_tensor = _transform_features(features, [indicator])[indicator] + with _initialized_session(): + self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval()) + + def test_transform_with_missing_value_in_categorical_column(self): + # Github issue 12583 + ids = fc.categorical_column_with_vocabulary_list( + key='ids', vocabulary_list=('a', 'b', 'c')) + indicator = fc.indicator_column(ids) + features = { + 'ids': constant_op.constant([['c', 'b', 'unknown']]), + } + indicator_tensor = _transform_features(features, [indicator])[indicator] + with _initialized_session(): + self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval()) + def test_linear_model(self): animal = fc.indicator_column( fc.categorical_column_with_identity('animal', num_buckets=4)) diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py index 2f35f0e04b6..7a866ee6e8a 100644 --- a/tensorflow/python/framework/function.py +++ b/tensorflow/python/framework/function.py @@ -26,7 +26,9 @@ import hashlib from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.framework import op_def_pb2 +from tensorflow.python import pywrap_tensorflow as c_api from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import graph_to_function_def from tensorflow.python.framework import ops from tensorflow.python.ops import 
array_ops @@ -290,6 +292,7 @@ class _DefinedFunction(object): self._shape_func = shape_func self._extra_kwargs = kwargs self._definition = None # Constructed lazily. + self._c_func = None # Constructed with definition. self._sub_functions = dict() # Constructed with definition. self._args = [] @@ -396,6 +399,22 @@ class _DefinedFunction(object): if self._func.__doc__: self._definition.signature.description = self._func.__doc__ + # pylint: disable=protected-access + if temp_graph._c_graph: + with errors.raise_exception_on_not_ok_status() as status: + output_names = ([compat.as_bytes(x) for x in self._out_names] + if self._out_names else []) + self._c_func = c_api.TF_GraphToFunction_wrapper( + temp_graph._c_graph, + self._func_name, + None, # opers + [t._as_tf_output() for t in inputs], + [t._as_tf_output() for t in outputs], + output_names, + None, # opts + status) + # pylint: enable=protected-access + def _create_hash_str(self, input_arg, output_arg, node_def): """Creates an 8-character string unique to this input. diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py index 589db9ef4dc..40205ddf053 100644 --- a/tensorflow/python/framework/function_test.py +++ b/tensorflow/python/framework/function_test.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import function from tensorflow.python.framework import graph_to_function_def from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import control_flow_ops @@ -63,7 +64,51 @@ def _OptimizerOptions(): do_constant_folding=cfold))) -class FunctionTest(test.TestCase): +class FunctionTestMethods(object): + """Test methods for verifying Function support. + + These test methods are used as mix-ins in two test cases: with + and without C API support. 
+ """ + + def testIdentity(self): + + @function.Defun(dtypes.float32, func_name="MyIdentity") + def MyIdentityFunc(a): + return a + + with ops.Graph().as_default(): + call = MyIdentityFunc([18.0]) + self.assertEqual("MyIdentity", call.op.name) + with session.Session() as sess: + self.assertAllEqual([18.0], sess.run(call)) + + def testIdentityOutputName(self): + + @function.Defun( + dtypes.float32, func_name="MyIdentity", out_names=["my_result_name"]) + def MyIdentityFunc(a): + return a + + with ops.Graph().as_default(): + call = MyIdentityFunc([18.0]) + self.assertEqual("MyIdentity", call.op.name) + with session.Session() as sess: + self.assertAllEqual([18.0], sess.run(call)) + + def testTooManyOutputNames(self): + + @function.Defun( + dtypes.float32, func_name="MyIdentity", + out_names=["my_result1", "my_result2"]) + def MyIdentityFunc(a): + return a + + with ops.Graph().as_default(): + with self.assertRaisesRegexp( + ValueError, (r"Length of out_names \(2\) does not match number of " + r"outputs \(1\): my_result1, my_result2")): + MyIdentityFunc([18.0]) def testDefineFunction2Args(self): @@ -77,6 +122,35 @@ class FunctionTest(test.TestCase): with session.Session() as sess: self.assertAllEqual([5.0], sess.run(call)) + def testValueErrorOnFunctionWithNoOutput(self): + # TODO(iga): Remove this restriction and this test + + @function.Defun(dtypes.float32, dtypes.float32) + def APlus2B(a, b): + print(a + b * 2) # Create some ops to have nodes in the body + # Using 'print' to make lint happy + + with ops.Graph().as_default(): + with self.assertRaisesRegexp(ValueError, + "Function can not return None"): + APlus2B([1.0], [2.0]) + + def testDefineFunction2ArgsOutputName(self): + + @function.Defun( + dtypes.float32, + dtypes.float32, + func_name="APlus2B", + out_names=["my_result_name"]) + def APlus2B(a, b): + return a + b * 2 + + with ops.Graph().as_default(): + call = APlus2B([1.0], [2.0]) + self.assertEqual("APlus2B", call.op.name) + with session.Session() as sess: + self.assertAllEqual([5.0], sess.run(call)) + def testDefineFunctionDuplicateOutputs(self): @function.Defun(dtypes.float32, func_name="Duplicate") @@ -137,6 +211,7 @@ class FunctionTest(test.TestCase): out, = sess.run(dx, feed) self.assertAllClose(1 - np.square(np.tanh(inp)), out) + @test_util.disable_c_api # Function gradients don't work with C API def testCustomGradient(self): dtype = dtypes.float32 @@ -169,6 +244,7 @@ class FunctionTest(test.TestCase): out, = sess.run(dlogits, {logits: x, labels: y}) self.assertAllClose(out, np.exp(prob - y)) + @test_util.disable_c_api # Function gradients don't work with C API def testCustomGradientError(self): dtype = dtypes.float32 @@ -194,6 +270,7 @@ class FunctionTest(test.TestCase): "SymGrad expects to return 1.*but get 2.*instead"): _ = sess.run(dinp, {inp: x}) + @test_util.disable_c_api # Function gradients don't work with C API def testSymGradShape(self): g = ops.Graph() with g.as_default(): @@ -209,6 +286,7 @@ class FunctionTest(test.TestCase): self.assertEqual(x.get_shape(), dx.get_shape()) self.assertEqual(y.get_shape(), dy.get_shape()) + @test_util.disable_c_api # Function gradients don't work with C API def testSymGradAttr(self): @function.Defun(noinline=True) @@ -312,6 +390,7 @@ class FunctionTest(test.TestCase): "assertion failed.*-3"): self.assertAllEqual(Foo(constant_op.constant(-3.0)).eval(), 6.0) + @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testAssertWrapper(self): @function.Defun(dtypes.float32) @@ -326,6 +405,7 @@ class 
FunctionTest(test.TestCase): "assertion"): _ = MyFn(100.0).eval() + @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testWhileLoopCallsFunc(self): with self.test_session(use_gpu=True) as sess: @@ -345,6 +425,7 @@ class FunctionTest(test.TestCase): ans = sess.run(loop) self.assertAllClose(ans, 131072.) + @test_util.disable_c_api # Op._add_control_inputs doesn't work with C API def testControlFlowStrictness(self): """Inlined functions must not execute in a untaken control flow branch.""" @@ -607,6 +688,7 @@ class FunctionTest(test.TestCase): self.assertAllClose(vals[0], vals[1]) self.assertAllClose(vals[2], vals[3]) + @test_util.disable_c_api # Function Declaration doesn't work with C API def testDeclare(self): foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", dtypes.float32)]) @@ -626,6 +708,7 @@ class FunctionTest(test.TestCase): expected = rand * rand + 1.0 self.assertAllClose(expected, y.eval(feed_dict={x: rand})) + @test_util.disable_c_api # Function Declaration doesn't work with C API def testDeclareUsedInDefun(self): foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", dtypes.float32)]) @@ -649,6 +732,7 @@ class FunctionTest(test.TestCase): expected = rand * rand + 1.0 self.assertAllClose(expected, y.eval(feed_dict={x: rand})) + @test_util.disable_c_api # Function Declaration doesn't work with C API def testDeclareTypeMistake(self): foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", dtypes.float32)]) @@ -861,6 +945,32 @@ class FunctionTest(test.TestCase): self.assertEqual(len(f.signature.input_arg), 3) +class FunctionTest(FunctionTestMethods, test.TestCase): + """Test case that invokes test methods with _USE_C_API=False.""" + + def setUp(self): + self.prev_use_c_api = ops._USE_C_API + ops._USE_C_API = False + super(FunctionTest, self).setUp() + + def tearDown(self): + ops._USE_C_API = self.prev_use_c_api + super(FunctionTest, self).tearDown() + + +class FunctionWithCApiTest(FunctionTestMethods, test.TestCase): + """Test case that invokes test methods with _USE_C_API=True.""" + + def setUp(self): + self.prev_use_c_api = ops._USE_C_API + ops._USE_C_API = True + super(FunctionWithCApiTest, self).setUp() + + def tearDown(self): + ops._USE_C_API = self.prev_use_c_api + super(FunctionWithCApiTest, self).tearDown() + + class FunctionsFromProtos(test.TestCase): def expectFunctionsEqual(self, func, grad_func=None, new_func=None): diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py index aa373600669..76424ef579b 100644 --- a/tensorflow/python/framework/op_def_library.py +++ b/tensorflow/python/framework/op_def_library.py @@ -784,6 +784,7 @@ class OpDefLibrary(object): if arg.is_ref] with _MaybeColocateWith(must_colocate_inputs): # Add Op to graph + inputs = [ag_core.getval(x) for x in inputs] op = g.create_op(op_type_name, inputs, output_types, name=scope, input_types=input_types, attrs=attr_protos, op_def=op_def) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 5a0c323ce47..b197e96886e 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -49,6 +49,7 @@ from tensorflow.python.framework import versions from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat from tensorflow.python.util import decorator_utils +from tensorflow.python.util import nest from tensorflow.python.util import tf_contextlib # Temporary global switch determining if we should enable the 
work-in-progress @@ -604,6 +605,13 @@ def _maybe_modify_numpy_dtype_determination(np_array): return np_array +def _has_string(value): + if isinstance(value, compat.bytes_or_text_types): return True + if isinstance(value, collections.Sequence) and value: + return _has_string(value[0]) + return False + + # TODO(agarwal): rename to TensorHandle. class EagerTensor(Tensor): """A TensorFlow Eager Tensor.""" @@ -625,6 +633,8 @@ class EagerTensor(Tensor): # https://www.tensorflow.org/code/tensorflow/python/framework/constant_op.py self._id = uid() if not isinstance(value, np.ndarray): + if dtype is None and _has_string(value): + dtype = dtypes.string npt = None if dtype is None else dtype.as_numpy_dtype try: value = np.array(value, dtype=npt) @@ -712,12 +722,12 @@ class EagerTensor(Tensor): return numpy_text def __str__(self): - return "tfe.Tensor(shape=%s, dtype=%s, numpy=%s)" % (self.shape, - self.dtype.name, - self._numpy_text()) + return "tf.Tensor(%s, shape=%s, dtype=%s)" % (self._numpy_text(), + self.shape, + self.dtype.name) def __repr__(self): - return "" % ( + return "" % ( self._id, self.shape, self.dtype.name, self._numpy_text(is_repr=True)) @staticmethod @@ -1027,12 +1037,19 @@ def internal_convert_to_tensor(value, # tracing gradients, to ensure the same behavior happens with and without # tracing. unwrapped = ag_core.getval(value) - # Fast path for EagerTensors that don't need any conversion. - if isinstance(unwrapped, EagerTensor) and context.in_eager_mode(): - # Note that we don't check that value's dtype matches the dtype - # argument. We exepct that the C runtime will do that checking - # when we execute the kernel. - return value + + if context.in_eager_mode(): + # Fast path for EagerTensors that don't need any conversion. + if isinstance(unwrapped, EagerTensor): + # Note that we don't check that value's dtype matches the dtype + # argument. We exepct that the C runtime will do that checking + # when we execute the kernel. + return value + values = nest.flatten(value) + if (len(values) > 1 and + any(isinstance(ag_core.getval(v), EagerTensor) for v in values)): + raise TypeError("Cannot convert to a eager tensor.") + if dtype is not None: dtype = dtypes.as_dtype(dtype) unwrapped_type = type(unwrapped) @@ -2939,6 +2956,14 @@ class Graph(object): if self._graph_def_versions.min_consumer < 12: self._graph_def_versions.min_consumer = 12 self._functions[name] = function + if self._c_graph: + # pylint: disable=protected-access + assert function._c_func, ( + "Cannot add function created without C API support to graph " + "created with C API support") + with errors.raise_exception_on_not_ok_status() as status: + c_api.TF_GraphAddFunction(self._c_graph, function._c_func, status) + # pylint: enable=protected-access @property def building_function(self): diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc index 83665422885..3cf56330e0d 100644 --- a/tensorflow/python/framework/python_op_gen_main.cc +++ b/tensorflow/python/framework/python_op_gen_main.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/io/inputbuffer.h" +#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/scanner.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" @@ -80,7 +81,31 @@ Status ParseOpListCommandLine(const char* arg, std::vector* op_list) { return Status::OK(); } -void PrintAllPythonOps(const std::vector& op_list, bool require_shapes, + +// Use the name of the current executable to infer the C++ source file +// where the REGISTER_OP() call for the operator can be found. +// Returns the name of the file. +// Returns an empty string if the current executable's name does not +// follow a known pattern. +string InferSourceFileName(const char* argv_zero) { + StringPiece command_str = io::Basename(argv_zero); + + // For built-in ops, the Bazel build creates a separate executable + // with the name gen__ops_py_wrappers_cc containing the + // operators defined in _ops.cc + const char* kExecPrefix = "gen_"; + const char* kExecSuffix = "_py_wrappers_cc"; + if (command_str.Consume(kExecPrefix) && command_str.ends_with(kExecSuffix)) { + command_str.remove_suffix(strlen(kExecSuffix)); + return strings::StrCat(command_str, ".cc"); + } else { + return string(""); + } +} + +void PrintAllPythonOps(const std::vector& op_list, + const string& source_file_name, + bool require_shapes, bool op_list_is_whitelist) { OpList ops; OpRegistry::Global()->Export(false, &ops); @@ -93,9 +118,9 @@ void PrintAllPythonOps(const std::vector& op_list, bool require_shapes, *pruned_ops.mutable_op()->Add() = op_def; } } - PrintEagerPythonOps(pruned_ops, {}, require_shapes); + PrintEagerPythonOps(pruned_ops, {}, require_shapes, source_file_name); } else { - PrintEagerPythonOps(ops, op_list, require_shapes); + PrintEagerPythonOps(ops, op_list, require_shapes, source_file_name); } } @@ -105,20 +130,26 @@ void PrintAllPythonOps(const std::vector& op_list, bool require_shapes, int main(int argc, char* argv[]) { tensorflow::port::InitMain(argv[0], &argc, &argv); + tensorflow::string source_file_name = + tensorflow::InferSourceFileName(argv[0]); + // Usage: // gen_main [ @FILENAME | OpName[,OpName]* ] (0 | 1) [0 | 1] if (argc == 2) { - tensorflow::PrintAllPythonOps({}, {}, tensorflow::string(argv[1]) == "1"); + tensorflow::PrintAllPythonOps({}, source_file_name, + tensorflow::string(argv[1]) == "1", + false /* op_list_is_whitelist */); } else if (argc == 3) { std::vector hidden_ops; TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &hidden_ops)); - tensorflow::PrintAllPythonOps(hidden_ops, + tensorflow::PrintAllPythonOps(hidden_ops, source_file_name, tensorflow::string(argv[2]) == "1", false /* op_list_is_whitelist */); } else if (argc == 4) { std::vector op_list; TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &op_list)); - tensorflow::PrintAllPythonOps(op_list, tensorflow::string(argv[2]) == "1", + tensorflow::PrintAllPythonOps(op_list, source_file_name, + tensorflow::string(argv[2]) == "1", tensorflow::string(argv[3]) == "1"); } else { return -1; diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index eea3d28a7e4..8c0975b11b3 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -236,7 +236,9 @@ def _FilterTuple(v): def _FilterInt(v): if isinstance(v, (list, tuple)): return _FirstNotNone([_FilterInt(x) for x in v]) - return None if isinstance(v, 
compat.integral_types) else _NotNone(v) + return None if isinstance( + v, + (compat.integral_types, tensor_shape.Dimension)) else _NotNone(v) def _FilterFloat(v): diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py index 2760f98a6bc..ca47274e9a4 100644 --- a/tensorflow/python/framework/tensor_util_test.py +++ b/tensorflow/python/framework/tensor_util_test.py @@ -314,6 +314,17 @@ class TensorUtilTest(test.TestCase): shape=[3, 4], dtype=dtype))) + def testIntMixedWithDimension(self): + # Github issue: 11974 + dtype = dtypes.int32 + nptype = np.int32 + t = tensor_util.make_tensor_proto([10, tensor_shape.Dimension(20), 30], + dtype=dtype) + self.assertEquals(dtype, t.dtype) + a = tensor_util.MakeNdarray(t) + self.assertEquals(nptype, a.dtype) + self.assertAllClose(np.array([10, 20, 30], dtype=nptype), a) + def testLong(self): t = tensor_util.make_tensor_proto(10, dtype=dtypes.int64) self.assertProtoEquals(""" diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index c65816a5436..04c7554a580 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -65,7 +65,7 @@ def gpu_device_name(): """Returns the name of a GPU device if available or the empty string.""" for x in device_lib.list_local_devices(): if x.device_type == "GPU" or x.device_type == "SYCL": - return x.name + return compat.as_str(x.name) return "" @@ -392,6 +392,7 @@ class TensorFlowTestCase(googletest.TestCase): self._cached_session = None def setUp(self): + logging.info("SET UP: %s" % str(self)) self._ClearCachedSession() random.seed(random_seed.DEFAULT_GRAPH_SEED) np.random.seed(random_seed.DEFAULT_GRAPH_SEED) @@ -406,6 +407,7 @@ class TensorFlowTestCase(googletest.TestCase): ops.get_default_graph().seed = random_seed.DEFAULT_GRAPH_SEED def tearDown(self): + logging.info("TEAR DOWN: %s" % str(self)) for thread in self._threads: self.assertFalse(thread.is_alive(), "A checkedThread did not terminate") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 4fa1e1fee80..d9c5f3bce99 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -518,7 +518,7 @@ tf_py_test( tf_py_test( name = "matrix_solve_ls_op_test", - size = "small", + size = "medium", srcs = ["matrix_solve_ls_op_test.py"], additional_deps = [ "//third_party/py/numpy", @@ -528,6 +528,7 @@ tf_py_test( "//tensorflow/python:linalg_ops", "//tensorflow/python:math_ops", ], + tags = ["nomsan"], # fails in msan from numpy calls ) tf_py_test( @@ -683,13 +684,15 @@ cuda_py_test( tf_py_test( name = "segment_reduction_ops_test", - size = "small", + size = "medium", srcs = ["segment_reduction_ops_test.py"], additional_deps = [ "//third_party/py/numpy", + "//tensorflow/python:client", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_for_generated_wrappers", "//tensorflow/python:math_ops", + "//tensorflow/python:variables", "//tensorflow/python:nn_grad", ], ) @@ -1708,6 +1711,26 @@ cuda_py_test( tags = ["no_windows_gpu"], ) +cuda_py_test( + name = "reduction_ops_test_big", + size = "medium", + srcs = ["reduction_ops_test_big.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + ], + tags = [ + "manual", + "no_gpu", + "nogpu", + "noguitar", + "notap", + ], +) + cuda_py_test( 
name = "relu_op_test", size = "small", @@ -2154,7 +2177,7 @@ cuda_py_test( "//tensorflow/python:nn_grad", "//tensorflow/python:nn_ops", ], - tags = ["noasan"], # times out b/63680444 + shard_count = 2, ) cuda_py_test( diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py index 392639fa179..77c5bb6d400 100644 --- a/tensorflow/python/kernel_tests/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops_test.py @@ -981,15 +981,15 @@ class SequenceMaskTest(test_util.TensorFlowTestCase): class ConcatSliceResourceTest(test_util.TensorFlowTestCase): + @test_util.run_in_graph_and_eager_modes() def testConcatSlice(self): - with self.test_session(): - r1 = test_ops.stub_resource_handle_op(container="a", shared_name="b") - r2 = test_ops.stub_resource_handle_op(container="a", shared_name="c") - c = array_ops.stack([r1, r2]) - s = array_ops.strided_slice(c, [1], [2]) - test_ops.resource_create_op(s).run() - with self.assertRaises(errors.AlreadyExistsError): - test_ops.resource_create_op(r2).run() + r1 = test_ops.stub_resource_handle_op(container="a", shared_name="b") + r2 = test_ops.stub_resource_handle_op(container="a", shared_name="c") + c = array_ops.stack([r1, r2]) + s = array_ops.strided_slice(c, [1], [2]) + self.evaluate(test_ops.resource_create_op(s)) + with self.assertRaises(errors.AlreadyExistsError): + self.evaluate(test_ops.resource_create_op(r2)) class IdentityTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index eb06e067a7f..de80fb30554 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -183,14 +183,11 @@ class CholeskyGradTest(test.TestCase): self.runFiniteDifferences( shapes, dtypes=(dtypes_lib.float32, dtypes_lib.float64)) - # TODO(eriche): investigate why this test fails only in opensource - # ubuntu gpu python3 - - # def testSmallMatricesComplex(self): - # np.random.seed(0) - # shapes = self.getShapes([1, 2, 10]) - # self.runFiniteDifferences( - # shapes, dtypes=(dtypes_lib.complex64, dtypes_lib.complex128)) + def testSmallMatricesComplex(self): + np.random.seed(0) + shapes = self.getShapes([1, 2, 10]) + self.runFiniteDifferences( + shapes, dtypes=(dtypes_lib.complex64, dtypes_lib.complex128)) def testOneBlockMatrices(self): np.random.seed(0) diff --git a/tensorflow/python/kernel_tests/constant_op_eager_test.py b/tensorflow/python/kernel_tests/constant_op_eager_test.py index 0e98afbe6e4..0b4fa60d81b 100644 --- a/tensorflow/python/kernel_tests/constant_op_eager_test.py +++ b/tensorflow/python/kernel_tests/constant_op_eager_test.py @@ -26,27 +26,33 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes as dtypes_lib from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.util import compat -# TODO(josh11b): add tests with string types, lists/tuples, Shape. +# TODO(josh11b): add tests with lists/tuples, Shape. 
class ConstantTest(test.TestCase): def _testCpu(self, x): np_ans = np.array(x) - tf_ans = ops.convert_to_tensor(x).numpy() + with context.device("/device:CPU:0"): + tf_ans = ops.convert_to_tensor(x).numpy() if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]: self.assertAllClose(np_ans, tf_ans) else: self.assertAllEqual(np_ans, tf_ans) def _testGpu(self, x): - np_ans = np.array(x) - tf_ans = ops.convert_to_tensor(x).numpy() - if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]: - self.assertAllClose(np_ans, tf_ans) - else: - self.assertAllEqual(np_ans, tf_ans) + device = test_util.gpu_device_name() + if device: + np_ans = np.array(x) + with context.device(device): + tf_ans = ops.convert_to_tensor(x).numpy() + if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]: + self.assertAllClose(np_ans, tf_ans) + else: + self.assertAllEqual(np_ans, tf_ans) def _testAll(self, x): self._testCpu(x) @@ -78,11 +84,11 @@ class ConstantTest(test.TestCase): def testComplex64(self): self._testAll( - np.complex(1, 2) * np.arange(-15, 15).reshape([2, 3, 5 - ]).astype(np.complex64)) + np.complex(1, 2) * + np.arange(-15, 15).reshape([2, 3, 5]).astype(np.complex64)) self._testAll( - np.complex(1, 2) * np.random.normal(size=30).reshape( - [2, 3, 5]).astype(np.complex64)) + np.complex(1, 2) * + np.random.normal(size=30).reshape([2, 3, 5]).astype(np.complex64)) self._testAll(np.empty((2, 0, 5)).astype(np.complex64)) def testComplex128(self): @@ -94,6 +100,26 @@ class ConstantTest(test.TestCase): [2, 3, 5]).astype(np.complex128)) self._testAll(np.empty((2, 0, 5)).astype(np.complex128)) + def testString(self): + val = [compat.as_bytes(str(x)) for x in np.arange(-15, 15)] + self._testCpu(np.array(val).reshape([2, 3, 5])) + self._testCpu(np.empty((2, 0, 5)).astype(np.str_)) + + def testStringWithNulls(self): + val = ops.convert_to_tensor(b"\0\0\0\0").numpy() + self.assertEqual(len(val), 4) + self.assertEqual(val, b"\0\0\0\0") + + val = ops.convert_to_tensor(b"xx\0xx").numpy() + self.assertEqual(len(val), 5) + self.assertAllEqual(val, b"xx\0xx") + + nested = [[b"\0\0\0\0", b"xx\0xx"], [b"\0_\0_\0_\0", b"\0"]] + val = ops.convert_to_tensor(nested).numpy() + # NOTE(mrry): Do not use assertAllEqual, because it converts nested to a + # numpy array, which loses the null terminators. 
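(Aside, not part of the test: a numpy-only demonstration of the NOTE above — fixed-width bytes dtypes strip trailing NULs, which is why the comparison is done on the raw Python list.)

import numpy as np

nested = [[b"\0\0\0\0", b"xx\0xx"], [b"\0_\0_\0_\0", b"\0"]]
arr = np.array(nested)         # stored as fixed-width dtype |S7
print(arr[0, 0])               # b'' -- the four NUL bytes are gone
print(arr.tolist() == nested)  # False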
+ self.assertEqual(val.tolist(), nested) + def testExplicitShapeNumPy(self): c = constant_op.constant( np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float32), diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py index df413939c76..6167cb9999b 100644 --- a/tensorflow/python/kernel_tests/constant_op_test.py +++ b/tensorflow/python/kernel_tests/constant_op_test.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test from tensorflow.python.util import compat @@ -119,11 +120,11 @@ class ConstantTest(test.TestCase): variant_val=[ tensor_pb2.VariantTensorDataProto( # Match registration in variant_op_registry.cc - type_name=b"int32", + type_name=b"int", metadata=np.array(1, dtype=np.int32).tobytes()) ]) - const_op = constant_op.constant(variant_tensor).op - const_value = const_op.get_attr("value") + const = constant_op.constant(variant_tensor) + const_value = const.op.get_attr("value") # Ensure we stored the tensor proto properly. self.assertProtoEquals(variant_tensor, const_value) @@ -134,7 +135,10 @@ class ConstantTest(test.TestCase): # native numpy types cannot be passed to ops.convert_to_tensor. # TODO(ebrevdo): Add registration mechanism for # ops.convert_to_tensor and for session.run output. - const_op.run() + logging_const_op = logging_ops.Print( + const, [const], + message="Variant storing an int, decoded const value:").op + logging_const_op.run() def testStringWithNulls(self): with self.test_session(): @@ -469,6 +473,35 @@ class ZerosLikeTest(test.TestCase): self.assertEqual(y.shape, shape) self.assertAllEqual(y, np.zeros(shape, dtype=out_type)) + def testZerosLikeVariant(self): + # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant + # copying between CPU and GPU is supported AND we register a + # ZerosLike callback for GPU for Variant storing primitive types + # in variant_op_registry.cc. + with self.test_session(use_gpu=False): + variant_tensor = tensor_pb2.TensorProto( + dtype=dtypes_lib.variant.as_datatype_enum, + tensor_shape=tensor_shape.TensorShape([]).as_proto(), + variant_val=[ + tensor_pb2.VariantTensorDataProto( + # Match registration in variant_op_registry.cc + type_name=b"int", + metadata=np.array(1, dtype=np.int32).tobytes()) + ]) + const_variant = constant_op.constant(variant_tensor) + zeros_like = array_ops.zeros_like(const_variant) + zeros_like_op = logging_ops.Print( + zeros_like, [const_variant, zeros_like], + message="Variant storing an int, input and output of zeros_like:").op + + # Smoke test -- ensure this executes without trouble. + # Right now, non-numpy-compatible objects cannot be returned from a + # session.run call; similarly, objects that can't be converted to + # native numpy types cannot be passed to ops.convert_to_tensor. + # TODO(ebrevdo): Add registration mechanism for + # ops.convert_to_tensor and for session.run output. 
+ zeros_like_op.run() + class OnesTest(test.TestCase): diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py index 85e7b635d80..748135440ec 100644 --- a/tensorflow/python/kernel_tests/fifo_queue_test.py +++ b/tensorflow/python/kernel_tests/fifo_queue_test.py @@ -1078,6 +1078,9 @@ class FIFOQueueTest(test.TestCase): self.assertEqual([50.0], dequeued_t.eval()) self.assertEqual([60.0], dequeued_t.eval()) + # Make sure the thread finishes before exiting. + thread.join() + def testBlockingEnqueueBeforeClose(self): with self.test_session() as sess: q = data_flow_ops.FIFOQueue(4, dtypes_lib.float32) diff --git a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py index 53b1897f488..d8c3f9823c3 100644 --- a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py +++ b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py @@ -1191,6 +1191,9 @@ class PaddingFIFOQueueTest(test.TestCase): self.assertEqual([50.0], dequeued_t.eval()) self.assertEqual([60.0], dequeued_t.eval()) + # Make sure the thread finishes before exiting. + thread.join() + def testBlockingEnqueueBeforeClose(self): with self.test_session() as sess: q = data_flow_ops.PaddingFIFOQueue(4, dtypes_lib.float32, ((),)) diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py index fa1553a3f6b..b01fc129538 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py @@ -321,6 +321,15 @@ class PoolingTest(test.TestCase): strides=(1, 1, 1), padding="VALID") + def testMaxPoolGradValidPadding1_2_3d(self): + self._ConstructAndTestGradient( + nn_ops.max_pool3d, + input_sizes=[1, 3, 3, 3, 1], + output_sizes=[1, 2, 2, 2, 1], + window=(1, 1, 1), + strides=(2, 2, 2), + padding="VALID") + def testMaxPoolGradValidPadding2_2_3d(self): self._ConstructAndTestGradient( nn_ops.max_pool3d, @@ -339,6 +348,15 @@ class PoolingTest(test.TestCase): strides=(1, 1, 1), padding="SAME") + def testMaxPoolGradSamePadding1_2_3d(self): + self._ConstructAndTestGradient( + nn_ops.max_pool3d, + input_sizes=[1, 3, 2, 4, 1], + output_sizes=[1, 2, 1, 2, 1], + window=(1, 1, 1), + strides=(2, 2, 2), + padding="SAME") + def testMaxPoolGradSamePadding2_1_3d(self): self._ConstructAndTestGradient( nn_ops.max_pool3d, @@ -375,6 +393,15 @@ class PoolingTest(test.TestCase): strides=(1, 1, 1), padding="VALID") + def testAvgPoolGradValidPadding1_2_3d(self): + self._ConstructAndTestGradient( + nn_ops.avg_pool3d, + input_sizes=[1, 3, 3, 3, 1], + output_sizes=[1, 2, 2, 2, 1], + window=(1, 1, 1), + strides=(2, 2, 2), + padding="VALID") + def testAvgPoolGradValidPadding2_1_3d(self): self._ConstructAndTestGradient( nn_ops.avg_pool3d, @@ -402,6 +429,15 @@ class PoolingTest(test.TestCase): strides=(1, 1, 1), padding="SAME") + def testAvgPoolGradSamePadding1_2_3d(self): + self._ConstructAndTestGradient( + nn_ops.avg_pool3d, + input_sizes=[1, 3, 2, 4, 2], + output_sizes=[1, 2, 1, 2, 2], + window=(1, 1, 1), + strides=(2, 2, 2), + padding="SAME") + def testAvgPoolGradSamePadding2_1_3d(self): self._ConstructAndTestGradient( nn_ops.avg_pool3d, diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index da14871c872..9eb1fea8037 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -998,6 +998,20 @@ class 
PoolingTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + def _testMaxPoolGradValidPadding1_2(self, data_format, use_gpu): + for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]: + self._ConstructAndTestGradient( + pool_func, + input_sizes=[1, 3, 3, 1], + output_sizes=[1, 2, 2, 1], + window_rows=1, + window_cols=1, + row_stride=2, + col_stride=2, + padding="VALID", + data_format=data_format, + use_gpu=use_gpu) + def _testMaxPoolGradValidPadding2_2(self, data_format, use_gpu): for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]: self._ConstructAndTestGradient( @@ -1026,6 +1040,20 @@ class PoolingTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + def _testMaxPoolGradSamePadding1_2(self, data_format, use_gpu): + for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]: + self._ConstructAndTestGradient( + pool_func, + input_sizes=[2, 2, 4, 3], + output_sizes=[2, 1, 2, 3], + window_rows=1, + window_cols=1, + row_stride=2, + col_stride=2, + padding="SAME", + data_format=data_format, + use_gpu=use_gpu) + def _testMaxPoolGradSamePadding2_1(self, data_format, use_gpu): for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]: self._ConstructAndTestGradient( @@ -1071,10 +1099,12 @@ class PoolingTest(test.TestCase): def testMaxPoolGrad(self): for (data_format, use_gpu) in GetTestConfigs(): self._testMaxPoolGradValidPadding1_1(data_format, use_gpu) + self._testMaxPoolGradValidPadding1_2(data_format, use_gpu) self._testMaxPoolGradValidPadding2_1_6(data_format, use_gpu) self._testMaxPoolGradValidPadding2_1_7(data_format, use_gpu) self._testMaxPoolGradValidPadding2_2(data_format, use_gpu) self._testMaxPoolGradSamePadding1_1(data_format, use_gpu) + self._testMaxPoolGradSamePadding1_2(data_format, use_gpu) self._testMaxPoolGradSamePadding2_1(data_format, use_gpu) self._testMaxPoolGradSamePadding2_2(data_format, use_gpu) self._testMaxPoolGradSamePadding3_1(data_format, use_gpu) @@ -1497,9 +1527,11 @@ class PoolingTest(test.TestCase): def testAvgPoolGrad(self): for (data_format, use_gpu) in GetTestConfigs(): self._testAvgPoolGradValidPadding1_1(data_format, use_gpu) + self._testAvgPoolGradValidPadding1_2(data_format, use_gpu) self._testAvgPoolGradValidPadding2_1(data_format, use_gpu) self._testAvgPoolGradValidPadding2_2(data_format, use_gpu) self._testAvgPoolGradSamePadding1_1(data_format, use_gpu) + self._testAvgPoolGradSamePadding1_2(data_format, use_gpu) self._testAvgPoolGradSamePadding2_1(data_format, use_gpu) self._testAvgPoolGradSamePadding2_2(data_format, use_gpu) self._testAvgPoolGradSamePadding3_1(data_format, use_gpu) @@ -1517,6 +1549,19 @@ class PoolingTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + def _testAvgPoolGradValidPadding1_2(self, data_format, use_gpu): + self._ConstructAndTestGradient( + nn_ops.avg_pool, + input_sizes=[2, 3, 3, 3], + output_sizes=[2, 2, 2, 3], + window_rows=1, + window_cols=1, + row_stride=2, + col_stride=2, + padding="VALID", + data_format=data_format, + use_gpu=use_gpu) + def _testAvgPoolGradValidPadding2_1(self, data_format, use_gpu): self._ConstructAndTestGradient( nn_ops.avg_pool, @@ -1556,6 +1601,19 @@ class PoolingTest(test.TestCase): data_format=data_format, use_gpu=use_gpu) + def _testAvgPoolGradSamePadding1_2(self, data_format, use_gpu): + self._ConstructAndTestGradient( + nn_ops.avg_pool, + input_sizes=[2, 2, 4, 3], + output_sizes=[2, 1, 2, 3], + window_rows=1, + window_cols=1, + row_stride=2, + col_stride=2, + padding="SAME", + data_format=data_format, + use_gpu=use_gpu) + def 
_testAvgPoolGradSamePadding2_1(self, data_format, use_gpu): self._ConstructAndTestGradient( nn_ops.avg_pool, diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py index 04ce99a4a63..8d6b7925e45 100644 --- a/tensorflow/python/kernel_tests/reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/reduction_ops_test.py @@ -175,6 +175,24 @@ class SumReductionTest(BaseReductionTest): np_arr = self._makeIncremental((2,) * rank, dtypes.int32) self._compareAllAxes(np_arr) + def testFloat16(self): + for rank in range(1, _MAX_RANK + 1): + np_arr = self._makeIncremental((2,) * rank, dtypes.float16) + self._compareAllAxes(np_arr) + + # test that mean doesn't overflow + # only on GPU, since it has the more accurate implementation + if not test.is_gpu_available(): + return + + arr = np.ones([68000], dtype=np.float16) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_arr = array_ops.constant(arr) + tf_mean = math_ops.reduce_mean(tf_arr, 0, False) + tf_out_mean = sess.run(tf_mean) + self.assertAllClose(tf_out_mean, 1.) + def testFloat32(self): for rank in range(1, _MAX_RANK + 1): np_arr = self._makeIncremental((2,) * rank, dtypes.float32) @@ -523,7 +541,7 @@ class MinReductionTest(test.TestCase): def testFloatReduce3D(self): # Create a 3D array of floats and reduce across all possible # dimensions - np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float32) + np_arr = np.arange(1, 31).reshape([2, 3, 5]).astype(np.float32) self._compareAll(np_arr, None) self._compareAll(np_arr, []) self._compareAll(np_arr, [0]) @@ -537,7 +555,7 @@ class MinReductionTest(test.TestCase): def testDoubleReduce3D(self): # Create a 3D array of doubles and reduce across all possible # dimensions - np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float64) + np_arr = np.arange(1, 31).reshape([2, 3, 5]).astype(np.float64) self._compareAll(np_arr, None) self._compareAll(np_arr, []) self._compareAll(np_arr, [0]) @@ -629,7 +647,7 @@ class MaxReductionTest(test.TestCase): def testFloatReduce3D(self): # Create a 3D array of floats and reduce across all possible # dimensions - np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float32) + np_arr = np.arange(-31, -1).reshape([2, 3, 5]).astype(np.float32) self._compareAll(np_arr, None) self._compareAll(np_arr, []) self._compareAll(np_arr, [0]) @@ -643,7 +661,7 @@ class MaxReductionTest(test.TestCase): def testDoubleReduce3D(self): # Create a 3D array of doubles and reduce across all possible # dimensions - np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float64) + np_arr = np.arange(-31, -1).reshape([2, 3, 5]).astype(np.float64) self._compareAll(np_arr, None) self._compareAll(np_arr, []) self._compareAll(np_arr, [0]) @@ -656,7 +674,7 @@ class MaxReductionTest(test.TestCase): def testGradient(self): s = [2, 3, 4, 2] - x = np.arange(1.0, 49.0).reshape(s).astype(np.float64) + x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64) with self.test_session(): t = ops.convert_to_tensor(x) su = math_ops.reduce_max(t, [1, 2]) @@ -666,7 +684,7 @@ class MaxReductionTest(test.TestCase): def testGradient2(self): s = [2, 3, 4, 2] - x = np.arange(1.0, 49.0).reshape(s).astype(np.float64) + x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64) with self.test_session(): t = ops.convert_to_tensor(x) su = math_ops.reduce_max(t, [1]) @@ -676,7 +694,7 @@ class MaxReductionTest(test.TestCase): def testGradient3(self): s = [2, 3, 4, 2] - x = np.arange(1.0, 49.0).reshape(s).astype(np.float64) + x = 
np.arange(-49.0, -1.0).reshape(s).astype(np.float64) with self.test_session(): t = ops.convert_to_tensor(x) su = math_ops.reduce_max(t, [2]) @@ -686,7 +704,7 @@ class MaxReductionTest(test.TestCase): def testGradient4(self): s = [2, 3, 4, 2] - x = np.arange(1.0, 49.0).reshape(s).astype(np.float64) + x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64) with self.test_session(): t = ops.convert_to_tensor(x) su = math_ops.reduce_max(t) diff --git a/tensorflow/python/kernel_tests/reduction_ops_test_big.py b/tensorflow/python/kernel_tests/reduction_ops_test_big.py new file mode 100644 index 00000000000..0959adb026e --- /dev/null +++ b/tensorflow/python/kernel_tests/reduction_ops_test_big.py @@ -0,0 +1,179 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for reduction ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.framework import ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class BaseReductionTest(test.TestCase): + + def _tf_reduce(self, x, reduction_axes, keep_dims): + raise NotImplementedError() + + +class BigReductionTest(BaseReductionTest): + """Test reductions for sum and boolean all over a wide range of shapes.""" + + def _tf_reduce_max(self, x, reduction_axes, keep_dims): + return math_ops.reduce_max(x, reduction_axes, keep_dims) + + def _tf_reduce_all(self, x, reduction_axes, keep_dims): + return math_ops.reduce_all(x, reduction_axes, keep_dims) + + def _tf_reduce_mean(self, x, reduction_axes, keep_dims): + return math_ops.reduce_mean(x, reduction_axes, keep_dims) + + def _tf_reduce_sum(self, x, reduction_axes, keep_dims): + return math_ops.reduce_sum(x, reduction_axes, keep_dims) + + def testFloat32Sum(self): + # make sure we test all possible kernel invocations + # logic is the same for all ops, test just float32 for brevity + arr_ = np.ones([4097, 4097], dtype=np.float32) + for size_x in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + for size_y in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + arr = arr_[0:size_x, 0:size_y] + col_sum = np.ones([size_y], dtype=np.float32) * size_x + row_sum = np.ones([size_x], dtype=np.float32) * size_y + full_sum = np.ones([], dtype=np.float32) * size_x * size_y + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_row_sum = self._tf_reduce_sum(arr, 1, False) + tf_col_sum = self._tf_reduce_sum(arr, 0, False) + tf_full_sum = self._tf_reduce_sum(arr, [0, 1], False) + tf_out_row, tf_out_col, tf_out_full = sess.run( + [tf_row_sum, tf_col_sum, tf_full_sum]) + self.assertAllClose(col_sum, tf_out_col) + self.assertAllClose(row_sum, tf_out_row) + self.assertAllClose(full_sum, tf_out_full) + + arr_ = np.ones([130, 
130, 130], dtype=np.float32) + for size_x in range(1, 130, 13): + for size_y in range(1, 130, 13): + for size_z in range(1, 130, 13): + arr = arr_[0:size_x, 0:size_y, 0:size_z] + sum_y = np.ones([size_x, size_z], dtype=np.float32) + sum_xz = np.ones([size_y], dtype=np.float32) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_sum_xz = self._tf_reduce_mean(arr, [0, 2], False) + tf_sum_y = self._tf_reduce_mean(arr, 1, False) + tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y]) + self.assertAllClose(sum_y, tf_out_sum_y) + self.assertAllClose(sum_xz, tf_out_sum_xz) + + def testFloat32Max(self): + # make sure we test all possible kernel invocations + # logic is the same for all ops, test just float32 for brevity + arr_ = np.random.uniform( + low=-3, high=-1, size=[4105, 4105]).astype(np.float32) + for size_x in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + for size_y in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + arr = arr_[0:size_x, 0:size_y] + col_max = np.max(arr, axis=0) + row_max = np.max(arr, axis=1) + full_max = np.max(col_max) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_row_max = self._tf_reduce_max(arr, 1, False) + tf_col_max = self._tf_reduce_max(arr, 0, False) + tf_full_max = self._tf_reduce_max(arr, [0, 1], False) + tf_out_row, tf_out_col, tf_out_full = sess.run( + [tf_row_max, tf_col_max, tf_full_max]) + self.assertAllClose(col_max, tf_out_col) + self.assertAllClose(row_max, tf_out_row) + self.assertAllClose(full_max, tf_out_full) + + arr_ = np.random.uniform( + low=-3, high=-1, size=[130, 130, 130]).astype(np.float32) + for size_x in range(1, 130, 13): + for size_y in range(1, 130, 13): + for size_z in range(1, 130, 13): + arr = arr_[0:size_x, 0:size_y, 0:size_z] + sum_y = np.max(arr, axis=1) + sum_xz = np.max(arr, axis=(0, 2)) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_sum_xz = self._tf_reduce_max(arr, [0, 2], False) + tf_sum_y = self._tf_reduce_max(arr, 1, False) + tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y]) + self.assertAllClose(sum_y, tf_out_sum_y) + self.assertAllClose(sum_xz, tf_out_sum_xz) + + def testBooleanAll(self): + # make sure we test all possible kernel invocations + # test operation where T(0) is not the identity + arr_ = np.ones([4097, 4097], dtype=np.bool) + for size_x in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + for size_y in [ + 1, 2, 3, 4, 16, 17, 32, 33, 64, 65, 128, 131, 256, 263, 1024, 1025, + 4096, 4097 + ]: + arr = arr_[0:size_x, 0:size_y] + col_sum = np.ones([size_y], dtype=np.bool) + row_sum = np.ones([size_x], dtype=np.bool) + full_sum = np.ones([1], dtype=np.bool).reshape([]) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_row_sum = self._tf_reduce_all(arr, 1, False) + tf_col_sum = self._tf_reduce_all(arr, 0, False) + tf_full_sum = self._tf_reduce_all(arr, [0, 1], False) + tf_out_row, tf_out_col, tf_out_full = sess.run( + [tf_row_sum, tf_col_sum, tf_full_sum]) + self.assertAllClose(col_sum, tf_out_col) + self.assertAllClose(row_sum, tf_out_row) + self.assertAllClose(full_sum, tf_out_full) + + arr_ = np.ones([130, 130, 130], dtype=np.bool) + for size_x in range(1, 130, 13): + for size_y in range(1, 130, 13): + for size_z in range(1, 130, 13): + arr = arr_[0:size_x, 0:size_y, 0:size_z] + sum_y = np.ones([size_x, size_z], dtype=np.bool) + sum_xz = np.ones([size_y], 
dtype=np.bool) + + with self.test_session(graph=ops.Graph(), use_gpu=True) as sess: + tf_sum_xz = self._tf_reduce_all(arr, [0, 2], False) + tf_sum_y = self._tf_reduce_all(arr, 1, False) + tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y]) + self.assertAllClose(sum_y, tf_out_sum_y) + self.assertAllClose(sum_xz, tf_out_sum_xz) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py index 33269c91234..5e426fc61a7 100644 --- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py +++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py @@ -18,12 +18,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools + import numpy as np +from tensorflow.python.client import session +from tensorflow.python.framework import ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes as dtypes_lib from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import test @@ -107,19 +112,19 @@ class SegmentReductionOpTest(SegmentReductionHelper): curr_ops_list = complex_ops_list else: curr_ops_list = ops_list - - with self.test_session(use_gpu=False): - tf_x, np_x = self._input(shape, dtype=dtype) - for np_op1, np_op2, tf_op in curr_ops_list: - np_ans = self._segmentReduce(indices, np_x, np_op1, np_op2) - s = tf_op(data=tf_x, segment_ids=indices) - tf_ans = s.eval() - self.assertAllClose(np_ans, tf_ans) - # NOTE(mrry): The static shape inference that computes - # `tf_ans.shape` can only infer that sizes from dimension 1 - # onwards, because the size of dimension 0 is data-dependent - # and may therefore vary dynamically. - self.assertAllEqual(np_ans.shape[1:], tf_ans.shape[1:]) + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, np_x = self._input(shape, dtype=dtype) + for np_op1, np_op2, tf_op in curr_ops_list: + np_ans = self._segmentReduce(indices, np_x, np_op1, np_op2) + s = tf_op(data=tf_x, segment_ids=indices) + tf_ans = s.eval() + self.assertAllClose(np_ans, tf_ans) + # NOTE(mrry): The static shape inference that computes + # `tf_ans.shape` can only infer that sizes from dimension 1 + # onwards, because the size of dimension 0 is data-dependent + # and may therefore vary dynamically. + self.assertAllEqual(np_ans.shape[1:], tf_ans.shape[1:]) def testSegmentIdsShape(self): shape = [4, 4] @@ -130,41 +135,45 @@ class SegmentReductionOpTest(SegmentReductionHelper): def testSegmentIdsSize(self): shape = [4, 4] - with self.test_session(): - tf_x, _ = self._input(shape) - indices = [0, 1] - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - with self.assertRaisesOpError("segment_ids should be the same size"): - s.eval() + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, _ = self._input(shape) + indices = [0, 1] + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + with self.assertRaisesOpError("segment_ids should be the same size"): + s.eval() def testSegmentIdsValid(self): # This is a baseline for the following SegmentIdsInvalid* tests. 
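(Aside: a plain-numpy reference for what segment_sum computes in the tests above and below; the helper name is invented for illustration and segment ids are assumed sorted.)

import numpy as np


def np_segment_sum(data, segment_ids):
  # Rows that share a segment id are summed into one output row.
  num_segments = segment_ids[-1] + 1
  out = np.zeros((num_segments,) + data.shape[1:], dtype=data.dtype)
  for row, sid in zip(data, segment_ids):
    out[sid] += row
  return out


x = np.arange(1, 17, dtype=np.float32).reshape(4, 4)
print(np_segment_sum(x, [0, 0, 0, 1]))
# [[15. 18. 21. 24.]
#  [13. 14. 15. 16.]]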
shape = [4, 4] - with self.test_session(): - tf_x, _ = self._input(shape) - indices = [0, 0, 0, 1] - result = math_ops.segment_sum(data=tf_x, segment_ids=indices).eval() - self.assertAllEqual([[15, 18, 21, 24], [13, 14, 15, 16]], result) + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, _ = self._input(shape, dtype=dtypes_lib.float32) + indices = [0, 0, 0, 1] + result = math_ops.segment_sum(data=tf_x, segment_ids=indices).eval() + self.assertAllEqual([[15, 18, 21, 24], [13, 14, 15, 16]], result) def testSegmentIdsGreaterThanZero(self): shape = [4, 4] - with self.test_session(): - tf_x, np_x = self._input(shape) - indices = [1, 1, 2, 2] - np_ans = self._segmentReduce(indices, np_x, np.add) - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - tf_ans = s.eval() - self.assertAllClose(np_ans, tf_ans) + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, np_x = self._input(shape, dtype=dtypes_lib.float32) + indices = [1, 1, 2, 2] + np_ans = self._segmentReduce(indices, np_x, np.add) + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + tf_ans = s.eval() + self.assertAllClose(np_ans, tf_ans) def testSegmentIdsHole(self): shape = [4, 4] - with self.test_session(): - tf_x, np_x = self._input(shape) - indices = [0, 0, 3, 3] - np_ans = self._segmentReduce(indices, np_x, np.add) - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - tf_ans = s.eval() - self.assertAllClose(np_ans, tf_ans) + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, np_x = self._input(shape, dtype=dtypes_lib.float32) + indices = [0, 0, 3, 3] + np_ans = self._segmentReduce(indices, np_x, np.add) + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + tf_ans = s.eval() + self.assertAllClose(np_ans, tf_ans) def testSegmentIdsInvalid1(self): shape = [4, 4] @@ -199,21 +208,23 @@ class SegmentReductionOpTest(SegmentReductionHelper): def testSegmentIdsInvalid4(self): shape = [4, 4] - with self.test_session(): - tf_x, _ = self._input(shape) - indices = [0, 0, 0, -1] - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - with self.assertRaisesOpError("segment ids must be >= 0"): - s.eval() + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, _ = self._input(shape, dtype=dtypes_lib.float32) + indices = [0, 0, 0, -1] + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + with self.assertRaisesOpError("segment ids must be >= 0"): + s.eval() def testSegmentIdsInvalid5(self): shape = [4, 4] - with self.test_session(): - tf_x, _ = self._input(shape) - indices = [0, 0, 0, -2] - s = math_ops.segment_sum(data=tf_x, segment_ids=indices) - with self.assertRaisesOpError("segment ids must be >= 0"): - s.eval() + for use_gpu in [True, False]: + with self.test_session(use_gpu=use_gpu): + tf_x, _ = self._input(shape, dtype=dtypes_lib.float32) + indices = [0, 0, 0, -2] + s = math_ops.segment_sum(data=tf_x, segment_ids=indices) + with self.assertRaisesOpError("segment ids must be >= 0"): + s.eval() def testGradient(self): shape = [4, 4] @@ -341,7 +352,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper): with self.test_session(use_gpu=True): tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64) s = math_ops.unsorted_segment_max(data=tf_x, segment_ids=indices, - num_segments=num_segments) + num_segments=num_segments) jacob_t, jacob_n = gradient_checker.compute_gradient( tf_x, shape, @@ -635,6 +646,64 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper): with 
self.assertRaisesOpError(r"Segment id 0 out of range \[0, 0\)"): s.eval() +class SegmentReductionOpBenchmark(test.Benchmark): + outer_dim_options = [2**x for x in range(9, 14, 2)] + ratio_options = [2**x for x in range(1, 6, 2)] + inner_dim_options = [2**x for x in range(9, 14, 2)] + #randomly generated sizes with less alignments + inner_dim_options += [1120, 1215, 1856, 1302, 1329, 1531, 1313, 1672, 1851, 1584] + dtype_options = [np.float32, np.float64] + options = (outer_dim_options, + ratio_options, inner_dim_options, dtype_options) + op_functors = [lambda vc, vs, seg_ids: + ("sorted", math_ops.segment_sum(vc, vs)), + lambda vc, vs, seg_ids: + ("unsorted", math_ops.unsorted_segment_sum(vc, vs, seg_ids[-1]+1))] + repeat = 10 + + def _npTypeToStr(self, t): + if t == np.float32: + return "fp32" + if t == np.float64: + return "fp64" + + def _runGraph(self, op_functor, outer_dim, ratio, inner_dim, dtype): + output_outer_dim = int(outer_dim/ratio) + const = np.random.randint(5, size=(outer_dim, inner_dim)) + seg_ids = np.sort(np.random.randint( + output_outer_dim, size=outer_dim)) + vs = variables.Variable(seg_ids.astype(np.int32)) + with ops.device("/gpu:0"): + vc = variables.Variable(const.astype(dtype)) + name, op = op_functor(vc, vs, seg_ids) + with session.Session() as sess: + variables.global_variables_initializer().run() + r = self.run_op_benchmark(sess, op, min_iters=self.repeat, + name="_".join(map(str, + [name, + outer_dim, + ratio, + inner_dim, + self._npTypeToStr(dtype)]))) + return name, r["wall_time"] + + def benchmarkSegmentSumGPU(self): + if not test.is_gpu_available(cuda_only=True): + return + for outer_dim, ratio, inner_dim, dtype in itertools.product(*self.options): + output_outer_dim = int(outer_dim/ratio) + op_functor = self.op_functors[0] + with ops.Graph().as_default(): + self._runGraph(op_functor, outer_dim, ratio, inner_dim, dtype) + + def benchmarkUnsortedSegmentSumGPU(self): + if not test.is_gpu_available(cuda_only=True): + return + for outer_dim, ratio, inner_dim, dtype in itertools.product(*self.options): + output_outer_dim = int(outer_dim/ratio) + op_functor = self.op_functors[1] + with ops.Graph().as_default(): + self._runGraph(op_functor, outer_dim, ratio, inner_dim, dtype) if __name__ == "__main__": test.main() diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py index 68293aa5fe5..41c67743b6d 100644 --- a/tensorflow/python/layers/convolutional.py +++ b/tensorflow/python/layers/convolutional.py @@ -24,6 +24,7 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin import numpy as np +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -171,7 +172,7 @@ class _Conv(base.Layer): padding=self.padding.upper(), data_format=utils.convert_data_format(self.data_format, self.rank + 2)) - if self.bias is not None: + if self.use_bias: if self.data_format == 'channels_first': if self.rank == 1: # nn.bias_add does not accept a 1D input tensor. 
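(Aside: the Conv2DTranspose and Conv3DTranspose hunks below keep the static output-shape inference for graph mode only. For reference, a sketch of the length arithmetic, assumed to match the utils.deconv_output_length helper those hunks call:)

def deconv_output_length(input_length, filter_size, padding, stride):
  # Transposed-convolution output length for one spatial dimension.
  if input_length is None:
    return None
  input_length *= stride
  if padding == 'valid':
    input_length += max(filter_size - stride, 0)
  elif padding == 'full':
    input_length -= (stride + filter_size - 2)
  return input_length  # 'same' keeps input_length * stride


print(deconv_output_length(7, filter_size=3, padding='valid', stride=2))  # 15
print(deconv_output_length(7, filter_size=3, padding='same', stride=2))   # 14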
@@ -988,7 +989,7 @@ class SeparableConv2D(Conv2D): rate=self.dilation_rate, data_format=utils.convert_data_format(self.data_format, ndim=4)) - if self.bias is not None: + if self.use_bias: outputs = nn.bias_add( outputs, self.bias, @@ -1293,20 +1294,21 @@ class Conv2DTranspose(Conv2D): padding=self.padding.upper(), data_format=utils.convert_data_format(self.data_format, ndim=4)) - # Infer the static output shape: - out_shape = inputs.get_shape().as_list() - out_shape[c_axis] = self.filters - out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis], - kernel_h, - self.padding, - stride_h) - out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis], - kernel_w, - self.padding, - stride_w) - outputs.set_shape(out_shape) + if context.in_graph_mode(): + # Infer the static output shape: + out_shape = inputs.get_shape().as_list() + out_shape[c_axis] = self.filters + out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis], + kernel_h, + self.padding, + stride_h) + out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis], + kernel_w, + self.padding, + stride_w) + outputs.set_shape(out_shape) - if self.bias: + if self.use_bias: outputs = nn.bias_add( outputs, self.bias, @@ -1591,24 +1593,25 @@ class Conv3DTranspose(Conv3D): data_format=utils.convert_data_format(self.data_format, ndim=5), padding=self.padding.upper()) - # Infer the static output shape: - out_shape = inputs.get_shape().as_list() - out_shape[c_axis] = self.filters - out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis], - kernel_d, - self.padding, - stride_d) - out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis], - kernel_h, - self.padding, - stride_h) - out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis], - kernel_w, - self.padding, - stride_w) - outputs.set_shape(out_shape) + if context.in_graph_mode(): + # Infer the static output shape: + out_shape = inputs.get_shape().as_list() + out_shape[c_axis] = self.filters + out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis], + kernel_d, + self.padding, + stride_d) + out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis], + kernel_h, + self.padding, + stride_h) + out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis], + kernel_w, + self.padding, + stride_w) + outputs.set_shape(out_shape) - if self.bias: + if self.use_bias: outputs_shape = outputs.shape.as_list() if self.data_format == 'channels_first': outputs_4d = array_ops.reshape(outputs, [ diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py index 62f5881f164..1fc2d70f9ca 100644 --- a/tensorflow/python/layers/normalization.py +++ b/tensorflow/python/layers/normalization.py @@ -25,6 +25,7 @@ import six from six.moves import xrange # pylint: disable=redefined-builtin import numpy as np +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import ops @@ -242,15 +243,20 @@ class BatchNormalization(base.Layer): initializer=init_ops.zeros_initializer(), trainable=False) return var + with ops.device(None): - with ops.device(lambda _: self.moving_mean.device): + device = ((lambda _: self.moving_mean.device) + if context.in_graph_mode() else self.moving_mean.device) + with ops.device(device): self.renorm_mean = _renorm_variable('renorm_mean', (param_dim,)) self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ()) # We initialize renorm_stddev to 0, and maintain the 
(0-initialized) # renorm_stddev_weight. This allows us to (1) mix the average # stddev with the minibatch stddev early in training, and (2) compute # the unbiased average stddev by dividing renorm_stddev by the weight. - with ops.device(lambda _: self.moving_variance.device): + device = ((lambda _: self.moving_variance.device) + if context.in_graph_mode() else self.moving_variance.device) + with ops.device(device): self.renorm_stddev = _renorm_variable('renorm_stddev', (param_dim,)) self.renorm_stddev_weight = _renorm_variable( 'renorm_stddev_weight', ()) @@ -301,8 +307,12 @@ class BatchNormalization(base.Layer): self.moving_mean, mean, decay, zero_debias=False) variance_update = moving_averages.assign_moving_average( self.moving_variance, variance, decay, zero_debias=False) - self.add_update(mean_update, inputs=inputs) - self.add_update(variance_update, inputs=inputs) + if context.in_graph_mode(): + # Note that in Eager mode, the updates are already executed when running + # assign_moving_averages. So we do not need to put them into + # collections. + self.add_update(mean_update, inputs=inputs) + self.add_update(variance_update, inputs=inputs) return output @@ -335,6 +345,7 @@ class BatchNormalization(base.Layer): r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r)) d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d)) decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.) + def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" # Update the variables without zero debiasing. The debiasing will be @@ -418,9 +429,9 @@ class BatchNormalization(base.Layer): self.moving_mean, new_mean, decay, zero_debias=False) variance_update = moving_averages.assign_moving_average( self.moving_variance, new_variance, decay, zero_debias=False) - - self.add_update(mean_update, inputs=inputs) - self.add_update(variance_update, inputs=inputs) + if context.in_graph_mode(): + self.add_update(mean_update, inputs=inputs) + self.add_update(variance_update, inputs=inputs) else: mean, variance = self.moving_mean, self.moving_variance @@ -566,7 +577,6 @@ def batch_normalization(inputs, BatchNorm = BatchNormalization batch_norm = batch_normalization - # Helper function diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 2b9306e8748..33ba5df7a6e 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1466,12 +1466,15 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True): with ops.name_scope(name, "zeros_like", [tensor]) as name: tensor = ops.convert_to_tensor(tensor, name="tensor") - if tensor.shape.is_fully_defined(): + # For now, variant types must be created via zeros_like; as we need to + # pass the input variant object to the proper zeros callback. + + if tensor.shape.is_fully_defined() and tensor.dtype != dtypes.variant: # We can produce a zeros tensor independent of the value of 'tensor', # since the shape is known statically. 
return zeros(tensor.shape, dtype=dtype or tensor.dtype, name=name) - if dtype is not None and dtype != tensor.dtype: + if dtype is not None and dtype != tensor.dtype and dtype != dtypes.variant: return zeros( shape_internal(tensor, optimize=optimize), dtype=dtype, name=name) else: diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py index 59add19a581..0c6096a0755 100644 --- a/tensorflow/python/ops/distributions/util.py +++ b/tensorflow/python/ops/distributions/util.py @@ -770,7 +770,7 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"): else: d = math_ops.cast(array_ops.shape(x)[-1], dtype=dtypes.float32) # d = n(n+1)/2 implies n is: - n = math_ops.cast(0.5 * (dtypes.sqrt(1. + 8. * d) - 1.), + n = math_ops.cast(0.5 * (math_ops.sqrt(1. + 8. * d) - 1.), dtype=dtypes.int32) if validate_args: is_valid_input_shape = check_ops.assert_equal( diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py index 64987f93ddb..cb7d409f3bc 100644 --- a/tensorflow/python/ops/gradients_impl.py +++ b/tensorflow/python/ops/gradients_impl.py @@ -278,7 +278,7 @@ def _VerifyGeneratedGradients(grads, op): "inputs %d" % (len(grads), op.node_def, len(op.inputs))) -def _StopOps(from_ops, pending_count): +def _StopOps(from_ops, stop_gradient_ops, pending_count): """The set of ops that terminate the gradient computation. This computes the frontier of the forward graph *before* which backprop @@ -288,8 +288,11 @@ def _StopOps(from_ops, pending_count): `_PendingCount(g, xs, from_ops)`. An 'op' has predecessors in `from_ops` iff pending_count[op._id] > 0. + In addition, none of `stop_gradient_ops` will be differentiated. + Args: from_ops: list of Operations. + stop_gradient_ops: list of Operations never to backprop through. pending_count: List of integers, indexed by operation id. Returns: @@ -304,6 +307,7 @@ def _StopOps(from_ops, pending_count): break if is_stop_op: stop_ops.add(op._id) + stop_ops.update(op._id for op in stop_gradient_ops) # pylint: disable=protected-access return stop_ops @@ -374,17 +378,17 @@ def gradients(ys, name="gradients", colocate_gradients_with_ops=False, gate_gradients=False, - aggregation_method=None): - """Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`. + aggregation_method=None, + stop_gradients=None): + """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`. `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys` is a list of `Tensor`, holding the gradients received by the `ys`. The list must be the same length as `ys`. - `gradients()` adds ops to the graph to output the partial - derivatives of `ys` with respect to `xs`. It returns a list of - `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)` - for y in `ys`. + `gradients()` adds ops to the graph to output the derivatives of `ys` with + respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where + each tensor is the `sum(dy/dx)` for y in `ys`. `grad_ys` is a list of tensors of the same length as `ys` that holds the initial gradients for each y in `ys`. When `grad_ys` is None, @@ -394,6 +398,31 @@ def gradients(ys, one wanted to weight the gradient differently for each value in each y). + `stop_gradients` is a `Tensor` or a list of tensors to be considered constant + with respect to all `xs`. These tensors will not be backpropagated through, + as though they had been explicitly disconnected using `stop_gradient`. 
Among + other things, this allows computation of partial derivatives as opposed to + total derivatives. For example: + + a = tf.constant(0.) + b = 2 * a + g = tf.gradients(a + b, [a, b], stop_gradients=[a, b]) + + Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the + total derivatives `tf.gradients(a + b, [a, b])`, which take into account the + influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is + equivalent to: + + a = tf.stop_gradient(tf.constant(0.)) + b = tf.stop_gradient(2 * a) + g = tf.gradients(a + b, [a, b]) + + `stop_gradients` provides a way of stopping gradient after the graph has + already been constructed, as compared to `tf.stop_gradient` which is used + during graph construction. When the two approaches are combined, + backpropagation stops at both `tf.stop_gradient` nodes and nodes in + `stop_gradients`, whichever is encountered first. + Args: ys: A `Tensor` or list of tensors to be differentiated. xs: A `Tensor` or list of tensors to be used for differentiation. @@ -407,6 +436,8 @@ def gradients(ys, for an operations. This avoids some race conditions. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. + stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate + through. Returns: A list of `sum(dy/dx)` for each x in `xs`. @@ -423,12 +454,15 @@ def gradients(ys, "functions in tf.contrib.eager.backprop instead.") ys = _AsList(ys) xs = _AsList(xs) + stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients) if grad_ys is None: grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) - with ops.name_scope(name, "gradients", ys + xs + grad_ys) as grad_scope: + with ops.name_scope( + name, "gradients", + list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope: ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable) else x @@ -450,6 +484,7 @@ def gradients(ys, ys = [array_ops.identity(y) if y.consumers() else y for y in ys] to_ops = [t.op for t in ys] from_ops = [t.op for t in xs] + stop_gradient_ops = [t.op for t in stop_gradients] pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops) @@ -488,8 +523,7 @@ def gradients(ys, _SetGrad(grads, y, loop_state.ZerosLikeForExit(y)) queue.append(y.op) - # The set of 'from_ops'. - stop_ops = _StopOps(from_ops, pending_count) + stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count) while queue: # generate gradient subgraph for op. op = queue.popleft() diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 11c204b5b7f..7a561d046a8 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -349,6 +349,64 @@ class GradientsTest(test_util.TensorFlowTestCase): g = gradients.gradients([z, z2], x) self.assertAllClose(17502.0, g[0].eval()) + def testPartialDerivatives(self): + with self.test_session(): + x = constant_op.constant(1.) 
+ y = 2 * x + z = x + y + totalg = gradients.gradients(z, [x, y]) + self.assertEqual([3.0, 1.0], [g.eval() for g in totalg]) + partialg = gradients.gradients(z, [x, y], stop_gradients=[x, y]) + self.assertEqual([1.0, 1.0], [g.eval() for g in partialg]) + + def testStopGradients(self): + def _MakeGraph(rng, stop_gradients=()): + def _FunctionOf(xs, k=3): + return ops.convert_to_tensor( + sum(math_ops.matmul(rng.rand(k, k), x) for x in xs) + + rng.rand(k, k)) + + a = _FunctionOf([]) + if "a" in stop_gradients: a = array_ops.stop_gradient(a) + b = _FunctionOf([a]) + if "b" in stop_gradients: b = array_ops.stop_gradient(b) + c = _FunctionOf([a, b]) + if "c" in stop_gradients: c = array_ops.stop_gradient(c) + d = _FunctionOf([b, c]) + if "d" in stop_gradients: d = array_ops.stop_gradient(d) + return dict(a=a, b=b, c=c, d=d) + + def _Gradients(ys, xs, **kwargs): + dydxs = gradients.gradients(ys, xs, **kwargs) + dydxs = [0. * x if dydx is None else dydx + for x, dydx in zip(xs, dydxs)] + return dydxs + + seed = np.random.randint(1000) + cases = [] + subsets = [""] + "a b c d ab ac ad bc bd cd abc abd acd bcd abcd".split() + graph = _MakeGraph(np.random.RandomState(seed)) + for constants in subsets: + graph_with_stops = _MakeGraph(np.random.RandomState(seed), constants) + for variables_ in subsets: + # compute the gradient when stopped using tf.stop_gradients + grad1 = _Gradients([graph_with_stops["d"]], + [graph_with_stops[v] for v in variables_]) + # compute the gradient when stopped using the stop_gradients kwarg + grad2 = _Gradients([graph["d"]], + [graph[v] for v in variables_], + stop_gradients=[graph[v] for v in constants]) + cases.append(dict(grad1=grad1, grad2=grad2, + constants=constants, variables=variables_)) + + # evaluate all tensors in one call to session.run for speed + with self.test_session() as session: + results = session.run([(case["grad1"], case["grad2"]) for case in cases]) + + for (npgrad1, npgrad2), case in zip(results, cases): + for a, b in zip(npgrad1, npgrad2): + np.testing.assert_allclose(a, b) + class FunctionGradientsTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py index 5cd5d7ba2f3..bd879ac4238 100644 --- a/tensorflow/python/ops/io_ops.py +++ b/tensorflow/python/ops/io_ops.py @@ -37,6 +37,7 @@ See the @{$python/io_ops} guide. @@parse_example @@parse_single_example @@parse_tensor +@@serialize_tensor @@decode_json_example @@QueueBase @@FIFOQueue diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py index 3b0a357b164..16320f75849 100644 --- a/tensorflow/python/ops/metrics_impl.py +++ b/tensorflow/python/ops/metrics_impl.py @@ -463,10 +463,16 @@ def _confusion_matrix_at_thresholds( if include not in all_includes: raise ValueError('Invaild key: %s.' 
% include) - predictions, labels, weights = _remove_squeezable_dimensions( - predictions=math_ops.to_float(predictions), - labels=math_ops.cast(labels, dtype=dtypes.bool), - weights=weights) + with ops.control_dependencies([ + check_ops.assert_greater_equal( + predictions, 0.0, message='predictions must be in [0, 1]'), + check_ops.assert_less_equal( + predictions, 1.0, message='predictions must be in [0, 1]') + ]): + predictions, labels, weights = _remove_squeezable_dimensions( + predictions=math_ops.to_float(predictions), + labels=math_ops.cast(labels, dtype=dtypes.bool), + weights=weights) num_thresholds = len(thresholds) diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index e0e3d08e7ce..bf7c9fac8ed 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -40,6 +40,7 @@ from tensorflow.python.platform import tf_logging ops.NotDifferentiable("DecodeRaw") ops.NotDifferentiable("ParseTensor") +ops.NotDifferentiable("SerializeTensor") ops.NotDifferentiable("StringToNumber") diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index 1d747f84008..2cae16f44cc 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -19,11 +19,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from autograd import core as ag_core + from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.framework import variable_pb2 from tensorflow.python.eager import context from tensorflow.python.eager import custom_gradient from tensorflow.python.eager import tape +from tensorflow.python.eager import tensor_node from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape @@ -502,6 +505,8 @@ class ResourceVariable(variables.Variable): def sparse_read(self, indices, name=None): """Reads the value of this variable sparsely, using `gather`.""" with ops.name_scope("Gather" if name is None else name) as name: + if self._trainable: + tape.watch(self._handle) value = resource_gather( self._handle, indices, dtype=self._dtype, name=name) return array_ops.identity(value) @@ -574,7 +579,14 @@ class ResourceVariable(variables.Variable): def _run_op(a, *args): # pylint: disable=protected-access - return getattr(ops.Tensor, operator)(a._AsTensor(), *args) + value = a._AsTensor() + if ag_core.isnode(value): + # This avoids autograd trying to wrap a ResourceVariable. 
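+        # Convert the value and the remaining operands to tensors before
+        # dispatching to the TensorNode implementation of the operator.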
+ value = ops.convert_to_tensor(value) + args = [ops.convert_to_tensor(x) for x in args] + return getattr(tensor_node.TensorNode, operator)(value, *args) + else: + return getattr(ops.Tensor, operator)(value, *args) # Propagate __doc__ to wrapper try: diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py index b1626feb27a..25a0ad0a37e 100644 --- a/tensorflow/python/ops/rnn_cell_impl.py +++ b/tensorflow/python/ops/rnn_cell_impl.py @@ -28,6 +28,7 @@ import collections import hashlib import numbers +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -124,9 +125,10 @@ def _zero_state_tensors(state_size, batch_size, dtype): def get_state_shape(s): """Combine s with batch_size to get a proper tensor shape.""" c = _concat(batch_size, s) - c_static = _concat(batch_size, s, static=True) size = array_ops.zeros(c, dtype=dtype) - size.set_shape(c_static) + if context.in_graph_mode(): + c_static = _concat(batch_size, s, static=True) + size.set_shape(c_static) return size return nest.map_structure(get_state_shape, state_size) diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index 9093c12968f..645775239fd 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1698,7 +1698,7 @@ def variable_scope(name_or_scope, use when doing asynchronous distributed training. Returns: - A scope that can be to captured and reused. + A scope that can be captured and reused. Raises: ValueError: when trying to reuse within a create scope, or create within diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py index 53459496646..a1fe47982f0 100644 --- a/tensorflow/python/profiler/model_analyzer.py +++ b/tensorflow/python/profiler/model_analyzer.py @@ -117,7 +117,7 @@ class Profiler(object): ```python Typical use case: # Currently we are only allowed to create 1 profiler per process. - profiler = Profile(sess.graph) + profiler = Profiler(sess.graph) for i in xrange(total_steps): if i % 10000 == 0: @@ -174,7 +174,7 @@ class Profiler(object): """Add statistics of a step. Args: - step: A step uint64 used to identify the RunMetadata. Must be different + step: int, A step used to identify the RunMetadata. Must be different across different AddStep() calls. run_meta: RunMetadata proto that contains statistics of a session run. """ diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py index a8712fc37e6..00de044505f 100644 --- a/tensorflow/python/tools/import_pb_to_tensorboard.py +++ b/tensorflow/python/tools/import_pb_to_tensorboard.py @@ -51,7 +51,7 @@ def import_to_tensorboard(model_dir, log_dir): pb_visual_writer = summary.FileWriter(log_dir) pb_visual_writer.add_graph(sess.graph) print("Model Imported. Visualize by running: " - "> tensorboard --logdir={}".format(log_dir)) + "tensorboard --logdir={}".format(log_dir)) def main(unused_args): diff --git a/tensorflow/python/training/checkpoint_ops.py b/tensorflow/python/training/checkpoint_ops.py new file mode 100644 index 00000000000..70460ceb480 --- /dev/null +++ b/tensorflow/python/training/checkpoint_ops.py @@ -0,0 +1,453 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operations for generating and loading vocab remappings.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_checkpoint_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops + +ops.NotDifferentiable("GenerateVocabRemapping") +ops.NotDifferentiable("LoadAndRemapMatrix") + + +def _load_and_remap_matrix(ckpt_path, + old_tensor_name, + new_row_vocab_offset, + num_rows_to_load, + new_col_vocab_size, + initializer, + old_row_vocab_file=None, + new_row_vocab_file=None, + old_col_vocab_file=None, + new_col_vocab_file=None, + num_row_oov_buckets=0, + num_col_oov_buckets=0, + max_rows_in_memory=-1): + """Loads a 2-D (matrix) `Tensor` from checkpoint. + + Generates 1D-remappings for rows and columns using the + `GenerateVocabRemapping` op, and initializes any anticipated values with the + provided initializer. Then, uses the `LoadAndRemapMatrix` op to create a + matrix that loads existing values from the checkpoint, while filling out + "missing" values with the newly initialized values. See + contrib/framework/ops/checkpoint_ops.cc for more information on the wrapped + functionality (LoadAndRemapMatrix). This wrapper can be used to perform only + row remapping or only col remapping. If only row remapping is desired, + {new,old}_col_vocab_file should be `None`, and vice versa for column + remapping. + + NOTE: This only supports div-partitioning the vocabulary on the 1st dimension + (row axis) via `new_row_vocab_offset`. + + Args: + ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) + from which the old matrix `Tensor` will be loaded. + old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. + new_row_vocab_offset: A 0-indexed integer representing what line to + start reading at in the new row vocabulary. Used for partitioned + variables. + num_rows_to_load: Number of rows to load for the new vocabulary (note: to + support variable partitioning and partial loading, this does not need to + be the same as the number of entries in `new_row_vocab_file`). + new_col_vocab_size: Number of columns to load - should be the same as the + number of entries in `new_col_vocab_file`, since we don't support + partitioning along the column axis. + initializer: Callable initializer function that accepts a 1-D tensor as the + arg to specify the shape of the returned tensor. Used to initialize + missing values. + old_row_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old row vocabulary file. Can be None, which represents no + remapping on the row axis. 
+ new_row_vocab_file: A scalar `Tensor` of type `string` containing the path + to the new row vocabulary file. Can be None, which represents no remapping + on the row axis - in which case, `new_row_vocab_offset` and + `num_rows_to_load` work under the assumption that the new row vocab is the + same as the old row vocab. + old_col_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old column vocabulary file. Can be None, which represents no + remapping on the column axis. + new_col_vocab_file: A scalar `Tensor` of type `string` containing the path + to the new column vocabulary file. Can be None, which represents no + remapping on the column axis - in which case, `new_col_vocab_size` works + under the assumption that the new col vocab is the same as the old col + vocab. + num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows + to append. Must be >= 0. + num_col_oov_buckets: `int` specifying the number of out-of-vocabulary + columns to append. Must be >= 0. + max_rows_in_memory: `int` specifying the maximum number of rows to load from + the checkpoint at once. If less than or equal to 0, the entire matrix will + be loaded into memory. Setting this arg trades increased disk reads for + lower memory usage. + + Returns: + A Tensor of shape `[num_rows_to_load + num_row_oov_buckets, + new_col_vocab_size + num_col_oov_buckets]`, with values loaded from the + specified tensor in the checkpoint, and any missing or OOV values + initialized with the given `initializer`. + + Raises: + ValueError: If `num_row_oov_buckets` or `num_col_oov_buckets` < 0. + ValueError: If either `old_row_vocab_file` or `new_row_vocab_file` is + provided, while the other is not. Same for `old_col_vocab_file` and + `new_col_vocab_file`. + ValueError: If neither row vocabs or col vocabs are provided. + """ + if num_row_oov_buckets < 0: + raise ValueError("num_row_oov_buckets must be >= 0, but received %d" % + num_row_oov_buckets) + if num_col_oov_buckets < 0: + raise ValueError("num_col_oov_buckets must be >= 0, but received %d" % + num_col_oov_buckets) + + if bool(old_row_vocab_file) != bool(new_row_vocab_file): + raise ValueError( + "old_row_vocab_file and new_row_vocab_file must both be specified or " + "left unspecified. old_row_vocab_file='{}', new_row_vocab_file='{}'". + format(old_row_vocab_file, new_row_vocab_file)) + if bool(old_col_vocab_file) != bool(new_col_vocab_file): + raise ValueError( + "old_col_vocab_file and new_col_vocab_file must both be specified or " + "left unspecified. old_col_vocab_file='{}', new_col_vocab_file='{}'". + format(old_col_vocab_file, new_col_vocab_file)) + + remap_rows = new_row_vocab_file and old_row_vocab_file + remap_cols = new_col_vocab_file and old_col_vocab_file + if not (remap_rows or remap_cols): + raise ValueError( + "Must provide either row or column vocab files. If no remapping is " + "necessary, consider using `tf.contrib.framework.init_from_checkpoint` " + "instead.") + + num_rows_present = num_rows_to_load + if remap_rows: + row_remapping, num_rows_present = ( + gen_checkpoint_ops._generate_vocab_remapping( # pylint: disable=protected-access + new_vocab_file=new_row_vocab_file, + old_vocab_file=old_row_vocab_file, + new_vocab_offset=new_row_vocab_offset, + num_new_vocab=num_rows_to_load)) + else: + # Even when the rows are not being reordered, we still need to generate a + # remapping to account for initializing partitioned Variables (when + # new_row_vocab_offset is non-zero). 
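+      # This identity remapping simply selects rows [new_row_vocab_offset,
+      # new_row_vocab_offset + num_rows_to_load) of the old matrix in order.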
+ row_remapping = math_ops.range( + new_row_vocab_offset, + new_row_vocab_offset + num_rows_to_load, + dtype=dtypes.int64) + + col_remapping = [] + num_cols_present = new_col_vocab_size + if remap_cols: + col_remapping, num_cols_present = ( + gen_checkpoint_ops._generate_vocab_remapping( # pylint: disable=protected-access + new_vocab_file=new_col_vocab_file, + old_vocab_file=old_col_vocab_file, + new_vocab_offset=0, # Offset is unused for cols (no partitioning). + num_new_vocab=new_col_vocab_size)) + + init_vals = initializer([ + num_rows_to_load * new_col_vocab_size - + num_rows_present * num_cols_present, 1 + ]) + return_tensor = gen_checkpoint_ops._load_and_remap_matrix( # pylint: disable=protected-access + ckpt_path=ckpt_path, + old_tensor_name=old_tensor_name, + row_remapping=row_remapping, + col_remapping=col_remapping, + initializing_values=init_vals, + num_rows=num_rows_to_load, + num_cols=new_col_vocab_size, + max_rows_in_memory=max_rows_in_memory) + + # Add OOV row(s) and column(s). + if num_row_oov_buckets > 0: + init_row_oov_val = initializer([num_row_oov_buckets, new_col_vocab_size]) + init_row_oov_val = ops.convert_to_tensor(init_row_oov_val) + return_tensor = array_ops.concat([return_tensor, init_row_oov_val], 0) + if num_col_oov_buckets > 0: + # We need to add any row OOV to the new column shape. + init_col_oov_val = initializer( + [num_rows_to_load + num_row_oov_buckets, num_col_oov_buckets]) + init_col_oov_val = ops.convert_to_tensor(init_col_oov_val) + return_tensor = array_ops.concat([return_tensor, init_col_oov_val], 1) + + return return_tensor + + +def _load_and_remap_matrix_initializer(ckpt_path, + old_tensor_name, + new_row_vocab_size, + new_col_vocab_size, + old_row_vocab_file=None, + new_row_vocab_file=None, + old_col_vocab_file=None, + new_col_vocab_file=None, + num_row_oov_buckets=0, + num_col_oov_buckets=0, + initializer=None, + max_rows_in_memory=-1): + r"""Returns a var initializer for loading and remapping a 2-D (matrix) tensor. + + The returned initializer loads a 2-D (matrix) `Tensor` with name + `old_tensor_name` from the checkpoint at `ckpt_path`. It will reorder the + rows/columns according to the specified vocab files and append additional + out-of-vocabulary rows/columns according to the number of OOV buckets. + + The format of the file at the `{old,new}_{row,col}_vocab_file` path should be + a text file, with each line containing a single entity within the vocabulary. + Let the function `line_of(f, "x")` return the 0-indexed line number of the + entity "x" in file f, and the function `entity_at(f, i)` return the entity at + line i of file f. Then, row i of the new output matrix will be taken from row + `line_of(old_row_vocab_file, entity_at(new_row_vocab_file, i))` of the old + matrix. If any entity in `new_row_vocab_file` is not found in + `old_row_vocab_file`, that row is considered a "missing" row, and its values + will be initialized using the `initializer` arg. The same logic also applies + for the columns. 
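+
+  Put differently, whenever an entity is present in both the old and new vocab
+  files, the new output matrix takes
+  `new_matrix[i, j] = old_matrix[line_of(old_row_vocab_file,
+  entity_at(new_row_vocab_file, i)), line_of(old_col_vocab_file,
+  entity_at(new_col_vocab_file, j))]`.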
+ + For example, assuming that: + + * `old_row_vocab_file` contains "mercury\nvenus\nmars" + * `new_row_vocab_file` contains "venus\njupiter\nmercury" + * `old_col_vocab_file` contains "good\nbetter\nbest" + * `new_col_vocab_file` contains "good\nbest\nfantastic" + * `initializer` returns the natural numbers `[1, 2, 3, 4, ...]` + * `w(i, j)` represents the value from row i, column j of the old matrix + + Then the new output matrix will look like: + + `[[w(1, 0), w(1, 2), 1], + [2, 3, 4], + [w(0, 0), w(0, 2), 5]]` + + If we further specify that: + + * `num_row_oov_buckets` == 2 + * `num_col_oov_buckets` == 1 + + Then the new output matrix will look like: + + `[[w(1, 0), w(1, 2), 1, 12], + [2, 3, 4, 13], + [w(0, 0), w(0, 2), 5, 14], + [6, 7, 8, 15], + [9, 10, 11, 16]]` + + If `{old,new}_row_vocab_file` are None, we assume that the old and new row + vocab files are the same, and no row remapping is done. If + `{old,new}_col_vocab_file` are None, we assume that the old and new column + vocab files are the same, and no column remapping is done. + + The returned initializer only supports div-partitioning along the row axis. It + does not support partitioning along the column axis or mod-partitioning. + + NOTE: When this is used to warm-start variables, client code should use + `tf.lookup.index_table_from_tensor()` like + contrib/layers/python/layers/feature_column.py does, as opposed to + `tf.feature_to_id()` - in order to ensure the underlying lookup tables are the + same. + + Args: + ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) + from which the old matrix `Tensor` will be loaded. + old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. + new_row_vocab_size: `int` specifying the number of entries in + `new_row_vocab_file`. If no row remapping is needed (no row vocab + provided), this should be equal to the number of rows to load from the old + matrix (which can theoretically be smaller than the number of rows in the + old matrix). + new_col_vocab_size: `int` specifying the number of entries in + `new_col_vocab_file`. If no column remapping is needed (no column vocab + provided), this should be equal to the number of columns in the old + matrix. + old_row_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old row vocabulary file. Can be None, which represents no + remapping on the row axis. + new_row_vocab_file: A scalar `Tensor` of type `string` containing the path + to the new row vocabulary file. Can be None, which represents no remapping + on the row axis. + old_col_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old column vocabulary file. Can be None, which represents no + remapping on the column axis. + new_col_vocab_file: A scalar `Tensor` of type `string` containing the path + to the new column vocabulary file. Can be None, which represents no + remapping on the column axis. + num_row_oov_buckets: `int` specifying the number of out-of-vocabulary rows + to append. Must be >= 0. + num_col_oov_buckets: `int` specifying the number of out-of-vocabulary + columns to append. Must be >= 0. + initializer: Initializer function to initialize missing values. Accepts a + 1-D tensor as the arg to specify the shape of the returned tensor. If + `None`, defaults to using `zeros_initializer()`. + max_rows_in_memory: `int` specifying the maximum number of rows to load from + the checkpoint at once. If less than or equal to 0, the entire matrix will + be loaded into memory. 
Setting this arg trades increased disk reads for + lower memory usage. + + Returns: + A variable initializer function that should be used to initialize a + (potentially partitioned) `Variable` whose complete shape is + `[new_row_vocab_size + num_row_oov_buckets, new_col_vocab_size + + num_col_oov_buckets]`. + + Raises: + TypeError: If `initializer` is specified but not callable. + """ + if initializer is None: + # TODO(b/25671353): Consider using sqrt(6/(fan_in + fan_out)) instead, from + # Glorot and Bengio, 2010. + initializer = init_ops.zeros_initializer() + + if not callable(initializer): + raise TypeError( + "initializer must be callable, instead of being {} of type {}.".format( + initializer, type(initializer))) + + def _initializer(shape, dtype=dtypes.float32, partition_info=None): + """Variable initializer. + + Args: + shape: Shape of `Tensor` to return. Should include OOV on both axes. + dtype: Must be float32. + partition_info: variable_scope._PartitionInfo. + + Returns: + `Tensor` of shape `shape`. + + Raises: + TypeError: If `dtype` is anything other than float32. + ValueError: For shape mismatch upon invocation. + """ + # Sanity checks. + if dtype != dtypes.float32: + raise TypeError( + "Currently, only float32 is supported. Received dtype: {}".format( + dtype)) + if len(shape) != 2: + raise ValueError("Expected 2-dim shape, but received: {}".format(shape)) + if shape[0] <= 0: + raise ValueError( + "Expected 1st dim of shape to be > 0, but received shape: {}".format( + shape)) + if shape[1] != (new_col_vocab_size + num_col_oov_buckets): + raise ValueError( + "Expected 2nd dim of shape to be new_col_vocab_size ({}) + " + "num_col_oov_buckets ({}) = {}, but received shape: {}".format( + new_col_vocab_size, num_col_oov_buckets, + new_col_vocab_size + num_col_oov_buckets, shape)) + + offset = 0 + if partition_info is not None: + offset = partition_info.single_offset(shape) + + if offset + shape[0] > new_row_vocab_size + num_row_oov_buckets: + raise ValueError( + "Trying to initialize {} additional rows after {} rows have already " + "been initialized, which would exceed expected total row count of " + "new_row_vocab_size ({}) + num_row_oov_buckets ({}) = {}.".format( + shape[0], offset, new_row_vocab_size, num_row_oov_buckets, + new_row_vocab_size + num_row_oov_buckets)) + + row_oov_buckets_to_use = min(shape[0], + max(0, offset + shape[0] - new_row_vocab_size)) + num_rows_to_load = shape[0] - row_oov_buckets_to_use + + return _load_and_remap_matrix( + ckpt_path=ckpt_path, + old_tensor_name=old_tensor_name, + new_row_vocab_offset=offset, + num_rows_to_load=num_rows_to_load, + new_col_vocab_size=new_col_vocab_size, + initializer=initializer, + old_row_vocab_file=old_row_vocab_file, + new_row_vocab_file=new_row_vocab_file, + old_col_vocab_file=old_col_vocab_file, + new_col_vocab_file=new_col_vocab_file, + num_row_oov_buckets=row_oov_buckets_to_use, + num_col_oov_buckets=num_col_oov_buckets, + max_rows_in_memory=max_rows_in_memory) + + return _initializer + + +def _load_embedding_initializer(ckpt_path, + embedding_tensor_name, + new_vocab_size, + embedding_dim, + old_vocab_file, + new_vocab_file, + num_oov_buckets=0, + initializer=None, + max_rows_in_memory=-1): + """Returns a variable initializer for loading pre-trained embeddings. + + Wrapper around `load_and_remap_matrix_initializer()` specialized for loading + embedding weights and remapping according to the provided vocab files. See + docs for `load_and_remap_matrix_initializer()` for more details. 
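+
+  A minimal usage sketch (the checkpoint path, tensor name and vocab file
+  names below are purely illustrative):
+
+    embedding_initializer = _load_embedding_initializer(
+        ckpt_path='/path/to/model.ckpt-5',
+        embedding_tensor_name='input_layer/embedding_weights',
+        new_vocab_size=100,
+        embedding_dim=16,
+        old_vocab_file='old_vocab.txt',
+        new_vocab_file='new_vocab.txt',
+        num_oov_buckets=1)
+    # Complete shape is [new_vocab_size + num_oov_buckets, embedding_dim].
+    embeddings = tf.get_variable(
+        'embeddings', shape=[101, 16], initializer=embedding_initializer)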
+ + NOTE: Only for use with div-partitioned variables / vocabularies. + + Args: + ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) + from which the old matrix `Tensor` will be loaded. + embedding_tensor_name: Name of the 2-D `Tensor` to load from checkpoint. + new_vocab_size: Number of entries in the new vocab. + embedding_dim: `int` specifying the dimension of the embedding vectors from + the checkpoint. Must match the number of columns in the old embedding + matrix. + old_vocab_file: A scalar `Tensor` of type `string` containing the + path to the old vocabulary file. + new_vocab_file: A scalar `Tensor` of type `string` containing the + path to the new vocabulary file. + num_oov_buckets: `int` specifying the number of out-of-vocabulary + buckets to use. Must be >= 0. + initializer: Initializer function that accepts a 1-D tensor as the arg to + specify the shape of the returned tensor. If `None`, defaults to using + `truncated_normal_initializer()`. + max_rows_in_memory: `int` specifying the maximum number of rows to load from + the checkpoint at once. If less than or equal to 0, the entire matrix will + be loaded into memory. Setting this arg trades increased disk reads for + lower memory usage. + + Returns: + A variable initializer function. + """ + if initializer is None: + # TODO(b/25671353): This should be kept in sync with the stddev used by + # feature_column.py's _EmbeddingColumn. + initializer = init_ops.truncated_normal_initializer( + stddev=1.0 / math.sqrt(embedding_dim)) + + return _load_and_remap_matrix_initializer( + ckpt_path=ckpt_path, + old_tensor_name=embedding_tensor_name, + new_row_vocab_size=new_vocab_size, + new_col_vocab_size=embedding_dim, + old_row_vocab_file=old_vocab_file, + new_row_vocab_file=new_vocab_file, + old_col_vocab_file=None, + new_col_vocab_file=None, + num_row_oov_buckets=num_oov_buckets, + num_col_oov_buckets=0, + initializer=initializer, + max_rows_in_memory=max_rows_in_memory) diff --git a/tensorflow/python/training/checkpoint_ops_test.py b/tensorflow/python/training/checkpoint_ops_test.py new file mode 100644 index 00000000000..39c4d2911f2 --- /dev/null +++ b/tensorflow/python/training/checkpoint_ops_test.py @@ -0,0 +1,305 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Functional tests for Python wrappers around warm-starting.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import numpy as np + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import partitioned_variables +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import checkpoint_ops +from tensorflow.python.training import saver as saver_lib + + +class LoadAndRemapWrappersTest(test.TestCase): + """Tests for the functionality of the Python wrappers.""" + + def setUp(self): + ops.reset_default_graph() + # Create the checkpoint file in a temporary directory. + checkpoint_prefix = os.path.join(self.get_temp_dir(), 'model') + # 0., 1., ..., 79. reshaped into [5, 16]. + initializer = init_ops.constant_initializer( + np.reshape(np.linspace(0.0, 79, 5 * 16), (5, 16))) + with self.test_session() as sess: + with variable_scope.variable_scope('some_scope'): + variable_scope.get_variable(name='embeddings', shape=[5, 16], + initializer=initializer) + sess.run(variables.global_variables_initializer()) + saver = saver_lib.Saver() + saver.save(sess, checkpoint_prefix, global_step=5) + self.checkpoint_file = '{}-5'.format(checkpoint_prefix) + + # Create the vocabulary files. + self.new_feature_vocab_file = os.path.join( + self.get_temp_dir(), 'new_feature_vocab.txt') + with open(self.new_feature_vocab_file, 'w') as f: + f.write('\n'.join(['zero', 'one', 'two', 'three', 'four']) + '\n') + + self.old_feature_vocab_file = os.path.join( + self.get_temp_dir(), 'old_feature_vocab.txt') + with open(self.old_feature_vocab_file, 'w') as f: + f.write('\n'.join(['zero', 'one', 'two', 'three']) + '\n') + + self.new_class_vocab_file = os.path.join( + self.get_temp_dir(), 'new_class_vocab.txt') + with open(self.new_class_vocab_file, 'w') as f: + f.write('\n'.join(['MISSING', 'knitting', 'flask', 'eminem']) + '\n') + + self.old_class_vocab_file = os.path.join( + self.get_temp_dir(), 'old_class_vocab.txt') + with open(self.old_class_vocab_file, 'w') as f: + f.write('\n'.join(['knitting', 'eminem', 'MISSING']) + '\n') + + self.init_val = 42 + + def _init_val_initializer(shape, dtype=None, partition_info=None): + del dtype, partition_info # Unused by this unit-testing initializer. + return array_ops.tile( + constant_op.constant([[self.init_val]], dtype=dtypes.float32), shape) + + self.initializer = _init_val_initializer + + def test_load_and_remap_matrix(self): + """Tests the end-to-end loading / remapping of weights.""" + # _load_and_remap_matrix() is the generalized wrapper that takes in row and + # column vocabulary files, calls the relevant remappings, and returns the + # weight matrix. Take this example to be linear multi-class by providing + # both row and column vocabularies. 
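+    # The checkpoint variable created in setUp() is [5, 16] with value
+    # 16 * row + col in each cell, so the expected entries below are easy to
+    # trace; missing and OOV cells fall back to self.init_val (42).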
+ remapped_matrix = checkpoint_ops._load_and_remap_matrix( + new_row_vocab_file=self.new_feature_vocab_file, + old_row_vocab_file=self.old_feature_vocab_file, + num_rows_to_load=4, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + new_row_vocab_offset=1, + initializer=self.initializer, + num_row_oov_buckets=1, + num_col_oov_buckets=1) + + # [4 in vocab + 1 oov features, 4 in vocab + 1 oov classes]. The offset + # means we read + expected_remapped_matrix = np.concatenate( + [ + np.reshape([18, 34, 50, self.init_val, self.init_val], [5, 1]), + np.reshape([16, 32, 48, self.init_val, self.init_val], [5, 1]), + np.reshape([self.init_val] * 5, [5, 1]), + np.reshape([17, 33, 49, self.init_val, self.init_val], [5, 1]), + np.reshape([self.init_val] * 5, [5, 1]) + ], + axis=1) + + with self.test_session(): + self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval()) + + def test_load_and_remap_output_layer_weight_initializer_linear(self): + """Tests for the output layer initializer in the linear multi-class case.""" + loading_initializer = (checkpoint_ops._load_and_remap_matrix_initializer( + new_row_vocab_size=5, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + new_row_vocab_file=self.new_feature_vocab_file, + old_row_vocab_file=self.old_feature_vocab_file, + num_row_oov_buckets=1, + num_col_oov_buckets=1, + initializer=self.initializer)) + + expected_remapped_matrix = np.concatenate( + [ + np.reshape([2, 18, 34, 50, self.init_val, self.init_val], [6, 1]), + np.reshape([0, 16, 32, 48, self.init_val, self.init_val], [6, 1]), + np.reshape([self.init_val] * 6, [6, 1]), + np.reshape([1, 17, 33, 49, self.init_val, self.init_val], [6, 1]), + np.reshape([self.init_val] * 6, [6, 1]) + ], + axis=1) + + # The new weight matrix is of size + # [5 feature vocab + 1 feature OOV, 4 class vocab + 1 class OOV]. Use a + # partitioned variable to confirm that the offset logic works. + remapped_matrix = variable_scope.get_variable( + name='linear/obtained_weight_matrix', + shape=[6, 5], + initializer=loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_matrix, + remapped_matrix.as_tensor().eval()) + + def test_load_and_remap_output_layer_weight_initializer_dnn_output(self): + """Tests for the output layer initializer in the DNN output case.""" + loading_initializer = (checkpoint_ops._load_and_remap_matrix_initializer( + new_row_vocab_size=5, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + num_col_oov_buckets=1, + initializer=self.initializer)) + + expected_remapped_matrix = np.concatenate( + [ + np.reshape([2, 18, 34, 50, 66], [5, 1]), + np.reshape([0, 16, 32, 48, 64], [5, 1]), + np.reshape([self.init_val] * 5, [5, 1]), + np.reshape([1, 17, 33, 49, 65], [5, 1]), + np.reshape([self.init_val] * 5, [5, 1]) + ], + axis=1) + + # The new weight matrix is of size + # [5-sized input layer, 4 class vocab + 1 class OOV]. 
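+    # A fixed-size partitioner with two shards splits the five rows into
+    # chunks of three and two, exercising the initializer's handling of the
+    # partition offset.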
+ remapped_matrix = variable_scope.get_variable( + name='dnn_output/obtained_weight_matrix', + shape=[5, 5], + initializer=loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_matrix, + remapped_matrix.as_tensor().eval()) + + def test_initializer_with_oov_only_partition(self): + """Tests for the output layer initializer where one partition is all OOV.""" + loading_initializer = (checkpoint_ops._load_and_remap_matrix_initializer( + new_row_vocab_size=5, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + new_row_vocab_file=self.new_feature_vocab_file, + old_row_vocab_file=self.old_feature_vocab_file, + num_row_oov_buckets=5, + num_col_oov_buckets=1, + initializer=self.initializer)) + + expected_remapped_matrix = np.concatenate( + [ + np.reshape([2, 18, 34, 50] + [self.init_val] * 6, [10, 1]), + np.reshape([0, 16, 32, 48] + [self.init_val] * 6, [10, 1]), + np.reshape([self.init_val] * 10, [10, 1]), + np.reshape([1, 17, 33, 49] + [self.init_val] * 6, [10, 1]), + np.reshape([self.init_val] * 10, [10, 1]), + ], + axis=1) + + # The new weight matrix is of size + # [5 feature vocab + 5 feature OOV, 4 class vocab + 1 class OOV]. The + # second partition has only OOV. + remapped_matrix = variable_scope.get_variable( + name='linear_all_oov/obtained_weight_matrix', + shape=[10, 5], + initializer=loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_matrix, + remapped_matrix.as_tensor().eval()) + + def test_load_and_remap_linear_multiclass_initializer_default_init(self): + """Tests where the zeros_initializer default is used for linear.""" + loading_initializer = (checkpoint_ops._load_and_remap_matrix_initializer( + new_row_vocab_size=5, + new_col_vocab_file=self.new_class_vocab_file, + old_col_vocab_file=self.old_class_vocab_file, + new_col_vocab_size=4, + old_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + new_row_vocab_file=self.new_feature_vocab_file, + old_row_vocab_file=self.old_feature_vocab_file, + num_row_oov_buckets=1, + num_col_oov_buckets=1)) + + expected_remapped_matrix = np.concatenate( + [ + np.reshape([2, 18, 34, 50, 0, 0], [6, 1]), + np.reshape([0, 16, 32, 48, 0, 0], [6, 1]), + np.reshape([0] * 6, [6, 1]), + np.reshape([1, 17, 33, 49, 0, 0], [6, 1]), + np.reshape([0] * 6, [6, 1]) + ], + axis=1) + + remapped_matrix = variable_scope.get_variable( + name='linear_init_fallback/obtained_weight_matrix', + shape=[6, 5], + initializer=loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_matrix, + remapped_matrix.as_tensor().eval()) + + def test_load_embedding_initializer(self): + """Tests for the load_embedding_initializer wrapper.""" + embedding_loading_initializer = (checkpoint_ops._load_embedding_initializer( + new_vocab_file=self.new_feature_vocab_file, + old_vocab_file=self.old_feature_vocab_file, + new_vocab_size=5, + embedding_dim=16, + embedding_tensor_name='some_scope/embeddings', + ckpt_path=[self.checkpoint_file], + num_oov_buckets=1, + initializer=self.initializer)) + 
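+    # New-vocab rows 0-3 ('zero'..'three') exist in the old vocab and keep the
+    # checkpoint values 0..63; the new row 'four' and the single OOV bucket row
+    # are filled with the custom initializer value (42).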
+ expected_remapped_embeddings = np.concatenate( + [ + np.reshape(range(64), [4, 16]), + np.reshape([self.init_val] * 32, [2, 16]), + ], + axis=0) + + # The new weight matrix is of size + # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the + # last vocab row (2nd last row) is newly initialized (wasn't found in + # previous vocab) and the actual last row is OOV and also newly initialized. + # Use a partitioned variable to confirm that the offset logic works. + remapped_embeddings = variable_scope.get_variable( + name='embedding/obtained_embedding_matrix', + shape=[6, 16], + initializer=embedding_loading_initializer, + partitioner=partitioned_variables.fixed_size_partitioner(2)) + + with self.test_session(): + variables.global_variables_initializer().run() + self.assertAllClose(expected_remapped_embeddings, + remapped_embeddings.as_tensor().eval()) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds index af5341a8d62..bddb87f00cb 100644 --- a/tensorflow/tf_exported_symbols.lds +++ b/tensorflow/tf_exported_symbols.lds @@ -2,4 +2,5 @@ *perftools*gputools* *tf_* *TF_* +*TFE_* *nsync_* diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds index aeb7d66b321..11f66c5c8b2 100644 --- a/tensorflow/tf_version_script.lds +++ b/tensorflow/tf_version_script.lds @@ -3,6 +3,7 @@ tensorflow { *tensorflow*; *perftools*gputools*; *TF_*; + *TFE_*; *nsync_*; local: *; diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt index 667ae5cf6e5..8893594dc34 100644 --- a/tensorflow/tools/api/golden/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.pbtxt @@ -1098,7 +1098,7 @@ tf_module { } member_method { name: "gradients" - argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\'], " + argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\'], " } member_method { name: "greater" @@ -1684,6 +1684,10 @@ tf_module { name: "serialize_sparse" argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "serialize_tensor" + argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "set_random_seed" argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py index 6f3c3f15103..4405678a6b8 100755 --- a/tensorflow/tools/ci_build/update_version.py +++ b/tensorflow/tools/ci_build/update_version.py @@ -277,8 +277,8 @@ def check_for_lingering_string(lingering_string): formatted_string = lingering_string.replace(".", r"\.") try: linger_str_output = subprocess.check_output( - ['grep', '-rnoH', formatted_string, TF_SRC_DIR]) - linger_strs = linger_str_output.decode('utf8').split("\n") + ["grep", "-rnoH", formatted_string, TF_SRC_DIR]) + linger_strs = linger_str_output.decode("utf8").split("\n") except subprocess.CalledProcessError: linger_strs = [] diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 
34cb19f9cb7..d62316964f8 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -158,6 +158,7 @@ sh_binary( "//tensorflow/contrib/ndlstm:ndlstm", "//tensorflow/contrib/nn:nn_py", "//tensorflow/contrib/predictor:predictor_pip", + "//tensorflow/contrib/receptive_field:receptive_field_pip", "//tensorflow/contrib/session_bundle:session_bundle_pip", "//tensorflow/contrib/signal:signal_py", "//tensorflow/contrib/slim:slim", diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 2c7acd809a8..ef342fe1272 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -573,11 +573,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): patched_http_archive( name = "boringssl", urls = [ - "http://mirror.bazel.build/github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz", - "https://github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz", # 2016-07-11 + "http://mirror.bazel.build/github.com/google/boringssl/archive/e3860009a091cd1bd2bc189cdbc3c6d095abde84.tar.gz", + "https://github.com/google/boringssl/archive/e3860009a091cd1bd2bc189cdbc3c6d095abde84.tar.gz", # 2017-07-07 ], - sha256 = "025264d6e9a7ad371f2f66d17a28b6627de0c9592dc2eb54afd062f68f1f9aa3", - strip_prefix = "boringssl-bbcaa15b0647816b9a1a9b9e0d209cd6712f0105", + sha256 = "02f5950f93c4fd3691771c07c9d04cf2999ab01383ff99da345249e93b0fcfb2", + strip_prefix = "boringssl-e3860009a091cd1bd2bc189cdbc3c6d095abde84", # Add patch to boringssl code to support s390x patch_file = str(Label("//third_party/boringssl:add_boringssl_s390x.patch")), ) @@ -683,11 +683,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""): native.new_http_archive( name = "cub_archive", urls = [ - "http://mirror.bazel.build/github.com/NVlabs/cub/archive/69ceda618313df8e9cac6659d607b08949455d14.tar.gz", - "https://github.com/NVlabs/cub/archive/69ceda618313df8e9cac6659d607b08949455d14.tar.gz", + "http://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.3.zip", + "https://github.com/NVlabs/cub/archive/1.7.3.zip", ], - sha256 = "87e856522c283b8ea887c3b61d7d5b252d2dd74abac4f1d756d776e721223e82", - strip_prefix = "cub-69ceda618313df8e9cac6659d607b08949455d14", + sha256 = "b7ead9e291d34ffa8074243541c1380d63be63f88de23de8ee548db573b72ebe", + strip_prefix = "cub-1.7.3", build_file = str(Label("//third_party:cub.BUILD")), ) @@ -699,9 +699,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""): native.http_archive( name = "bazel_toolchains", urls = [ - "http://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/bccee4855c049d34bac481083b4c68e2fab8cc50.tar.gz", - "https://github.com/bazelbuild/bazel-toolchains/archive/bccee4855c049d34bac481083b4c68e2fab8cc50.tar.gz", + "http://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/9dbd803ad3b9447430a296810197b09b3a710956.tar.gz", + "https://github.com/bazelbuild/bazel-toolchains/archive/9dbd803ad3b9447430a296810197b09b3a710956.tar.gz", ], - sha256 = "3903fd93b96b42067e00b7973a2c16c34e761ad7a0b55e1557d408f352849e41", - strip_prefix = "bazel-toolchains-bccee4855c049d34bac481083b4c68e2fab8cc50", + sha256 = "0799aa12db5260a499beb40f81744e760c59d055bfc5d271dd2c2ed4d5419faa", + strip_prefix = "bazel-toolchains-9dbd803ad3b9447430a296810197b09b3a710956", ) diff --git a/third_party/boringssl/add_boringssl_s390x.patch b/third_party/boringssl/add_boringssl_s390x.patch index 9a34a59a1d1..8b42d10e687 100644 --- a/third_party/boringssl/add_boringssl_s390x.patch +++ 
b/third_party/boringssl/add_boringssl_s390x.patch @@ -3,9 +3,9 @@ index 7a3adfb..88012ad 100644 --- a/src/include/openssl/base.h +++ b/src/include/openssl/base.h @@ -94,6 +94,8 @@ extern "C" { - #elif defined(__pnacl__) - #define OPENSSL_32_BIT #define OPENSSL_PNACL + #elif defined(__myriad2__) + #define OPENSSL_32_BIT +#elif defined(__s390x__) +#define OPENSSL_64_BIT #else diff --git a/third_party/gpus/crosstool/BUILD.tpl b/third_party/gpus/crosstool/BUILD.tpl index 7d8b6005135..98cb326572e 100644 --- a/third_party/gpus/crosstool/BUILD.tpl +++ b/third_party/gpus/crosstool/BUILD.tpl @@ -12,12 +12,12 @@ cc_toolchain_suite( cc_toolchain( name = "cc-compiler-local", - all_files = ":crosstool_wrapper_driver_is_not_gcc", + all_files = "%{linker_files}", compiler_files = ":empty", cpu = "local", dwp_files = ":empty", dynamic_runtime_libs = [":empty"], - linker_files = ":crosstool_wrapper_driver_is_not_gcc", + linker_files = "%{linker_files}", objcopy_files = ":empty", static_runtime_libs = [":empty"], strip_files = ":empty", @@ -30,12 +30,12 @@ cc_toolchain( cc_toolchain( name = "cc-compiler-darwin", - all_files = ":crosstool_wrapper_driver_is_not_gcc", + all_files = "%{linker_files}", compiler_files = ":empty", cpu = "darwin", dwp_files = ":empty", dynamic_runtime_libs = [":empty"], - linker_files = ":crosstool_wrapper_driver_is_not_gcc", + linker_files = "%{linker_files}", objcopy_files = ":empty", static_runtime_libs = [":empty"], strip_files = ":empty", diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index b85e565f362..4a0f4710881 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -971,7 +971,6 @@ def _create_local_cuda_repository(repository_ctx): ' ":cudnn-include",') }) # Set up crosstool/ - _file(repository_ctx, "crosstool:BUILD") cc = find_cc(repository_ctx) host_compiler_includes = _host_compiler_includes(repository_ctx, cc) cuda_defines = { @@ -981,11 +980,14 @@ def _create_local_cuda_repository(repository_ctx): } if _use_cuda_clang(repository_ctx): cuda_defines["%{clang_path}"] = cc + _tpl(repository_ctx, "crosstool:BUILD", {"%{linker_files}": ":empty"}) _tpl(repository_ctx, "crosstool:CROSSTOOL_clang", cuda_defines, out="crosstool/CROSSTOOL") else: nvcc_path = str(repository_ctx.path("%s/bin/nvcc%s" % (cuda_config.cuda_toolkit_path, ".exe" if cuda_config.cpu_value == "Windows" else ""))) + _tpl(repository_ctx, "crosstool:BUILD", + {"%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc"}) _tpl(repository_ctx, "crosstool:CROSSTOOL_nvcc", cuda_defines, out="crosstool/CROSSTOOL") _tpl(repository_ctx, "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",