From 9b9e5989d247d274c4137db533e43b95d825acfc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 17 Aug 2017 17:03:26 -0700 Subject: [PATCH 01/70] Add a call_logit_fn utility for logit_fn's, similar to Estimator's _call_model_fn. PiperOrigin-RevId: 165649388 --- tensorflow/contrib/learn/BUILD | 12 ++++ .../learn/python/learn/estimators/__init__.py | 1 + .../python/learn/estimators/logit_fns.py | 39 +++++++++++- .../python/learn/estimators/logit_fns_test.py | 60 +++++++++++++++++++ 4 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 tensorflow/contrib/learn/python/learn/estimators/logit_fns_test.py diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 978ebfef77f..c2e74d1cc2e 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -119,6 +119,18 @@ py_test( ], ) +py_test( + name = "logit_fns_test", + size = "small", + srcs = ["python/learn/estimators/logit_fns_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":learn", + "//tensorflow/python:client_testlib", + "//tensorflow/python/estimator:model_fn", + ], +) + py_test( name = "estimators_test", size = "small", diff --git a/tensorflow/contrib/learn/python/learn/estimators/__init__.py b/tensorflow/contrib/learn/python/learn/estimators/__init__.py index 42943fdd3ac..9d63d7dcd0b 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/__init__.py +++ b/tensorflow/contrib/learn/python/learn/estimators/__init__.py @@ -321,6 +321,7 @@ from tensorflow.contrib.learn.python.learn.estimators.linear import LinearClassi from tensorflow.contrib.learn.python.learn.estimators.linear import LinearEstimator from tensorflow.contrib.learn.python.learn.estimators.linear import LinearRegressor from tensorflow.contrib.learn.python.learn.estimators.logistic_regressor import LogisticRegressor +from tensorflow.contrib.learn.python.learn.estimators.logit_fns import call_logit_fn from tensorflow.contrib.learn.python.learn.estimators.logit_fns import dnn_logit_fn_builder from tensorflow.contrib.learn.python.learn.estimators.logit_fns import linear_logit_fn_builder from tensorflow.contrib.learn.python.learn.estimators.metric_key import MetricKey diff --git a/tensorflow/contrib/learn/python/learn/estimators/logit_fns.py b/tensorflow/contrib/learn/python/learn/estimators/logit_fns.py index f04a47b29af..110ea0302e7 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/logit_fns.py +++ b/tensorflow/contrib/learn/python/learn/estimators/logit_fns.py @@ -21,7 +21,7 @@ should follow the following signature: Args: `features`: This is the first item returned from the `input_fn` passed to `train`, `evaluate`, and `predict`. This should be a single - `Tensor` or `dict` of same. + `Tensor` or `dict` of same, and is the only required argument. `mode`: Optional. Specifies if this training, evaluation or prediction. See `ModeKeys`. `params`: Optional `dict` of hyperparameters. 
Will receive what is passed to @@ -39,10 +39,47 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.estimator import util from tensorflow.python.estimator.canned import dnn as dnn_core from tensorflow.python.estimator.canned import linear as linear_core +from tensorflow.python.framework import ops # pylint: disable=protected-access dnn_logit_fn_builder = dnn_core._dnn_logit_fn_builder linear_logit_fn_builder = linear_core._linear_logit_fn_builder # pylint: enable=protected-access + + +def call_logit_fn(logit_fn, features, mode, params, config): + """Calls logit_fn. + + A utility function that calls the provided logit_fn with the relevant subset + of provided arguments. Similar to tf.estimator._call_model_fn(). + + Args: + logit_fn: A logit_fn as defined above. + features: The features dict. + mode: TRAIN / EVAL / PREDICT ModeKeys. + params: The hyperparameter dict. + config: The configuration object. + + Returns: + A logit Tensor, the output of logit_fn. + + Raises: + ValueError: if logit_fn does not return a Tensor. + """ + logit_fn_args = util.fn_args(logit_fn) + kwargs = {} + if 'mode' in logit_fn_args: + kwargs['mode'] = mode + if 'params' in logit_fn_args: + kwargs['params'] = params + if 'config' in logit_fn_args: + kwargs['config'] = config + logit_fn_results = logit_fn(features=features, **kwargs) + + if not isinstance(logit_fn_results, ops.Tensor): + raise ValueError('model_fn should return a Tensor.') + + return logit_fn_results diff --git a/tensorflow/contrib/learn/python/learn/estimators/logit_fns_test.py b/tensorflow/contrib/learn/python/learn/estimators/logit_fns_test.py new file mode 100644 index 00000000000..01616d1a7ff --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/estimators/logit_fns_test.py @@ -0,0 +1,60 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""logit_fn tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.learn.python.learn.estimators import logit_fns +from tensorflow.python.client import session +from tensorflow.python.estimator import model_fn +from tensorflow.python.framework import constant_op +from tensorflow.python.platform import test + + +class LogitFnTest(test.TestCase): + + def test_simple_call_logit_fn(self): + def dummy_logit_fn(features, mode): + if mode == model_fn.ModeKeys.TRAIN: + return features['f1'] + else: + return features['f2'] + features = { + 'f1': constant_op.constant([2., 3.]), + 'f2': constant_op.constant([4., 5.]) + } + logit_fn_result = logit_fns.call_logit_fn( + dummy_logit_fn, features, model_fn.ModeKeys.EVAL, 'fake_params', + 'fake_config') + with session.Session(): + self.assertAllClose([[4., 5.]], logit_fn_result.eval()) + + def test_should_return_tensor(self): + + def invalid_logit_fn(features, params): + return { + 'tensor1': features['f1'] * params['input_multiplier'], + 'tensor2': features['f2'] * params['input_multiplier'] + } + features = { + 'f1': constant_op.constant([2., 3.]), + 'f2': constant_op.constant([4., 5.]) + } + params = {'learning_rate': 0.001, 'input_multiplier': 2.0} + with self.assertRaisesRegexp(ValueError, 'model_fn should return a Tensor'): + logit_fns.call_logit_fn(invalid_logit_fn, features, 'fake_mode', params, + 'fake_config') From a3c4e980e00e9c332a4e9f8c232fb2a1cc2f5694 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Thu, 17 Aug 2017 17:05:12 -0700 Subject: [PATCH 02/70] Fixed input shape for freezing audio graphs PiperOrigin-RevId: 165649546 --- tensorflow/examples/speech_commands/freeze.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/examples/speech_commands/freeze.py b/tensorflow/examples/speech_commands/freeze.py index 381f3d029e5..6d2f2102625 100644 --- a/tensorflow/examples/speech_commands/freeze.py +++ b/tensorflow/examples/speech_commands/freeze.py @@ -90,9 +90,14 @@ def create_inference_graph(wanted_words, sample_rate, clip_duration_ms, spectrogram, decoded_sample_data.sample_rate, dct_coefficient_count=dct_coefficient_count) + fingerprint_frequency_size = model_settings['dct_coefficient_count'] + fingerprint_time_size = model_settings['spectrogram_length'] + reshaped_input = tf.reshape(fingerprint_input, [ + -1, fingerprint_time_size * fingerprint_frequency_size + ]) logits = models.create_model( - fingerprint_input, model_settings, model_architecture, is_training=False) + reshaped_input, model_settings, model_architecture, is_training=False) # Create an output to use for inference. tf.nn.softmax(logits, name='labels_softmax') From 8c0853db731cf80cfeec9dfb4edab95961aaa585 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 17 Aug 2017 17:09:29 -0700 Subject: [PATCH 03/70] Add a test for negative and zero pow() input. 
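For reference, the new cases pin down IEEE-754 pow semantics: a negative base
with a non-integer exponent has no real result and yields NaN, and a zero base
with a negative exponent diverges to +infinity. A minimal standalone sketch of
the same expectations (illustration only, not part of the change; like the
test, it assumes fast math is disabled):

    #include <cassert>
    #include <cmath>

    int main() {
      // Negative base with a non-integer exponent: no real result -> NaN.
      assert(std::isnan(std::pow(-2.0f, 0.5f)));
      assert(std::isnan(std::pow(-0.6f, 0.6f)));
      assert(std::isnan(std::pow(-0.6f, -0.6f)));
      // Zero base with a negative exponent: pole -> +infinity.
      assert(std::isinf(std::pow(0.0f, -0.6f)));
      return 0;
    }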
PiperOrigin-RevId: 165650096 --- .../compiler/xla/tests/array_elementwise_ops_test.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 192477555d0..532e2394c0d 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -785,6 +785,17 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowF32s) { &builder, {16.0f, 0.25f, 8.0f, NAN, NAN, -8.0f, 16.0f}, {}, error_spec_); } +XLA_TEST_F(ArrayElementwiseOpTest, PowNonIntegerF32s) { + SetFastMathDisabled(true); + ComputationBuilder builder(client_, TestName()); + auto lhs = builder.ConstantR1({-2.0f, -0.6f, -0.6f, 0.0f}); + auto rhs = builder.ConstantR1({0.5f, 0.6f, -0.6f, -0.6f}); + auto minimum = builder.Pow(lhs, rhs); + + ComputeAndCompareR1(&builder, {NAN, NAN, NAN, INFINITY}, {}, + error_spec_); +} + XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) { ComputationBuilder builder(client_, TestName()); auto lhs = builder.ConstantR1({}); From 19a55725af8102d72d4e081c5139f0e4bd5a4bb7 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Thu, 17 Aug 2017 17:20:17 -0700 Subject: [PATCH 04/70] Allowing functions to run across devices. This change expands the ProcessFunctionLibraryRuntime library to Instantiate and Run functions on different devices. When a FunctionLibraryRuntime encounters a function with a target that is another device, it delegates Instantiate() and Run() calls to the ProcessFunctionLibraryRuntime. This change also moves the table_ containing all function instantiations to the PFLR instead of the FunctionLibraryRuntime. PiperOrigin-RevId: 165651194 --- tensorflow/c/eager/c_api.cc | 18 +- .../jit/encapsulate_subgraphs_pass.cc | 17 +- .../compiler/jit/mark_for_compilation_pass.cc | 11 +- tensorflow/compiler/tf2xla/xla_compiler.cc | 25 ++- tensorflow/compiler/tf2xla/xla_compiler.h | 8 +- tensorflow/contrib/cmake/tf_tests.cmake | 1 + .../contrib/data/python/kernel_tests/BUILD | 22 ++ .../kernel_tests/iterator_ops_cluster_test.py | 109 ++++++++++ .../python/kernel_tests/iterator_ops_test.py | 58 +++++ tensorflow/core/BUILD | 15 +- tensorflow/core/common_runtime/function.cc | 107 ++++++--- tensorflow/core/common_runtime/function.h | 14 +- .../core/common_runtime/function_test.cc | 203 ++++++++++-------- .../core/common_runtime/function_testlib.cc | 58 +++++ .../core/common_runtime/function_testlib.h | 31 +++ .../process_function_library_runtime.cc | 107 ++++++++- .../process_function_library_runtime.h | 55 ++++- .../process_function_library_runtime_test.cc | 129 ++++++++++- tensorflow/core/framework/function.h | 8 + tensorflow/core/framework/function_testlib.cc | 7 + tensorflow/core/framework/function_testlib.h | 18 ++ .../core/grappler/grappler_item_builder.cc | 11 +- tensorflow/core/kernels/captured_function.cc | 32 +-- tensorflow/core/kernels/captured_function.h | 13 +- tensorflow/core/kernels/function_ops.cc | 62 ++++++ tensorflow/core/ops/functional_ops.cc | 19 ++ .../kernel_tests/functional_ops_test.py | 54 +++++ 27 files changed, 1015 insertions(+), 197 deletions(-) create mode 100644 tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py create mode 100644 tensorflow/core/common_runtime/function_testlib.cc create mode 100644 tensorflow/core/common_runtime/function_testlib.h diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 22d5f233c31..b1baa5ce125 100644 --- 
a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -64,19 +64,14 @@ struct TFE_Context {
   // One FunctionLibraryRuntime per device.
   // func_libs[i] is the FunctionLibraryRuntime corresponding to
   // session->devices[i].
-  std::vector<std::unique_ptr<tensorflow::FunctionLibraryRuntime>> func_libs;
+  std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
 
   std::unordered_map kernel_cache;
 
   tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) {
-    for (int i = 0; i < session->devices.size(); ++i) {
-      if (session->devices[i] == d) {
-        return func_libs[i].get();
-      }
-    }
-    return nullptr;
+    return pflr->GetFLR(d->name());
   }
 
   const std::vector<tensorflow::Device*>& devices() { return session->devices; }
@@ -132,12 +127,9 @@ TFE_Context* TFE_NewContext(const TF_SessionOptions* opts, TF_Status* status) {
   }
 
   TFE_Context* ret = new TFE_Context(session);
-  ret->func_libs.resize(ret->devices().size());
-  for (int i = 0; i < ret->devices().size(); ++i) {
-    ret->func_libs[i] = tensorflow::NewFunctionLibraryRuntime(
-        ret->session->device_mgr, opts->options.env, ret->devices()[i],
-        TF_GRAPH_DEF_VERSION, &ret->func_lib_def, {});
-  }
+  ret->pflr.reset(new tensorflow::ProcessFunctionLibraryRuntime(
+      ret->session->device_mgr, opts->options.env, TF_GRAPH_DEF_VERSION,
+      &ret->func_lib_def, {}));
   ret->rendezvous =
       new tensorflow::IntraProcessRendezvous(ret->session->device_mgr);
 
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index a1ddad3e9b8..22899ebeebc 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -624,15 +624,18 @@ Status EncapsulateSubgraphsPass::Run(
   FunctionLibraryDefinition* const library = options.flib_def;
 
   OptimizerOptions opts;
-  std::unique_ptr<FunctionLibraryRuntime> flr(
-      NewFunctionLibraryRuntime(nullptr, options.session_options->env, nullptr,
-                                TF_GRAPH_DEF_VERSION, library, opts));
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
+      new ProcessFunctionLibraryRuntime(nullptr, options.session_options->env,
+                                        TF_GRAPH_DEF_VERSION, library, opts));
+  FunctionLibraryRuntime* flr =
+      pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
 
-  auto rewrite_subgraph = [&flr](
-      std::unique_ptr<Graph>* subgraph, std::vector<int>* input_permutation,
-      std::vector<int>* output_permutation, NodeDef* node) {
+  auto rewrite_subgraph = [flr](std::unique_ptr<Graph>* subgraph,
+                                std::vector<int>* input_permutation,
+                                std::vector<int>* output_permutation,
+                                NodeDef* node) {
     // Optimize the subgraph.
-    OptimizeGraph(flr.get(), subgraph);
+    OptimizeGraph(flr, subgraph);
 
     const int num_args = input_permutation->size();
     std::vector<bool> const_args(num_args);
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 77b45aa11e2..2fe190e605f 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -176,8 +176,11 @@ Status FindCompilationCandidates(
     const std::function<bool(const Node*, const DeviceType&)>& is_compilable_fn,
     std::unordered_set<Node*>* candidates) {
   OptimizerOptions opts;
-  std::unique_ptr<FunctionLibraryRuntime> lib_runtime(NewFunctionLibraryRuntime(
-      nullptr, env, nullptr, TF_GRAPH_DEF_VERSION, flib_def, opts));
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
+      new ProcessFunctionLibraryRuntime(nullptr, env, TF_GRAPH_DEF_VERSION,
+                                        flib_def, opts));
+  FunctionLibraryRuntime* lib_runtime =
+      pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
 
   for (Node* node : graph.op_nodes()) {
     DeviceType device_type("");
@@ -191,7 +194,7 @@ Status FindCompilationCandidates(
         XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration));
     DeviceType jit_device_type(registration->compilation_device_name);
     if (!HasXLAKernel(*node, jit_device_type) &&
-        !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime.get())) {
+        !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime)) {
       VLOG(2) << "Compilation rejected node: unsupported op " << node->name()
               << ": " << node->type_string();
       continue;
@@ -203,7 +206,7 @@ Status FindCompilationCandidates(
       continue;
     }
     if (node->type_string() == "While" &&
-        !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime.get())) {
+        !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime)) {
       continue;
     }
     candidates->insert(node);
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index d9bfaa93322..ae13147a18e 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -88,15 +88,18 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options)
   }
 
   local_flib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(),
+                                                      FunctionDefLibrary{}));
-  local_flib_runtime_ = NewFunctionLibraryRuntime(
-      &device_mgr_, Env::Default(), device_, options.graph_def_version,
+  local_pflr_.reset(new ProcessFunctionLibraryRuntime(
+      &device_mgr_, Env::Default(), options.graph_def_version,
       local_flib_def_.get(), OptimizerOptions(),
-      nullptr /* custom_kernel_creator */);
-  flib_runtime_ = NewFunctionLibraryRuntime(
-      &device_mgr_, Env::Default(), device_, options.graph_def_version,
-      options.flib_def, OptimizerOptions(),
-      nullptr /* custom_kernel_creator */);
+      nullptr /* custom_kernel_creator */));
+  pflr_.reset(new ProcessFunctionLibraryRuntime(
+      &device_mgr_, Env::Default(), options.graph_def_version, options.flib_def,
+      OptimizerOptions(), nullptr /* custom_kernel_creator */));
+
+  local_flib_runtime_ = local_pflr_->GetFLR(device_->name());
+  flib_runtime_ = pflr_->GetFLR(device_->name());
 }
 
 XlaCompiler::~XlaCompiler() = default;
@@ -137,8 +140,8 @@ Status XlaCompiler::CompileFunction(
   }
 
   const FunctionBody* fbody;
-  if (!GetFunctionBody(function, local_flib_runtime_.get(), &fbody).ok()) {
-    TF_RETURN_IF_ERROR(GetFunctionBody(function, flib_runtime_.get(), &fbody));
+  if (!GetFunctionBody(function, local_flib_runtime_, &fbody).ok()) {
+    TF_RETURN_IF_ERROR(GetFunctionBody(function, flib_runtime_, &fbody));
   }
 
   TF_RETURN_IF_ERROR(CheckSignature(fbody->arg_types, args));
@@ -159,7 +162,7 @@ Status XlaCompiler::CompileFunction(
   opts.set_do_function_inlining(true);
   opts.set_do_constant_folding(true);
   GraphOptimizer optimizer(opts);
-  optimizer.Optimize(flib_runtime_.get(), flib_runtime_->env(),
+  optimizer.Optimize(flib_runtime_, flib_runtime_->env(),
                      /*device=*/nullptr, &graph, /*shape_map=*/nullptr);
 
   VLOG(1) << "====================================================";
@@ -464,7 +467,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   context->set_args(std::move(context_args));
   TF_RETURN_IF_ERROR(ExecuteGraph(context, std::move(graph), device_,
-                                  flib_runtime_.get(), NextStepId()));
+                                  flib_runtime_, NextStepId()));
 
   int num_nonconst_outputs;
   int num_computation_outputs;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 317f635bcbe..b5987c8ac8b 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -276,7 +276,7 @@ class XlaCompiler {
   xla::Client* client() const { return options_.client; }
   XlaCompilationDevice* device() const { return device_; }
   const DeviceMgr* device_mgr() const { return &device_mgr_; }
-  FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_.get(); }
+  FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_; }
 
   // Retrieves the channel handle associated with `key`. Allocates
   // a new channel handle if none exists.
@@ -303,9 +303,11 @@ class XlaCompiler {
   // library and runtime for functions created as part of the functionalize
   // control flow transformation.
   std::unique_ptr<FunctionLibraryDefinition> local_flib_def_;
-  std::unique_ptr<FunctionLibraryRuntime> local_flib_runtime_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> local_pflr_;
 
-  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
+  FunctionLibraryRuntime* local_flib_runtime_;  // owned by local_pflr_.
+  FunctionLibraryRuntime* flib_runtime_;        // owned by pflr_.
 
   struct SignatureHash {
     uint64 operator()(
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 8ed5c154bfd..25f00de81dd 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -241,6 +241,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py" # depends on python/framework/test_ops
     # Broken tensorboard test due to cmake issues.
     "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py" # Needs portpicker
     # tensor_forest tests (also note that we exclude the hybrid tests for now)
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py" # Results in wrong order.
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py" # Results in wrong order.
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 25b419557e5..d9a3079b87c 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -21,6 +21,7 @@ py_test( "//tensorflow/python:dtypes", "//tensorflow/python:errors", "//tensorflow/python:framework_ops", + "//tensorflow/python:functional_ops", "//tensorflow/python:gradients", "//tensorflow/python:math_ops", "//tensorflow/python:training", @@ -28,6 +29,27 @@ py_test( ], ) +py_test( + name = "iterator_ops_cluster_test", + size = "small", + srcs = ["iterator_ops_cluster_test.py"], + srcs_version = "PY2AND3", + tags = ["no_windows"], + deps = [ + "//tensorflow/contrib/data/python/ops:dataset_ops", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client", + "//tensorflow/python:client_testlib", + "//tensorflow/python:dtypes", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:functional_ops", + "//tensorflow/python:training", + "//third_party/py/numpy", + ], +) + py_test( name = "batch_dataset_op_test", size = "small", diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py new file mode 100644 index 00000000000..faad6e925d7 --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py @@ -0,0 +1,109 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for the experimental input pipeline ops that need test_util.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.data.python.ops import dataset_ops +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import function +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import functional_ops +from tensorflow.python.platform import test + + +class IteratorClusterTest(test.TestCase): + + def testRemoteIteratorWithoutRemoteCallFail(self): + worker_config = config_pb2.ConfigProto() + worker_config.device_count["CPU"] = 2 + worker, _ = test_util.create_local_cluster( + 1, 1, worker_config=worker_config) + + with ops.device("/job:worker/replica:0/task:0/cpu:1"): + dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3]) + iterator_3 = dataset_3.make_one_shot_iterator() + iterator_3_handle = iterator_3.string_handle() + + with ops.device("/job:worker/replica:0/task:0/cpu:0"): + remote_it = dataset_ops.Iterator.from_string_handle( + iterator_3_handle, dataset_3.output_types, dataset_3.output_shapes) + get_next_op = remote_it.get_next() + + with session.Session(worker[0].target) as sess: + with self.assertRaises(errors.InvalidArgumentError): + sess.run(get_next_op) + + def testRemoteIteratorUsingRemoteCallOp(self): + worker_config = config_pb2.ConfigProto() + worker_config.device_count["CPU"] = 2 + worker, _ = test_util.create_local_cluster( + 1, 1, worker_config=worker_config) + + with ops.device("/job:worker/replica:0/task:0/cpu:1"): + dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3]) + iterator_3 = dataset_3.make_one_shot_iterator() + iterator_3_handle = iterator_3.string_handle() + + @function.Defun(dtypes.string) + def _remote_fn(h): + remote_iterator = dataset_ops.Iterator.from_string_handle( + h, dataset_3.output_types, dataset_3.output_shapes) + return remote_iterator.get_next() + + with ops.device("/job:worker/replica:0/task:0/cpu:0"): + target_placeholder = array_ops.placeholder(dtypes.string, shape=[]) + remote_op = functional_ops.remote_call( + args=[iterator_3_handle], + Tout=[dtypes.int32], + f=_remote_fn, + target=target_placeholder) + + with session.Session(worker[0].target) as sess: + elem = sess.run( + remote_op, + feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"}) + self.assertEqual(elem, [1]) + # Fails when target is cpu:0 where the resource is not located. 
+ with self.assertRaises(errors.InvalidArgumentError): + sess.run( + remote_op, + feed_dict={ + target_placeholder: "/job:worker/replica:0/task:0/cpu:0" + }) + elem = sess.run( + remote_op, + feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"}) + self.assertEqual(elem, [2]) + elem = sess.run( + remote_op, + feed_dict={target_placeholder: "/job:worker/replica:0/task:0/cpu:1"}) + self.assertEqual(elem, [3]) + with self.assertRaises(errors.OutOfRangeError): + sess.run( + remote_op, + feed_dict={ + target_placeholder: "/job:worker/replica:0/task:0/cpu:1" + }) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py index 30f685842b0..b20742f7758 100644 --- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py @@ -25,8 +25,10 @@ from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -416,6 +418,62 @@ class IteratorTest(test.TestCase): feedable_int_vector.get_next(), feed_dict={handle_placeholder: handle_float_vector})) + def testRemoteIteratorUsingRemoteCallOpDirectSession(self): + worker_config = config_pb2.ConfigProto() + worker_config.device_count["CPU"] = 2 + + with ops.device("/job:localhost/replica:0/task:0/cpu:1"): + dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3]) + iterator_3 = dataset_3.make_one_shot_iterator() + iterator_3_handle = iterator_3.string_handle() + + @function.Defun(dtypes.string) + def _remote_fn(h): + remote_iterator = dataset_ops.Iterator.from_string_handle( + h, dataset_3.output_types, dataset_3.output_shapes) + return remote_iterator.get_next() + + with ops.device("/job:localhost/replica:0/task:0/cpu:0"): + target_placeholder = array_ops.placeholder(dtypes.string, shape=[]) + remote_op = functional_ops.remote_call( + args=[iterator_3_handle], + Tout=[dtypes.int32], + f=_remote_fn, + target=target_placeholder) + + with self.test_session(config=worker_config) as sess: + elem = sess.run( + remote_op, + feed_dict={ + target_placeholder: "/job:localhost/replica:0/task:0/cpu:1" + }) + self.assertEqual(elem, [1]) + # Fails when target is cpu:0 where the resource is not located. 
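+    # remote_call executes _remote_fn on the device named by `target`. The
+    # iterator resource lives on cpu:1, so a cpu:0 target cannot find the
+    # resource and the lookup fails.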
+ with self.assertRaises(errors.InvalidArgumentError): + sess.run( + remote_op, + feed_dict={ + target_placeholder: "/job:localhost/replica:0/task:0/cpu:0" + }) + elem = sess.run( + remote_op, + feed_dict={ + target_placeholder: "/job:localhost/replica:0/task:0/cpu:1" + }) + self.assertEqual(elem, [2]) + elem = sess.run( + remote_op, + feed_dict={ + target_placeholder: "/job:localhost/replica:0/task:0/cpu:1" + }) + self.assertEqual(elem, [3]) + with self.assertRaises(errors.OutOfRangeError): + sess.run( + remote_op, + feed_dict={ + target_placeholder: "/job:localhost/replica:0/task:0/cpu:1" + }) + if __name__ == "__main__": test.main() diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 8322f0a8975..f7b79e82e16 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -811,12 +811,14 @@ cc_library( name = "testlib", testonly = 1, srcs = [ + "common_runtime/function_testlib.cc", "common_runtime/kernel_benchmark_testlib.cc", "framework/fake_input.cc", "framework/function_testlib.cc", "graph/testlib.cc", ], hdrs = [ + "common_runtime/function_testlib.h", "common_runtime/kernel_benchmark_testlib.h", "framework/fake_input.h", "framework/function_testlib.h", @@ -2661,17 +2663,14 @@ tf_cc_test( ":test_main", ":testlib", "//tensorflow/cc:cc_ops", - "//tensorflow/core/kernels:control_flow_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:functional_ops", + "//tensorflow/core/kernels:cast_op", "//tensorflow/core/kernels:cwise_op", - "//tensorflow/core/kernels:dense_update_ops", - "//tensorflow/core/kernels:fifo_queue_op", "//tensorflow/core/kernels:function_ops", - "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:matmul_op", - "//tensorflow/core/kernels:ops_util", - "//tensorflow/core/kernels:queue_ops", - "//tensorflow/core/kernels:session_ops", - "//tensorflow/core/kernels:variable_ops", + "//tensorflow/core/kernels:shape_ops", "//third_party/eigen3", ], ) diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 6b529d8f133..4b239606a84 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -139,15 +139,14 @@ static Node* AddRet(Graph* g, Endpoint input, int index) { return ret; } -static const FunctionLibraryRuntime::Handle kInvalidHandle = -1; - class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { public: FunctionLibraryRuntimeImpl(const DeviceMgr* dmgr, Env* env, Device* device, int graph_def_version, const FunctionLibraryDefinition* lib_def, const OptimizerOptions& optimizer_options, - CustomKernelCreator custom_kernel_creator); + CustomKernelCreator custom_kernel_creator, + ProcessFunctionLibraryRuntime* parent); ~FunctionLibraryRuntimeImpl() override; @@ -184,17 +183,13 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { const FunctionLibraryDefinition* const lib_def_; GraphOptimizer optimizer_; const CustomKernelCreator custom_kernel_creator_; + const string device_name_; std::function get_func_sig_; std::function create_kernel_; mutable mutex mu_; - // Maps function instantiation to a handle. The key is a - // canonicalized representation of the function name and - // instantiation attrs. The handle is an index into the items_. - std::unordered_map table_ GUARDED_BY(mu_); - // func_graphs_ never shrinks or reorders its members. 
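  // (With this change the canonicalized-key -> Handle table lives in the
  // parent ProcessFunctionLibraryRuntime; func_graphs_ and items_ are
  // indexed by this runtime's per-device LocalHandles.)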
std::vector func_graphs_ GUARDED_BY(mu_); @@ -208,12 +203,16 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { }; std::vector items_; + ProcessFunctionLibraryRuntime* parent_ = nullptr; // not owned. + Status FunctionDefToBody(const FunctionDef& fdef, AttrSlice attrs, FunctionBody** fbody); Status CreateItem(Handle handle, Item** item); Status GetOrCreateItem(Handle handle, Item** item); Status InstantiateSymbolicGradient(const NameAttrList& func, FunctionBody** g_body); + bool IsLocalTarget(const AttrSlice& attrs); + AttrValueMap FixAttrs(const AttrSlice& attrs); TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryRuntimeImpl); }; @@ -222,14 +221,19 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl( const DeviceMgr* dmgr, Env* env, Device* device, int graph_def_version, const FunctionLibraryDefinition* lib_def, const OptimizerOptions& optimizer_options, - CustomKernelCreator custom_kernel_creator) + CustomKernelCreator custom_kernel_creator, + ProcessFunctionLibraryRuntime* parent) : device_mgr_(dmgr), device_(device), env_(env), graph_def_version_(graph_def_version), lib_def_(lib_def), optimizer_(optimizer_options), - custom_kernel_creator_(std::move(custom_kernel_creator)) { + custom_kernel_creator_(std::move(custom_kernel_creator)), + device_name_(device_ == nullptr + ? ProcessFunctionLibraryRuntime::kDefaultFLRDevice + : device_->name()), + parent_(parent) { get_func_sig_ = [this](const string& op, const OpDef** sig) { return lib_def_->LookUpOpDef(op, sig); }; @@ -294,10 +298,17 @@ class CallOp : public AsyncOpKernel { }; const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) { + LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, h); + if (local_handle == kInvalidLocalHandle) { + LOG(ERROR) << "Could not find Handle: " << h + << " on device: " << device_name_; + return nullptr; + } + mutex_lock l(mu_); - CHECK_LE(static_cast(0), h); - CHECK_LT(h, func_graphs_.size()); - return func_graphs_[h]; + CHECK_LE(0, local_handle); + CHECK_LT(local_handle, func_graphs_.size()); + return func_graphs_[local_handle]; } Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, @@ -393,22 +404,47 @@ Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient( return Status::OK(); } +bool FunctionLibraryRuntimeImpl::IsLocalTarget(const AttrSlice& attrs) { + if (device_ == nullptr) return true; + string target = ProcessFunctionLibraryRuntime::ObtainFunctionTarget(attrs); + if (target.empty()) return true; + return target == device_->name(); +} + +AttrValueMap FunctionLibraryRuntimeImpl::FixAttrs(const AttrSlice& attrs) { + AttrValueMap value_map; + for (auto it : attrs) { + value_map[it.first] = it.second; + } + if (attrs.Find("_target") != nullptr) { + return value_map; + } + AttrValue v; + v.set_s(device_name_); + AddAttr("_target", v, &value_map); + return value_map; +} + Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name, AttrSlice attrs, Handle* handle) { - const string key = Canonicalize(function_name, attrs); - { - mutex_lock l(mu_); - *handle = gtl::FindWithDefault(table_, key, kInvalidHandle); - if (*handle != kInvalidHandle) { - return Status::OK(); - } + AttrValueMap value_map = FixAttrs(attrs); + AttrSlice new_attrs(&value_map); + + if (!IsLocalTarget(new_attrs)) { + return parent_->Instantiate(function_name, new_attrs, handle); + } + + const string key = Canonicalize(function_name, new_attrs); + *handle = parent_->GetHandle(key); + if (*handle != kInvalidHandle) { + return Status::OK(); } Status 
s; FunctionBody* fbody = nullptr; if (function_name == kGradientOp) { - const AttrValue* f = attrs.Find(kFuncAttr); + const AttrValue* f = new_attrs.Find(kFuncAttr); if (f == nullptr) { return errors::InvalidArgument("SymbolicGradient is missing attr: f"); } @@ -426,17 +462,16 @@ Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name, if (fdef == nullptr) { return errors::NotFound("Function ", function_name, " is not defined."); } - TF_RETURN_IF_ERROR(FunctionDefToBody(*fdef, attrs, &fbody)); + TF_RETURN_IF_ERROR(FunctionDefToBody(*fdef, new_attrs, &fbody)); } { mutex_lock l(mu_); - *handle = gtl::FindWithDefault(table_, key, kInvalidHandle); + *handle = parent_->GetHandle(key); if (*handle != kInvalidHandle) { delete fbody; } else { - *handle = func_graphs_.size(); - table_.insert({key, *handle}); + *handle = parent_->AddHandle(key, device_name_, func_graphs_.size()); func_graphs_.push_back(fbody); items_.resize(func_graphs_.size()); } @@ -494,13 +529,14 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) { } Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) { + LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle); { mutex_lock l(mu_); - if (handle >= items_.size()) { + if (local_handle >= items_.size()) { return errors::NotFound("Function handle ", handle, " is not valid. Likely an internal error."); } - *item = items_[handle]; + *item = items_[local_handle]; if (*item != nullptr) { (*item)->Ref(); return Status::OK(); @@ -512,9 +548,9 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) { { mutex_lock l(mu_); - if (items_[handle] == nullptr) { + if (items_[local_handle] == nullptr) { // Install *item in items_. - items_[handle] = *item; + items_[local_handle] = *item; (*item)->Ref(); } } @@ -528,6 +564,9 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, if (opts.cancellation_manager && opts.cancellation_manager->IsCancelled()) { return done(errors::Cancelled("")); } + if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) { + return parent_->Run(opts, handle, args, rets, done); + } const FunctionBody* fbody = GetFunctionBody(handle); FunctionCallFrame* frame = new FunctionCallFrame(fbody->arg_types, fbody->ret_types); @@ -616,19 +655,21 @@ std::unique_ptr NewFunctionLibraryRuntime( const DeviceMgr* device_mgr, Env* env, Device* device, int graph_def_version, const FunctionLibraryDefinition* lib_def, const OptimizerOptions& optimizer_options, - CustomKernelCreator custom_kernel_creator) { + CustomKernelCreator custom_kernel_creator, + ProcessFunctionLibraryRuntime* parent) { return std::unique_ptr(new FunctionLibraryRuntimeImpl( device_mgr, env, device, graph_def_version, lib_def, optimizer_options, - std::move(custom_kernel_creator))); + std::move(custom_kernel_creator), parent)); } std::unique_ptr NewFunctionLibraryRuntime( const DeviceMgr* device_mgr, Env* env, Device* device, int graph_def_version, const FunctionLibraryDefinition* lib_def, - const OptimizerOptions& optimizer_options) { + const OptimizerOptions& optimizer_options, + ProcessFunctionLibraryRuntime* parent) { return NewFunctionLibraryRuntime(device_mgr, env, device, graph_def_version, lib_def, optimizer_options, - GetCustomCreatorSingleton()->Get()); + GetCustomCreatorSingleton()->Get(), parent); } bool RemoveDeadNodes(Graph* g) { diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h index 167f0955970..477340d87a3 100644 
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -36,9 +37,6 @@ static constexpr const char* const kNoInlineAttr = "_noinline";
 // takes ownership of the returned OpKernel.
 //
 // TODO(zhifengc/phawkins): b/32379046
-typedef std::function<Status(FunctionLibraryRuntime*, const NodeDef&,
-                             std::unique_ptr<OpKernel>*)>
-    CustomKernelCreator;
 void RegisterDefaultCustomKernelCreator(CustomKernelCreator cb);
 
 // Creates a FunctionLibraryRuntime, which instantiates functions
@@ -50,11 +48,16 @@ void RegisterDefaultCustomKernelCreator(CustomKernelCreator cb);
 // The returned object does not take ownerships of "device" or
 // "lib_def". The caller must ensure "device" and "lib_def" outlives
 // the returned object.
+//
+// The "parent" is a pointer to the ProcessFunctionLibraryRuntime object that
+// typically owns the created FunctionLibraryRuntime object. The parent pointer
+// is not owned by the FunctionLibraryRuntime object.
 std::unique_ptr<FunctionLibraryRuntime> NewFunctionLibraryRuntime(
     const DeviceMgr* device_mgr, Env* env, Device* device,
     int graph_def_version, const FunctionLibraryDefinition* lib_def,
     const OptimizerOptions& optimizer_options,
-    CustomKernelCreator custom_kernel_creator);
+    CustomKernelCreator custom_kernel_creator,
+    ProcessFunctionLibraryRuntime* parent);
 
 // Same as above except that the returned runtime consults with the
 // global default custom kernel creator registered by
@@ -62,7 +65,8 @@ std::unique_ptr<FunctionLibraryRuntime> NewFunctionLibraryRuntime(
     const DeviceMgr* device_mgr, Env* env, Device* device,
     int graph_def_version, const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options);
+    const OptimizerOptions& optimizer_options,
+    ProcessFunctionLibraryRuntime* parent);
 
 // FunctionLibraryRuntime::GetFunctionBody returns a description of an
 // instantiated function that is represented as a Graph with arg/ret
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 3ca4457b00c..a9f06c4df03 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 
 #include <atomic>
+#include <utility>
 
 #include "tensorflow/cc/ops/array_ops_internal.h"
 #include "tensorflow/cc/ops/function_ops.h"
@@ -24,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/executor.h"
+#include "tensorflow/core/common_runtime/function_testlib.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op.h"
@@ -34,7 +36,6 @@ limitations under the License.
#include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" @@ -49,40 +50,18 @@ Status GetOpSig(const string& op, const OpDef** sig) { return OpRegistry::Global()->LookUpOpDef(op, sig); } -void FunctionTestSchedClosure(std::function fn) { - static thread::ThreadPool* w = - new thread::ThreadPool(Env::Default(), "Test", 8); - w->Schedule(std::move(fn)); -} - void HasError(const Status& s, const string& substr) { EXPECT_TRUE(StringPiece(s.ToString()).contains(substr)) << s << ", expected substring " << substr; } -// A helper class to make AttrSlice from initializer lists -class Attrs { - public: - Attrs(const std::initializer_list< // NOLINT(runtime/explicit) - std::pair>& attrs) { - for (const auto& aval : attrs) { - map_.insert({aval.first, aval.second.proto}); - } - } - - operator AttrSlice() { return AttrSlice(&map_); } // NOLINT(runtime/explicit) - - private: - AttrValueMap map_; -}; - class FunctionTest : public ::testing::Test { protected: FunctionTest() : device_(DeviceFactory::NewDevice("CPU", {}, "/job:localhost/replica:0/task:0")) {} - void Create(const FunctionDef& fdef, Attrs attrs) { + void Create(const FunctionDef& fdef, test::function::Attrs attrs) { exec_ = nullptr; InstantiationResult result; TF_CHECK_OK(InstantiateFunction(fdef, attrs, GetOpSig, &result)); @@ -117,7 +96,7 @@ class FunctionTest : public ::testing::Test { TF_CHECK_OK(frame.SetArgs(args)); Executor::Args exec_args; exec_args.call_frame = &frame; - exec_args.runner = FunctionTestSchedClosure; + exec_args.runner = test::function::FunctionTestSchedClosure; TF_CHECK_OK(exec_->Run(exec_args)); std::vector computed; TF_CHECK_OK(frame.GetRetvals(&computed)); @@ -154,41 +133,42 @@ TEST_F(FunctionTest, WXPlusB) { class FunctionLibraryRuntimeTest : public ::testing::Test { protected: - FunctionLibraryRuntimeTest() - : device_(DeviceFactory::NewDevice("CPU", {}, - "/job:localhost/replica:0/task:0")) {} - void Init(const std::vector& flib) { + SessionOptions options; + auto* device_count = options.config.mutable_device_count(); + device_count->insert({"CPU", 3}); + TF_CHECK_OK(DeviceFactory::AddDevices( + options, "/job:localhost/replica:0/task:0", &devices_)); + FunctionDefLibrary proto; for (const auto& fdef : flib) *(proto.add_function()) = fdef; lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto)); OptimizerOptions opts; - lib_ = - NewFunctionLibraryRuntime(nullptr, Env::Default(), device_.get(), - TF_GRAPH_DEF_VERSION, lib_def_.get(), opts); + device_mgr_.reset(new DeviceMgr(devices_)); + pflr_.reset(new ProcessFunctionLibraryRuntime( + device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(), + opts)); + flr0_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0"); + flr1_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:1"); + flr2_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:2"); fdef_lib_ = lib_def_->ToProto(); } - Status Run(const string& name, Attrs attrs, const std::vector& args, - std::vector rets) { - FunctionLibraryRuntime::Handle handle; - Status status = lib_->Instantiate(name, attrs, &handle); - if (!status.ok()) { - return status; - } - + Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle, + const std::vector& args, std::vector rets) { std::atomic 
call_count(0); std::function)> runner = [&call_count](std::function fn) { ++call_count; - FunctionTestSchedClosure(fn); + test::function::FunctionTestSchedClosure(fn); }; Notification done; FunctionLibraryRuntime::Options opts; opts.runner = &runner; std::vector out; - lib_->Run(opts, handle, args, &out, [&status, &done](const Status& s) { + Status status; + flr->Run(opts, handle, args, &out, [&status, &done](const Status& s) { status = s; done.Notify(); }); @@ -206,28 +186,54 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { return Status::OK(); } - std::unique_ptr GetFuncBody(const string& name, Attrs attrs) { + Status Instantiate(FunctionLibraryRuntime* flr, const string& name, + test::function::Attrs attrs, + FunctionLibraryRuntime::Handle* handle) { + Status status = flr->Instantiate(name, attrs, handle); + if (!status.ok()) { + return status; + } + return Status::OK(); + } + + Status InstantiateAndRun(FunctionLibraryRuntime* flr, const string& name, + test::function::Attrs attrs, + const std::vector& args, + std::vector rets) { FunctionLibraryRuntime::Handle handle; - Status status = lib_->Instantiate(name, attrs, &handle); + Status status = flr->Instantiate(name, attrs, &handle); + if (!status.ok()) { + return status; + } + return Run(flr, handle, args, std::move(rets)); + } + + std::unique_ptr GetFuncBody(FunctionLibraryRuntime* flr, + const string& name, + test::function::Attrs attrs) { + FunctionLibraryRuntime::Handle handle; + Status status = flr->Instantiate(name, attrs, &handle); if (!status.ok()) { LOG(ERROR) << status; return nullptr; } - const FunctionBody* fbody = lib_->GetFunctionBody(handle); + const FunctionBody* fbody = flr->GetFunctionBody(handle); CHECK_NOTNULL(fbody); std::unique_ptr ret(new Graph(lib_def_.get())); CopyGraph(*fbody->graph, ret.get()); return ret; } - std::unique_ptr GetGradBody(const string& func, Attrs attrs) { + std::unique_ptr GetGradBody(FunctionLibraryRuntime* flr, + const string& func, + test::function::Attrs attrs) { FunctionLibraryRuntime::Handle handle; - Status status = lib_->Instantiate(func, attrs, &handle); + Status status = flr->Instantiate(func, attrs, &handle); if (!status.ok()) { LOG(ERROR) << status; return nullptr; } - const FunctionBody* fbody = lib_->GetFunctionBody(handle); + const FunctionBody* fbody = flr->GetFunctionBody(handle); CHECK_NOTNULL(fbody); std::unique_ptr gbody(SymbolicGradient(*fbody)); CHECK_NOTNULL(gbody); @@ -236,24 +242,29 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { return ret; } - std::unique_ptr device_; + FunctionLibraryRuntime* flr0_; + FunctionLibraryRuntime* flr1_; + FunctionLibraryRuntime* flr2_; + std::vector devices_; + std::unique_ptr device_mgr_; std::unique_ptr lib_def_; - std::unique_ptr lib_; + std::unique_ptr pflr_; FunctionDefLibrary fdef_lib_; }; TEST_F(FunctionLibraryRuntimeTest, IsStateful) { Init({}); - EXPECT_TRUE(lib_->IsStateful("Variable")); - EXPECT_TRUE(lib_->IsStateful("VariableV2")); - EXPECT_FALSE(lib_->IsStateful("Matmul")); + EXPECT_TRUE(flr0_->IsStateful("Variable")); + EXPECT_TRUE(flr0_->IsStateful("VariableV2")); + EXPECT_FALSE(flr0_->IsStateful("Matmul")); } TEST_F(FunctionLibraryRuntimeTest, XTimesTwo) { Init({test::function::XTimesTwo()}); auto x = test::AsTensor({1, 2, 3, 4}); Tensor y; - TF_CHECK_OK(Run("XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y})); + TF_CHECK_OK( + InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y})); test::ExpectTensorEqual(y, test::AsTensor({2, 4, 6, 8})); } @@ -262,11 +273,14 @@ 
TEST_F(FunctionLibraryRuntimeTest, XTimesN) { test::function::XTimes16()}); auto x = test::AsTensor({1, 2, 3, 4}); Tensor y; - TF_CHECK_OK(Run("XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y})); + TF_CHECK_OK( + InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y})); test::ExpectTensorEqual(y, test::AsTensor({2, 4, 6, 8})); - TF_CHECK_OK(Run("XTimesFour", {{"T", DT_FLOAT}}, {x}, {&y})); + TF_CHECK_OK( + InstantiateAndRun(flr0_, "XTimesFour", {{"T", DT_FLOAT}}, {x}, {&y})); test::ExpectTensorEqual(y, test::AsTensor({4, 8, 12, 16})); - TF_CHECK_OK(Run("XTimes16", {{"T", DT_FLOAT}}, {x}, {&y})); + TF_CHECK_OK( + InstantiateAndRun(flr0_, "XTimes16", {{"T", DT_FLOAT}}, {x}, {&y})); test::ExpectTensorEqual(y, test::AsTensor({16, 32, 48, 64})); } @@ -294,7 +308,7 @@ Output Call(Scope* scope, const string& op_name, const string& fn_name, TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) { Init({test::function::XTimesTwo(), test::function::XTimesFour(), test::function::XTimes16()}); - std::unique_ptr g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}}); + std::unique_ptr g = GetFuncBody(flr0_, "XTimes16", {{"T", DT_FLOAT}}); ASSERT_TRUE(g != nullptr); { @@ -312,7 +326,7 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) { TF_EXPECT_GRAPH_EQ(expected, actual); } - ExpandInlineFunctions(lib_.get(), g.get()); + ExpandInlineFunctions(flr0_, g.get()); { Scope s = Scope::NewRootScope(); TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_)); @@ -334,7 +348,7 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) { TF_EXPECT_GRAPH_EQ(expected, actual); } - ExpandInlineFunctions(lib_.get(), g.get()); + ExpandInlineFunctions(flr0_, g.get()); GraphDef e2; { Scope s = Scope::NewRootScope(); @@ -373,7 +387,7 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) { } // No further inlining. 
- ExpandInlineFunctions(lib_.get(), g.get()); + ExpandInlineFunctions(flr0_, g.get()); { GraphDef actual; g->ToGraphDef(&actual); @@ -425,7 +439,7 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) { TF_ASSERT_OK(s.ToGraph(g.get())); } - ExpandInlineFunctions(lib_.get(), g.get()); + ExpandInlineFunctions(flr0_, g.get()); { Scope s = Scope::NewRootScope(); TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_)); @@ -449,7 +463,7 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) { TF_EXPECT_GRAPH_EQ(expected, actual); } - ExpandInlineFunctions(lib_.get(), g.get()); + ExpandInlineFunctions(flr0_, g.get()); { Scope s = Scope::NewRootScope(); TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_)); @@ -495,10 +509,10 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) { TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) { Init({test::function::XTimesTwo(), test::function::XTimesFour(), test::function::XTimes16()}); - std::unique_ptr g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}}); + std::unique_ptr g = GetFuncBody(flr0_, "XTimes16", {{"T", DT_FLOAT}}); ASSERT_TRUE(g != nullptr); - ExpandInlineFunctions(lib_.get(), g.get()); - OptimizeGraph(lib_.get(), &g); + ExpandInlineFunctions(flr0_, g.get()); + OptimizeGraph(flr0_, &g); { Scope s = Scope::NewRootScope(); auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); @@ -541,9 +555,9 @@ TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) { // Return {{"o", "g:output"}}); Init({test::function::Swap(), func}); - std::unique_ptr g = GetFuncBody("ManySwapsNodeDef", {}); + std::unique_ptr g = GetFuncBody(flr0_, "ManySwapsNodeDef", {}); ASSERT_TRUE(g != nullptr); - OptimizeGraph(lib_.get(), &g); + OptimizeGraph(flr0_, &g); const char* e0 = R"P( (n3:float, n2:float) -> (n3:float) { } @@ -574,9 +588,9 @@ TEST_F(FunctionLibraryRuntimeTest, ControlDeps) { {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}}, {{"o", "o:z:0"}}); Init({test::function::Swap(), func}); - std::unique_ptr g = GetFuncBody("ManySwapsFirst", {}); + std::unique_ptr g = GetFuncBody(flr0_, "ManySwapsFirst", {}); ASSERT_TRUE(g != nullptr); - OptimizeGraph(lib_.get(), &g); + OptimizeGraph(flr0_, &g); // NOTE: We can remove func0, func1, func2, func9 with a control edge n8->n5. // But we don't have a pass doing that. @@ -609,7 +623,7 @@ TEST_F(FunctionLibraryRuntimeTest, Error_NotFound) { Init({test::function::XTimesTwo(), test::function::XTimesFour()}); auto x = test::AsTensor({1, 2, 3, 4}); Tensor y; - HasError(Run("Foo", {{"T", DT_FLOAT}}, {x}, {&y}), + HasError(InstantiateAndRun(flr0_, "Foo", {{"T", DT_FLOAT}}, {x}, {&y}), "Not found: Function Foo is not defined."); } @@ -632,25 +646,27 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) { // Instantiating "XTimesTwo" should fail. FunctionLibraryRuntime::Handle handle; - HasError(lib_->Instantiate("XTimesTwo", Attrs({{"T", DT_FLOAT}}), &handle), + HasError(flr0_->Instantiate( + "XTimesTwo", test::function::Attrs({{"T", DT_FLOAT}}), &handle), "Not found: type attr not found"); // But XTimesFour and XTimes16 instantiation should succeed. Only // when they run, they fail because XTimesTwo is bad. 
- TF_CHECK_OK( - lib_->Instantiate("XTimesFour", Attrs({{"T", DT_FLOAT}}), &handle)); - TF_CHECK_OK(lib_->Instantiate("XTimes16", Attrs({{"T", DT_FLOAT}}), &handle)); + TF_CHECK_OK(flr0_->Instantiate( + "XTimesFour", test::function::Attrs({{"T", DT_FLOAT}}), &handle)); + TF_CHECK_OK(flr0_->Instantiate( + "XTimes16", test::function::Attrs({{"T", DT_FLOAT}}), &handle)); auto x = test::AsTensor({1, 2, 3, 4}); Tensor y; - HasError(Run("XTimes16", {{"T", DT_FLOAT}}, {x}, {&y}), + HasError(InstantiateAndRun(flr0_, "XTimes16", {{"T", DT_FLOAT}}, {x}, {&y}), "type attr not found"); } TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) { Init({test::function::XTimesTwo(), test::function::XTimesFour(), test::function::XTimes16()}); - std::unique_ptr f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}}); + std::unique_ptr f = GetFuncBody(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}); { Scope s = Scope::NewRootScope(); auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); @@ -666,7 +682,7 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) { TF_EXPECT_GRAPH_EQ(expected, actual); } - std::unique_ptr g = GetGradBody("XTimesTwo", {{"T", DT_FLOAT}}); + std::unique_ptr g = GetGradBody(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}); { Scope s = Scope::NewRootScope(); @@ -690,7 +706,7 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) { TF_EXPECT_GRAPH_EQ(expected, actual); } - OptimizeGraph(lib_.get(), &g); + OptimizeGraph(flr0_, &g); { Scope s = Scope::NewRootScope(); @@ -726,7 +742,7 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_Add) { Init({}); auto T = DT_FLOAT; std::unique_ptr g = GetFuncBody( - "SymbolicGradient", {{"f", FDH::FunctionRef("Add", {{"T", T}})}}); + flr0_, "SymbolicGradient", {{"f", FDH::FunctionRef("Add", {{"T", T}})}}); { Scope s = Scope::NewRootScope(); auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); @@ -756,7 +772,7 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_Mul) { Init({}); auto T = DT_FLOAT; std::unique_ptr g = GetFuncBody( - "SymbolicGradient", {{"f", FDH::FunctionRef("Mul", {{"T", T}})}}); + flr0_, "SymbolicGradient", {{"f", FDH::FunctionRef("Mul", {{"T", T}})}}); { Scope s = Scope::NewRootScope(); auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); @@ -812,7 +828,7 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) { Init({test, grad}); - std::unique_ptr g = GetFuncBody("TestGrad", {}); + std::unique_ptr g = GetFuncBody(flr0_, "TestGrad", {}); ASSERT_TRUE(g != nullptr); { Scope s = Scope::NewRootScope(); @@ -836,7 +852,7 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) { TF_EXPECT_GRAPH_EQ(expected, actual); } - ExpandInlineFunctions(lib_.get(), g.get()); + ExpandInlineFunctions(flr0_, g.get()); { Scope s = Scope::NewRootScope(); auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); @@ -888,7 +904,7 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) { TF_EXPECT_GRAPH_EQ(expected, actual); } - OptimizeGraph(lib_.get(), &g); + OptimizeGraph(flr0_, &g); { Scope s = Scope::NewRootScope(); auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); @@ -939,6 +955,25 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) { } } +TEST_F(FunctionLibraryRuntimeTest, CrossDevice) { + Init({test::function::FindDevice()}); + FunctionLibraryRuntime::Handle handle; + TF_CHECK_OK(Instantiate( + flr0_, "FindDevice", + {{"_target", "/job:localhost/replica:0/task:0/cpu:1"}}, &handle)); + + Tensor y; + // Run on flr1_, flr2_ and make sure that the device it ran on was cpu:1. 
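+  // The handle was instantiated above with _target set to cpu:1, so Run()
+  // calls issued through flr1_ or flr2_ are delegated via the parent
+  // ProcessFunctionLibraryRuntime to the runtime that owns the function.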
+ TF_CHECK_OK(Run(flr1_, handle, {}, {&y})); + test::ExpectTensorEqual( + y, test::AsTensor({"/job:localhost/replica:0/task:0/cpu:1"}, + TensorShape({}))); + TF_CHECK_OK(Run(flr2_, handle, {}, {&y})); + test::ExpectTensorEqual( + y, test::AsTensor({"/job:localhost/replica:0/task:0/cpu:1"}, + TensorShape({}))); +} + namespace { bool DoNothing(Graph* g) { return false; } diff --git a/tensorflow/core/common_runtime/function_testlib.cc b/tensorflow/core/common_runtime/function_testlib.cc new file mode 100644 index 00000000000..64e59762a2a --- /dev/null +++ b/tensorflow/core/common_runtime/function_testlib.cc @@ -0,0 +1,58 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/common_runtime/function_testlib.h" + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace test { +namespace function { + +typedef FunctionDefHelper FDH; + +class FindDeviceOpKernel : public OpKernel { + public: + explicit FindDeviceOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {} + void Compute(OpKernelContext* ctx) override { + Tensor* device_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("device_name", TensorShape{}, + &device_tensor)); + device_tensor->scalar()() = + ctx->function_library()->device()->name(); + } +}; + +REGISTER_KERNEL_BUILDER(Name("FindDeviceOp").Device(tensorflow::DEVICE_CPU), + FindDeviceOpKernel); +REGISTER_OP("FindDeviceOp").Output("device_name: string"); + +FunctionDef FindDevice() { + return FDH::Define( + // Name + "FindDevice", + // Args + {}, + // Return values + {"device_name: string"}, + // Attr def + {}, + // Nodes + {{{"device_name"}, "FindDeviceOp", {}, {}}}); +} + +} // namespace function +} // namespace test +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/function_testlib.h b/tensorflow/core/common_runtime/function_testlib.h new file mode 100644 index 00000000000..6b93b188b71 --- /dev/null +++ b/tensorflow/core/common_runtime/function_testlib.h @@ -0,0 +1,31 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
+
+#include "tensorflow/core/framework/function.h"
+
+namespace tensorflow {
+namespace test {
+namespace function {
+
+// {} -> y:DT_STRING (device where this op runs).
+FunctionDef FindDevice();
+
+}  // namespace function
+}  // namespace test
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 97d891fa16a..0caec036252 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -14,21 +14,58 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 
+#include
+
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace tensorflow {
 
+const char ProcessFunctionLibraryRuntime::kDefaultFLRDevice[] = "null";
+
 ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const DeviceMgr* device_mgr, Env* env, int graph_def_version,
     const FunctionLibraryDefinition* lib_def,
     const OptimizerOptions& optimizer_options) {
-  if (!device_mgr) return;
+  if (device_mgr == nullptr) {
+    flr_map_[kDefaultFLRDevice] =
+        NewFunctionLibraryRuntime(nullptr, env, nullptr, graph_def_version,
+                                  lib_def, optimizer_options, this);
+    return;
+  }
+  for (Device* d : device_mgr->ListDevices()) {
+    flr_map_[d->name()] =
+        NewFunctionLibraryRuntime(device_mgr, env, d, graph_def_version,
+                                  lib_def, optimizer_options, this);
+  }
+}
+
+ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
+    const DeviceMgr* device_mgr, Env* env, int graph_def_version,
+    const FunctionLibraryDefinition* lib_def,
+    const OptimizerOptions& optimizer_options,
+    CustomKernelCreator custom_kernel_creator) {
+  if (device_mgr == nullptr) {
+    flr_map_[kDefaultFLRDevice] = NewFunctionLibraryRuntime(
+        nullptr, env, nullptr, graph_def_version, lib_def, optimizer_options,
+        custom_kernel_creator, this);
+    return;
+  }
   for (Device* d : device_mgr->ListDevices()) {
     flr_map_[d->name()] = NewFunctionLibraryRuntime(
-        device_mgr, env, d, graph_def_version, lib_def, optimizer_options);
+        device_mgr, env, d, graph_def_version, lib_def, optimizer_options,
+        custom_kernel_creator, this);
   }
 }
 
+string ProcessFunctionLibraryRuntime::ObtainFunctionTarget(
+    const AttrSlice& attrs) {
+  const AttrValue* value;
+  if (!attrs.Find("_target", &value).ok()) {
+    return "";
+  }
+  return value->s();
+}
+
 FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
     const string& device_name) {
   if (flr_map_.find(device_name) == flr_map_.end()) {
@@ -38,4 +75,70 @@ FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
   return flr_map_[device_name].get();
 }
 
+FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
+    const string& function_key, const string& device_name,
+    FunctionLibraryRuntime::LocalHandle local_handle) {
+  mutex_lock l(mu_);
+  FunctionLibraryRuntime::Handle h =
+      gtl::FindWithDefault(table_, function_key, kInvalidHandle);
+  if (h != kInvalidHandle) {
+    return h;
+  }
+  h = function_data_.size();
+  function_data_.emplace_back(device_name,
local_handle); + table_[function_key] = h; + return h; +} + +FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle( + const string& function_key) const { + mutex_lock l(mu_); + return gtl::FindWithDefault(table_, function_key, kInvalidHandle); +} + +bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice( + const string& device_name, FunctionLibraryRuntime::Handle handle) { + return GetHandleOnDevice(device_name, handle) != -1; +} + +FunctionLibraryRuntime::LocalHandle +ProcessFunctionLibraryRuntime::GetHandleOnDevice( + const string& device_name, FunctionLibraryRuntime::Handle handle) { + mutex_lock l(mu_); + std::pair p = + function_data_[handle]; + if (p.first != device_name) { + return kInvalidLocalHandle; + } + return p.second; +} + +Status ProcessFunctionLibraryRuntime::Instantiate( + const string& function_name, AttrSlice attrs, + FunctionLibraryRuntime::Handle* handle) { + string target = ObtainFunctionTarget(attrs); + + FunctionLibraryRuntime* flr = GetFLR(target); + if (flr != nullptr) { + return flr->Instantiate(function_name, attrs, handle); + } + return errors::InvalidArgument("Target: ", target, " is not supported"); +} + +void ProcessFunctionLibraryRuntime::Run( + const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, gtl::ArraySlice args, + std::vector* rets, FunctionLibraryRuntime::DoneCallback done) { + FunctionLibraryRuntime* flr = nullptr; + { + mutex_lock l(mu_); + std::pair p = + function_data_[handle]; + flr = GetFLR(p.first); + } + if (flr != nullptr) { + return flr->Run(opts, handle, args, rets, std::move(done)); + } +} + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h index 53b2223b28f..2259997005e 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.h +++ b/tensorflow/core/common_runtime/process_function_library_runtime.h @@ -24,7 +24,6 @@ limitations under the License. namespace tensorflow { // A class that stores all the FunctionLibraryRuntime objects, one per device. -// This class is not thread safe. class ProcessFunctionLibraryRuntime { public: // Creates FunctionLibraryRuntime objects for each device in the provided @@ -35,10 +34,64 @@ class ProcessFunctionLibraryRuntime { const FunctionLibraryDefinition* lib_def, const OptimizerOptions& optimizer_options); + ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env, + int graph_def_version, + const FunctionLibraryDefinition* lib_def, + const OptimizerOptions& optimizer_options, + CustomKernelCreator custom_kernel_creator); + + // Given a list of attrs on a function, extracts the "_target" attribute which + // indicates which device to run the function on. If it can't find the _target + // attribute, returns "". Canonicalizes the device name. + static string ObtainFunctionTarget(const AttrSlice& attrs); + + static const char kDefaultFLRDevice[]; // Returns the FunctionLibraryRuntime for the corresponding device_name. FunctionLibraryRuntime* GetFLR(const string& device_name); + // For a given canonicalized key signature of the function instantiated + // on device `device_name` and a `local_handle`, creates a handle and returns + // that value. Use core/common_runtime/framework/function.h::Canonicalize + // to canonicalize the function signature. 
+ FunctionLibraryRuntime::Handle AddHandle( + const string& function_key, const string& device_name, + FunctionLibraryRuntime::LocalHandle local_handle); + + // Returns a handle if found for the given key, else returns kInvalidHandle. + FunctionLibraryRuntime::Handle GetHandle(const string& function_key) const; + + // For the given handle instantiated on device `device_name` returns the local + // index of instantiation of that function. If the function was not + // instantiated on `device_name` returns kInvalidLocalHandle. + FunctionLibraryRuntime::LocalHandle GetHandleOnDevice( + const string& device_name, FunctionLibraryRuntime::Handle handle); + + // Returns true if function with handle `handle` was instantiated on device + // `device_name`. + bool IsInstantiatedOnDevice(const string& device_name, + FunctionLibraryRuntime::Handle handle); + + // Instantiates the function. See framework/function.h for more details. + // Allows for function_name to be instantiated on different devices + // as specified in attrs. + Status Instantiate(const string& function_name, AttrSlice attrs, + FunctionLibraryRuntime::Handle* handle); + + // Runs the function with given `handle`. Function could have been + // instantiated on any device. More details in framework/function.h + void Run(const FunctionLibraryRuntime::Options& opts, + FunctionLibraryRuntime::Handle handle, gtl::ArraySlice args, + std::vector* rets, + FunctionLibraryRuntime::DoneCallback done); + private: + mutable mutex mu_; + + // Holds all the function invocations here. + std::unordered_map table_ + GUARDED_BY(mu_); + std::vector> + function_data_ GUARDED_BY(mu_); std::unordered_map> flr_map_; }; diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc index d9a5cab88b9..1536aedde58 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc @@ -17,6 +17,9 @@ limitations under the License. 
 #include
 
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function_testlib.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -25,8 +28,8 @@ namespace tensorflow {
 namespace {
 
 class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
- public:
-  ProcessFunctionLibraryRuntimeTest() {
+ protected:
+  void Init(const std::vector<FunctionDef>& flib) {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 2});
@@ -34,6 +37,7 @@
                                            &devices_));
     device_mgr_.reset(new DeviceMgr(devices_));
     FunctionDefLibrary proto;
+    for (const auto& fdef : flib) *(proto.add_function()) = fdef;
    lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
     proc_flr_.reset(new ProcessFunctionLibraryRuntime(
@@ -41,7 +45,43 @@
         opts));
   }
 
- protected:
+  Status Run(const string& name, test::function::Attrs attrs,
+             const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
+    FunctionLibraryRuntime::Handle handle;
+    Status status = proc_flr_->Instantiate(name, attrs, &handle);
+    if (!status.ok()) {
+      return status;
+    }
+
+    std::atomic<int32> call_count(0);
+    std::function<void(std::function<void()>)> runner =
+        [&call_count](std::function<void()> fn) {
+          ++call_count;
+          test::function::FunctionTestSchedClosure(fn);
+        };
+
+    Notification done;
+    FunctionLibraryRuntime::Options opts;
+    opts.runner = &runner;
+    std::vector<Tensor> out;
+    proc_flr_->Run(opts, handle, args, &out, [&status, &done](const Status& s) {
+      status = s;
+      done.Notify();
+    });
+    done.WaitForNotification();
+    if (!status.ok()) {
+      return status;
+    }
+    CHECK_EQ(rets.size(), out.size());
+    for (size_t i = 0; i < rets.size(); ++i) {
+      *rets[i] = out[i];
+    }
+
+    EXPECT_GE(call_count, 1);  // Test runner is used.
+ + return Status::OK(); + } + std::vector devices_; std::unique_ptr device_mgr_; std::unique_ptr lib_def_; @@ -49,6 +89,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test { }; TEST_F(ProcessFunctionLibraryRuntimeTest, Basic) { + Init({}); FunctionLibraryRuntime* flr = proc_flr_->GetFLR("/job:a/replica:0/task:0/cpu:0"); EXPECT_NE(flr, nullptr); @@ -60,5 +101,87 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, Basic) { EXPECT_EQ(flr, nullptr); } +TEST_F(ProcessFunctionLibraryRuntimeTest, ObtainFunctionTarget) { + AttrSlice empty_attrs; + string target = + ProcessFunctionLibraryRuntime::ObtainFunctionTarget(empty_attrs); + EXPECT_EQ("", target); + + AttrValueMap attr_values; + AttrValue v; + v.set_s("/job:a/replica:0/task:0/cpu:1"); + AddAttr("_target", v, &attr_values); + AttrSlice attrs(&attr_values); + target = ProcessFunctionLibraryRuntime::ObtainFunctionTarget(attrs); + EXPECT_EQ("/job:a/replica:0/task:0/cpu:1", target); +} + +TEST_F(ProcessFunctionLibraryRuntimeTest, SingleCall) { + Init({test::function::XTimesTwo()}); + auto x = test::AsTensor({1, 2, 3, 4}); + Tensor y; + TF_CHECK_OK( + Run("XTimesTwo", + {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, {x}, + {&y})); + test::ExpectTensorEqual(y, test::AsTensor({2, 4, 6, 8})); +} + +TEST_F(ProcessFunctionLibraryRuntimeTest, SingleCallFindDevice) { + Init({test::function::FindDevice()}); + Tensor y; + TF_CHECK_OK(Run("FindDevice", {{"_target", "/job:a/replica:0/task:0/cpu:0"}}, + {}, {&y})); + test::ExpectTensorEqual( + y, test::AsTensor({"/job:a/replica:0/task:0/cpu:0"}, + TensorShape({}))); +} + +TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsSameDeviceXTimes) { + Init({test::function::XTimesTwo(), test::function::XTimesFour()}); + auto x = test::AsTensor({1, 2, 3, 4}); + Tensor y; + TF_CHECK_OK( + Run("XTimesTwo", + {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, {x}, + {&y})); + test::ExpectTensorEqual(y, test::AsTensor({2, 4, 6, 8})); + TF_CHECK_OK( + Run("XTimesFour", + {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, {x}, + {&y})); + test::ExpectTensorEqual(y, test::AsTensor({4, 8, 12, 16})); +} + +TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsSameDeviceFindDevice) { + Init({test::function::FindDevice()}); + Tensor y; + TF_CHECK_OK(Run("FindDevice", {{"_target", "/job:a/replica:0/task:0/cpu:1"}}, + {}, {&y})); + test::ExpectTensorEqual( + y, test::AsTensor({"/job:a/replica:0/task:0/cpu:1"}, + TensorShape({}))); + TF_CHECK_OK(Run("FindDevice", {{"_target", "/job:a/replica:0/task:0/cpu:1"}}, + {}, {&y})); + test::ExpectTensorEqual( + y, test::AsTensor({"/job:a/replica:0/task:0/cpu:1"}, + TensorShape({}))); +} + +TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsDiffDeviceFindDevice) { + Init({test::function::FindDevice()}); + Tensor y; + TF_CHECK_OK(Run("FindDevice", {{"_target", "/job:a/replica:0/task:0/cpu:0"}}, + {}, {&y})); + test::ExpectTensorEqual( + y, test::AsTensor({"/job:a/replica:0/task:0/cpu:0"}, + TensorShape({}))); + TF_CHECK_OK(Run("FindDevice", {{"_target", "/job:a/replica:0/task:0/cpu:1"}}, + {}, {&y})); + test::ExpectTensorEqual( + y, test::AsTensor({"/job:a/replica:0/task:0/cpu:1"}, + TensorShape({}))); +} + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 045976dd06a..717f0c85755 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -437,8 +437,16 @@ class FunctionLibraryRuntime { // 
Returns the graph version number. virtual int graph_def_version() = 0; + + typedef uint64 LocalHandle; }; +const FunctionLibraryRuntime::Handle kInvalidHandle = -1; +const FunctionLibraryRuntime::LocalHandle kInvalidLocalHandle = -1; +typedef std::function*)> + CustomKernelCreator; + // To register a gradient function for a builtin op, one should use // REGISTER_OP_GRADIENT(, ); // diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc index 4ee23226daa..e6ef8425fb0 100644 --- a/tensorflow/core/framework/function_testlib.cc +++ b/tensorflow/core/framework/function_testlib.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/public/version.h" namespace tensorflow { @@ -172,6 +173,12 @@ FunctionDef Swap() { {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}}); } +void FunctionTestSchedClosure(std::function fn) { + static thread::ThreadPool* w = + new thread::ThreadPool(Env::Default(), "Test", 8); + w->Schedule(std::move(fn)); +} + } // end namespace function } // end namespace test } // end namespace tensorflow diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h index 49e5b0c99d9..a742fe0ce7a 100644 --- a/tensorflow/core/framework/function_testlib.h +++ b/tensorflow/core/framework/function_testlib.h @@ -30,6 +30,22 @@ namespace tensorflow { namespace test { namespace function { +// A helper class to make AttrSlice from initializer lists +class Attrs { + public: + Attrs(const std::initializer_list< // NOLINT(runtime/explicit) + std::pair>& attrs) { + for (const auto& aval : attrs) { + map_.insert({aval.first, aval.second.proto}); + } + } + + operator AttrSlice() { return AttrSlice(&map_); } // NOLINT(runtime/explicit) + + private: + AttrValueMap map_; +}; + // Helper to construct a NodeDef. NodeDef NDef( const string& name, const string& op, gtl::ArraySlice inputs, @@ -62,6 +78,8 @@ FunctionDef NonZero(); // x:T, y:T -> y:T, x:T FunctionDef Swap(); +void FunctionTestSchedClosure(std::function fn); + } // end namespace function } // end namespace test } // end namespace tensorflow diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc index 61366514102..b740e8a999e 100644 --- a/tensorflow/core/grappler/grappler_item_builder.cc +++ b/tensorflow/core/grappler/grappler_item_builder.cc @@ -104,9 +104,11 @@ Status OptimizeGraph(const GraphDef& graph_def, GraphDef* output_graph_def, optimizer_opts->set_do_function_inlining(cfg.inline_functions); // Create the function library runtime. - std::unique_ptr flib(NewFunctionLibraryRuntime( - dvc_mgr.get(), env, devices[0], inlined_graph_def.versions().producer(), - &function_library, *optimizer_opts)); + std::unique_ptr pflr( + new ProcessFunctionLibraryRuntime(dvc_mgr.get(), env, + inlined_graph_def.versions().producer(), + &function_library, *optimizer_opts)); + FunctionLibraryRuntime* flr = pflr->GetFLR(devices[0]->name()); // Create the GraphOptimizer to optimize the graph def. GraphConstructorOptions graph_ctor_opts; @@ -122,8 +124,7 @@ Status OptimizeGraph(const GraphDef& graph_def, GraphDef* output_graph_def, // Optimize the graph. 
GraphOptimizer optimizer(*optimizer_opts); - optimizer.Optimize(flib.get(), env, devices[0], &graphptr, - /*shape_map=*/nullptr); + optimizer.Optimize(flr, env, devices[0], &graphptr, /*shape_map=*/nullptr); graphptr->ToGraphDef(output_graph_def); return Status::OK(); diff --git a/tensorflow/core/kernels/captured_function.cc b/tensorflow/core/kernels/captured_function.cc index eb52de6d85e..15e9680f262 100644 --- a/tensorflow/core/kernels/captured_function.cc +++ b/tensorflow/core/kernels/captured_function.cc @@ -40,9 +40,9 @@ Status CapturedFunction::Create( // NOTE(mrry): We need to assign a name to the device, and we choose // the same name as the calling context's device so that we do not // need to rewrite resource handles that are found in `captured_inputs`. - std::unique_ptr device(new ThreadPoolDevice( - SessionOptions(), ctx->device()->attributes().name(), Bytes(256 << 20), - DeviceLocality(), cpu_allocator())); + Device* device = + new ThreadPoolDevice(SessionOptions(), ctx->device()->attributes().name(), + Bytes(256 << 20), DeviceLocality(), cpu_allocator()); // TODO(mrry): Handle arbitrary resource types, which might require a // redesign (or opening up access to `ResourceMgr::DoLookup()` and @@ -82,20 +82,24 @@ Status CapturedFunction::Create( } #undef HANDLE_RESOURCE_TYPE + std::unique_ptr device_mgr(new DeviceMgr({device})); std::unique_ptr flib_def( new FunctionLibraryDefinition( *ctx->function_library()->GetFunctionLibraryDefinition())); - std::unique_ptr lib(NewFunctionLibraryRuntime( - nullptr /* device_mgr */, ctx->env(), device.get(), graph_def_version, - flib_def.get(), {} /* TODO(mrry): OptimizerOptions? */)); + std::unique_ptr pflr( + new ProcessFunctionLibraryRuntime( + device_mgr.get(), ctx->env(), graph_def_version, flib_def.get(), + {} /* TODO(mrry): OptimizerOptions? 
*/)); + + FunctionLibraryRuntime* lib = pflr->GetFLR(device->name()); FunctionLibraryRuntime::Handle f_handle; TF_RETURN_IF_ERROR( lib->Instantiate(func->name(), AttrSlice(&func->attr()), &f_handle)); out_function->reset(new CapturedFunction( - std::move(device), std::move(flib_def), std::move(lib), f_handle, - std::move(captured_inputs))); + device, std::move(device_mgr), std::move(flib_def), std::move(pflr), lib, + f_handle, std::move(captured_inputs))); return Status::OK(); } @@ -136,14 +140,16 @@ Status CapturedFunction::Run(FunctionLibraryRuntime::Options f_opts, } CapturedFunction::CapturedFunction( - std::unique_ptr device, + Device* device, std::unique_ptr device_mgr, std::unique_ptr flib_def, - std::unique_ptr lib, - FunctionLibraryRuntime::Handle f_handle, + std::unique_ptr pflr, + FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle, std::vector captured_inputs) - : device_(std::move(device)), + : device_(device), + device_mgr_(std::move(device_mgr)), flib_def_(std::move(flib_def)), - lib_(std::move(lib)), + pflr_(std::move(pflr)), + lib_(lib), f_handle_(f_handle), captured_inputs_(std::move(captured_inputs)) {} diff --git a/tensorflow/core/kernels/captured_function.h b/tensorflow/core/kernels/captured_function.h index e24bcb9d829..03679736f35 100644 --- a/tensorflow/core/kernels/captured_function.h +++ b/tensorflow/core/kernels/captured_function.h @@ -63,20 +63,23 @@ class CapturedFunction { gtl::ArraySlice args, std::vector* rets, const string& prefix); - Device* device() const { return device_.get(); } + const Device* device() const { return device_; } ResourceMgr* resource_manager() const { return device_->resource_manager(); } private: - CapturedFunction(std::unique_ptr device, + CapturedFunction(Device* device, std::unique_ptr device_mgr, std::unique_ptr flib_def, - std::unique_ptr lib, + std::unique_ptr pflr, + FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle, std::vector captured_inputs); - const std::unique_ptr device_; + Device* const device_; // owned by device_mgr_. + const std::unique_ptr device_mgr_; const std::unique_ptr flib_def_; - const std::unique_ptr lib_; + const std::unique_ptr pflr_; + FunctionLibraryRuntime* const lib_; // owned by pflr_. 
const FunctionLibraryRuntime::Handle f_handle_; const std::vector captured_inputs_; diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index b831b5bff55..a1dfd4c3d31 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -277,5 +277,67 @@ REGISTER_KERNEL_BUILDER(Name(kGradientOp).Device(DEVICE_GPU), REGISTER_KERNEL_BUILDER(Name(kGradientOp).Device(DEVICE_SYCL), SymbolicGradientOp); +#endif // TENSORFLOW_USE_SYCL + +class RemoteCallOp : public AsyncOpKernel { + public: + explicit RemoteCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_)); + } + + ~RemoteCallOp() override {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { + const Tensor* target; + OP_REQUIRES_OK_ASYNC(ctx, ctx->input("target", &target), done); + AttrValueMap attr_values = func_->attr(); + AttrValue v; + v.set_s(target->scalar()()); + AddAttr("_target", v, &attr_values); + + FunctionLibraryRuntime* lib = ctx->function_library(); + OP_REQUIRES_ASYNC(ctx, lib != nullptr, + errors::Internal("No function library is provided."), + done); + FunctionLibraryRuntime::Handle handle; + OP_REQUIRES_OK_ASYNC( + ctx, lib->Instantiate(func_->name(), AttrSlice(&attr_values), &handle), + done); + + OpInputList arguments; + OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done); + + FunctionLibraryRuntime::Options opts; + opts.step_id = ctx->step_id(); + opts.runner = ctx->runner(); + std::vector args; + args.reserve(arguments.size()); + for (const Tensor& argument : arguments) { + args.push_back(argument); + } + auto* rets = new std::vector; + lib->Run(opts, handle, args, rets, [rets, done, ctx](const Status& status) { + if (!status.ok()) { + ctx->SetStatus(status); + } + for (size_t i = 0; i < rets->size(); ++i) { + ctx->set_output(i, (*rets)[i]); + } + delete rets; + done(); + }); + } + + private: + string target_; + const NameAttrList* func_; + TF_DISALLOW_COPY_AND_ASSIGN(RemoteCallOp); +}; + +REGISTER_KERNEL_BUILDER(Name("RemoteCall").Device(DEVICE_CPU), RemoteCallOp); +REGISTER_KERNEL_BUILDER(Name("RemoteCall").Device(DEVICE_GPU), RemoteCallOp); +#if TENSORFLOW_USE_SYCL +REGISTER_KERNEL_BUILDER(Name("RemoteCall").Device(DEVICE_SYCL), RemoteCallOp); + #endif // TENSORFLOW_USE_SYCL } // namespace tensorflow diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc index d1f9e949425..5fd21ec88fa 100644 --- a/tensorflow/core/ops/functional_ops.cc +++ b/tensorflow/core/ops/functional_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -65,4 +66,22 @@ to x_i. (Needs some math expert to say the comment above better.) )doc"); +REGISTER_OP("RemoteCall") + .Input("target: string") + .Input("args: Tin") + .Output("output: Tout") + .Attr("Tin: list(type)") + .Attr("Tout: list(type)") + .Attr("f: func") + .SetShapeFn(shape_inference::UnknownShape) + .Doc(R"doc( +Runs function `f` on a remote device indicated by `target`. + +target: A fully specified device name where we want to run the function. +args: A list of arguments for the function. +output: A list of return values. +Tin: The type list for the arguments. 
+Tout: The type list for the return values. +f: The function to run remotely. +)doc"); } // end namespace tensorflow diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py index e0231c460e8..a7bedc7199c 100644 --- a/tensorflow/python/kernel_tests/functional_ops_test.py +++ b/tensorflow/python/kernel_tests/functional_ops_test.py @@ -20,10 +20,14 @@ from __future__ import print_function import numpy as np +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gradients_impl @@ -446,6 +450,56 @@ class FunctionalOpsTest(test.TestCase): sess.run([result, result_t, result_grad, result_t_grad], feed_dict={x: [[1.0, 2.0]]}) + def testRemoteFunction(self): + worker_config = config_pb2.ConfigProto() + worker_config.device_count["CPU"] = 2 + worker, _ = test_util.create_local_cluster( + 1, 1, worker_config=worker_config) + + @function.Defun(dtypes.int32, dtypes.int32) + def _remote_fn(a, b): + return math_ops.multiply(a, b) + + with ops.device("/job:ps/task:0"): + a = variables.Variable(2, dtype=dtypes.int32) + b = variables.Variable(3, dtype=dtypes.int32) + + with ops.device("/job:worker/replica:0/task:0/cpu:0"): + remote_op = functional_ops.remote_call( + args=[a, b], + Tout=[dtypes.int32], + f=_remote_fn, + target="/job:worker/replica:0/task:0/cpu:1") + + with session.Session(worker[0].target) as sess: + sess.run(variables.global_variables_initializer()) + mul = sess.run(remote_op) + self.assertEqual(mul, [6]) + + def testRemoteFunctionDirectSession(self): + worker_config = config_pb2.ConfigProto() + worker_config.device_count["CPU"] = 2 + + @function.Defun(dtypes.int32, dtypes.int32) + def _remote_fn(a, b): + return math_ops.multiply(a, b) + + with ops.device("/job:localhost/replica:0/task:0/cpu:0"): + a = variables.Variable(2, dtype=dtypes.int32) + b = variables.Variable(3, dtype=dtypes.int32) + + with ops.device("/job:localhost/replica:0/task:0/cpu:0"): + remote_op = functional_ops.remote_call( + args=[a, b], + Tout=[dtypes.int32], + f=_remote_fn, + target="/job:localhost/replica:0/task:0/cpu:1") + + with self.test_session(config=worker_config) as sess: + sess.run(variables.global_variables_initializer()) + mul = sess.run(remote_op) + self.assertEqual(mul, [6]) + if __name__ == "__main__": test.main() From e31346452d91c48fa9b3deff8df575ccbd7f877a Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Thu, 17 Aug 2017 17:26:07 -0700 Subject: [PATCH 05/70] TPUEstimator: Fix the outfeed thread join. 
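The hook joins its worker threads when the session ends; the old code joined
the infeed thread controller a second time and never joined the outfeed
thread controller. As a minimal, self-contained sketch of this bug class
(hypothetical names, not code from this change):

    import threading

    class TwoThreadHook(object):
      def __init__(self):
        self._infeed_thd = threading.Thread(target=lambda: None)
        self._outfeed_thd = threading.Thread(target=lambda: None)
        self._infeed_thd.start()
        self._outfeed_thd.start()

      def end(self):
        self._infeed_thd.join()
        # Joining self._infeed_thd again here would return immediately and
        # leave the outfeed thread unjoined; the fix joins the outfeed thread.
        self._outfeed_thd.join()

    TwoThreadHook().end()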
PiperOrigin-RevId: 165651781 --- tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index c7b84f952f9..3622dff29b9 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -283,7 +283,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): if self._dequeue_ops is not None: logging.info('Stop output thread controller') - self._infeed_thd_controller.join() + self._outfeed_thd_controller.join() logging.info('Shutdown TPU system.') session.run(self._finalize_op) From 641943fd71c6e42ff3d6c71af45199dea4895976 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 17 Aug 2017 17:35:54 -0700 Subject: [PATCH 06/70] Update ops-related pbtxt files. PiperOrigin-RevId: 165652758 --- .../core/ops/compat/ops_history.v1.pbtxt | 31 +++++++++++++++ tensorflow/core/ops/ops.pbtxt | 38 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index e76573ffdb1..6ff1a3fc038 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -19996,6 +19996,37 @@ op { } } } +op { + name: "RemoteCall" + input_arg { + name: "target" + type: DT_STRING + } + input_arg { + name: "args" + type_list_attr: "Tin" + } + output_arg { + name: "output" + type_list_attr: "Tout" + } + attr { + name: "Tin" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "Tout" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "f" + type: "func" + } +} op { name: "RemoteFusedGraphExecute" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 06eabdcdcd6..87cdc30fb1b 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -19607,6 +19607,44 @@ op { } summary: "Computes rectified linear gradients for a Relu operation." } +op { + name: "RemoteCall" + input_arg { + name: "target" + description: "A fully specified device name where we want to run the function." + type: DT_STRING + } + input_arg { + name: "args" + description: "A list of arguments for the function." + type_list_attr: "Tin" + } + output_arg { + name: "output" + description: "A list of return values." + type_list_attr: "Tout" + } + attr { + name: "Tin" + type: "list(type)" + description: "The type list for the arguments." + has_minimum: true + minimum: 1 + } + attr { + name: "Tout" + type: "list(type)" + description: "The type list for the return values." + has_minimum: true + minimum: 1 + } + attr { + name: "f" + type: "func" + description: "The function to run remotely." + } + summary: "Runs function `f` on a remote device indicated by `target`." +} op { name: "RemoteFusedGraphExecute" input_arg { From 465c408196210efcdeb792b72801fdec7b7db868 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 17 Aug 2017 17:44:32 -0700 Subject: [PATCH 07/70] Fix the shape information propagation for Enter op. 
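An Enter op with is_constant=True emits the same loop-invariant tensor on
every iteration, so its static shape can safely mirror its input's shape. A
non-constant Enter may carry a different value (and shape) on each iteration,
so its output shape must stay unknown. A short sketch of the resulting
behavior (TF 1.x internal API, mirroring the test added below):

    from tensorflow.python.ops import control_flow_ops
    from tensorflow.python.ops import variables

    v = variables.Variable([0.0, 0.0])
    # Constant Enter: shape [2] is propagated from the input.
    print(control_flow_ops.enter(v, "frame1", is_constant=True).shape)
    # Non-constant Enter: shape stays unknown.
    print(control_flow_ops.enter(v, "frame2", is_constant=False).shape)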
PiperOrigin-RevId: 165653579 --- tensorflow/core/ops/control_flow_ops.cc | 7 +++++++ .../python/kernel_tests/control_flow_ops_py_test.py | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index 9e39b396e1f..61089658d71 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -204,6 +204,13 @@ REGISTER_OP("Enter") auto* handle_data = c->input_handle_shapes_and_types(0); if (handle_data != nullptr) { c->set_output_handle_shapes_and_types(0, *handle_data); + } else { + // Otherwise, propagate shape if output is a constant. + bool is_constant; + TF_RETURN_IF_ERROR(c->GetAttr("is_constant", &is_constant)); + if (is_constant) { + c->set_output(0, c->input(0)); + } } return Status::OK(); diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index fdecea1dc10..a43fe71b9f3 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -179,6 +179,19 @@ class ControlFlowTest(test.TestCase): result = exit_op.eval() self.assertAllEqual(np.array([x * 5 for x in [1, 2, 3, 4, 5, 6]]), result) + def testEnterShapePropagation(self): + with self.test_session(): + v = variables.Variable([0.0, 0.0], dtype=dtypes.float32) + + # If is_constant=True, the shape information should be propagated. + enter_v_constant = control_flow_ops.enter(v, "frame1", is_constant=True) + self.assertEqual(enter_v_constant.shape, [2]) + + # Otherwise, the shape should be unknown. + enter_v_non_constant = control_flow_ops.enter(v, "frame2", + is_constant=False) + self.assertEqual(enter_v_non_constant.shape, None) + def testSwitchMergeIndexedSlices(self): with self.test_session(): values = constant_op.constant([1, 2, 3, 4, 5, 6]) From d7e425f0bd61676aa347a93a81d8e89bb5c1a1a1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 17 Aug 2017 17:48:29 -0700 Subject: [PATCH 08/70] Fix linear algebra benchmarks. 
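The benchmarks previously fed each matrix into the graph as a large constant,
presumably mixing input materialization into the measured time. Holding the
input in a Variable that is initialized once lets the timed runs exercise
only the linear-algebra op; the size lists are also extended and renamed from
`sizes` to `shapes`. The pattern in miniature (a sketch assuming TF 1.x; a
benchmark would loop over the final run):

    import numpy as np
    import tensorflow as tf

    n = 256
    data = np.ones((n, n), np.float32) / (2.0 * n) + np.eye(n, dtype=np.float32)
    matrix = tf.Variable(data)   # materialized once by the initializer
    chol = tf.cholesky(matrix)   # only this op is timed
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      sess.run(chol.op)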
PiperOrigin-RevId: 165653891 --- .../python/kernel_tests/cholesky_op_test.py | 105 +++++++++++------- .../kernel_tests/determinant_op_test.py | 37 +++--- .../kernel_tests/matrix_inverse_op_test.py | 41 ++++--- 3 files changed, 110 insertions(+), 73 deletions(-) diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index d783522e820..de80fb30554 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging @@ -81,8 +82,11 @@ def MatrixInverseCompositeGrad(l, grad): def TriAngInvCompositeGrad(l, grad): num_rows = array_ops.shape(l)[-1] batch_shape = array_ops.shape(l)[:-2] - l_inverse = linalg_ops.matrix_triangular_solve( - l, linalg_ops.eye(num_rows, batch_shape=batch_shape, dtype=l.dtype)) + l_inverse = linalg_ops.matrix_triangular_solve(l, + linalg_ops.eye( + num_rows, + batch_shape=batch_shape, + dtype=l.dtype)) return _GradWithInverseL(l, l_inverse, grad) @@ -281,75 +285,94 @@ class CholeskyGradTest(test.TestCase): class CholeskyBenchmark(test.Benchmark): - sizes = [ - (4, 4), (16, 16), (256, 256), (1024, 1024), (2048, 2048), - (513, 2, 2), (513, 8, 8), (4, 513, 2, 2) + shapes = [ + (4, 4), + (10, 10), + (16, 16), + (101, 101), + (256, 256), + (1000, 1000), + (1024, 1024), + (2048, 2048), + (513, 2, 2), + (513, 8, 8), + (513, 256, 256), + (4, 513, 2, 2), ] - def _GenerateData(self, size): - batch_shape = size[:-2] - size = size[-2:] - assert size[0] == size[1] - n = size[0] - data = np.ones(size).astype(np.float32) / (2.0 * n) + np.diag( - np.ones(n).astype(np.float32)) - return np.tile(data, batch_shape + (1, 1)) + def _GenerateMatrix(self, shape): + batch_shape = shape[:-2] + shape = shape[-2:] + assert shape[0] == shape[1] + n = shape[0] + matrix = np.ones(shape).astype(np.float32) / ( + 2.0 * n) + np.diag(np.ones(n).astype(np.float32)) + return np.tile(matrix, batch_shape + (1, 1)) def benchmarkCholeskyOp(self): - for size in self.sizes: - data = self._GenerateData(size) - + for shape in self.shapes: with ops.Graph().as_default(), \ session.Session() as sess, \ ops.device("/cpu:0"): - l = linalg_ops.cholesky(data) + matrix = variables.Variable(self._GenerateMatrix(shape)) + l = linalg_ops.cholesky(matrix) + variables.global_variables_initializer().run() self.run_op_benchmark( - sess, control_flow_ops.group(l,), + sess, + control_flow_ops.group( + l,), min_iters=25, - name="cholesky_cpu_{size}".format(size=size)) + name="cholesky_cpu_{shape}".format(shape=shape)) if test.is_gpu_available(True): with ops.Graph().as_default(), \ session.Session() as sess, \ ops.device("/device:GPU:0"): - l = linalg_ops.cholesky(data) + matrix = variables.Variable(self._GenerateMatrix(shape)) + l = linalg_ops.cholesky(matrix) + variables.global_variables_initializer().run() self.run_op_benchmark( sess, control_flow_ops.group( l,), min_iters=25, - name="cholesky_gpu_{size}".format(size=size)) + name="cholesky_gpu_{shape}".format(shape=shape)) def benchmarkGradVariants(self): + def _BenchmarkGrad(grad_fn, name, device): - for size in self.sizes: - data = self._GenerateData(size) - l = np.linalg.cholesky(data) - grad_data = 
np.random.randn(*data.shape).astype(np.float32) + for shape in self.shapes: + matrix = self._GenerateMatrix(shape) with ops.Graph().as_default(), \ session.Session() as sess, \ ops.device(device): - grad = grad_fn(l, grad_data) + l = variables.Variable(np.linalg.cholesky(matrix)) + grad_matrix = variables.Variable( + np.random.randn(*matrix.shape).astype(np.float32)) + grad = grad_fn(l, grad_matrix) + variables.global_variables_initializer().run() self.run_op_benchmark( - sess, control_flow_ops.group(grad,), + sess, + control_flow_ops.group( + grad,), min_iters=25, - name="{name}_{dev}_{size}".format( - name=name, dev=grad.device, size=size)) + name="{name}_{dev}_{shape}".format( + name=name, dev=grad.device, shape=shape)) if test.is_gpu_available(True): - _BenchmarkGrad( - MatrixInverseCompositeGrad, "composite_matrix_inverse", "/device:GPU:0") - _BenchmarkGrad( - TriAngInvCompositeGrad, "composite_tri_ang_inverse", "/device:GPU:0") - _BenchmarkGrad( - TriAngSolveCompositeGrad, "composite_triangular_solve", "/device:GPU:0") + _BenchmarkGrad(MatrixInverseCompositeGrad, "composite_matrix_inverse", + "/device:GPU:0") + _BenchmarkGrad(TriAngInvCompositeGrad, "composite_tri_ang_inverse", + "/device:GPU:0") + _BenchmarkGrad(TriAngSolveCompositeGrad, "composite_triangular_solve", + "/device:GPU:0") - _BenchmarkGrad( - MatrixInverseCompositeGrad, "composite_matrix_inverse", "/cpu:0") - _BenchmarkGrad( - TriAngInvCompositeGrad, "composite_tri_ang_inverse", "/cpu:0") - _BenchmarkGrad( - TriAngSolveCompositeGrad, "composite_triangular_solve", "/cpu:0") + _BenchmarkGrad(MatrixInverseCompositeGrad, "composite_matrix_inverse", + "/cpu:0") + _BenchmarkGrad(TriAngInvCompositeGrad, "composite_tri_ang_inverse", + "/cpu:0") + _BenchmarkGrad(TriAngSolveCompositeGrad, "composite_triangular_solve", + "/cpu:0") _BenchmarkGrad(SpecializedGrad, "specialized", "/cpu:0") diff --git a/tensorflow/python/kernel_tests/determinant_op_test.py b/tensorflow/python/kernel_tests/determinant_op_test.py index b9fc1104056..4f07322d61c 100644 --- a/tensorflow/python/kernel_tests/determinant_op_test.py +++ b/tensorflow/python/kernel_tests/determinant_op_test.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import variables from tensorflow.python.platform import test @@ -130,49 +131,55 @@ class DeterminantOpTest(test.TestCase): class MatrixDeterminantBenchmark(test.Benchmark): - sizes = [ + shapes = [ (4, 4), + (10, 10), (16, 16), + (101, 101), (256, 256), + (1000, 1000), (1024, 1024), + (2048, 2048), (513, 4, 4), (513, 16, 16), (513, 256, 256), ] - def _GenerateData(self, size): - batch_shape = size[:-2] - size = size[-2:] - assert size[0] == size[1] - n = size[0] - data = np.ones(size).astype(np.float32) / ( + def _GenerateMatrix(self, shape): + batch_shape = shape[:-2] + shape = shape[-2:] + assert shape[0] == shape[1] + n = shape[0] + matrix = np.ones(shape).astype(np.float32) / ( 2.0 * n) + np.diag(np.ones(n).astype(np.float32)) - return np.tile(data, batch_shape + (1, 1)) + return variables.Variable(np.tile(matrix, batch_shape + (1, 1))) def benchmarkMatrixDeterminantOp(self): - for size in self.sizes: - data = self._GenerateData(size) - + for shape in self.shapes: with ops.Graph().as_default(), session.Session() as sess, ops.device( "/cpu:0"): - d = linalg_ops.matrix_determinant(data) + matrix = self._GenerateMatrix(shape) + d = 
linalg_ops.matrix_determinant(matrix) + variables.global_variables_initializer().run() self.run_op_benchmark( sess, control_flow_ops.group( d,), min_iters=25, - name="matrix_determinant_cpu_{size}".format(size=size)) + name="matrix_determinant_cpu_{shape}".format(shape=shape)) if test.is_gpu_available(True): with ops.Graph().as_default(), session.Session() as sess, ops.device( "/gpu:0"): - d = linalg_ops.matrix_determinant(data) + matrix = self._GenerateMatrix(shape) + d = linalg_ops.matrix_determinant(matrix) + variables.global_variables_initializer().run() self.run_op_benchmark( sess, control_flow_ops.group( d,), min_iters=25, - name="matrix_determinant_gpu_{size}".format(size=size)) + name="matrix_determinant_gpu_{shape}".format(shape=shape)) if __name__ == "__main__": diff --git a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py index 601084c8307..7343a02c2cd 100644 --- a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py @@ -26,6 +26,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables from tensorflow.python.platform import test @@ -137,52 +138,58 @@ class InverseOpTest(test.TestCase): class MatrixInverseBenchmark(test.Benchmark): - sizes = [ + shapes = [ (4, 4), + (10, 10), (16, 16), + (101, 101), (256, 256), + (1000, 1000), (1024, 1024), + (2048, 2048), (513, 4, 4), (513, 16, 16), (513, 256, 256), ] - def _GenerateData(self, size): - batch_shape = size[:-2] - size = size[-2:] - assert size[0] == size[1] - n = size[0] - data = np.ones(size).astype(np.float32) / ( + def _GenerateMatrix(self, shape): + batch_shape = shape[:-2] + shape = shape[-2:] + assert shape[0] == shape[1] + n = shape[0] + matrix = np.ones(shape).astype(np.float32) / ( 2.0 * n) + np.diag(np.ones(n).astype(np.float32)) - return np.tile(data, batch_shape + (1, 1)) + return variables.Variable(np.tile(matrix, batch_shape + (1, 1))) def benchmarkMatrixInverseOp(self): for adjoint in False, True: - for size in self.sizes: - data = self._GenerateData(size) - + for shape in self.shapes: with ops.Graph().as_default(), \ session.Session() as sess, \ ops.device("/cpu:0"): - inv = linalg_ops.matrix_inverse(data, adjoint=adjoint) + matrix = self._GenerateMatrix(shape) + inv = linalg_ops.matrix_inverse(matrix, adjoint=adjoint) + variables.global_variables_initializer().run() self.run_op_benchmark( sess, control_flow_ops.group(inv), min_iters=25, - name="matrix_inverse_cpu_{size}_{adjoint}".format( - size=size, adjoint="adjoint" if adjoint else "noadjoint")) + name="matrix_inverse_cpu_{shape}_adjoint_{adjoint}".format( + shape=shape, adjoint=adjoint)) if test.is_gpu_available(True): with ops.Graph().as_default(), \ session.Session() as sess, \ ops.device("/gpu:0"): - inv = linalg_ops.matrix_inverse(data, adjoint=adjoint) + matrix = self._GenerateMatrix(shape) + inv = linalg_ops.matrix_inverse(matrix, adjoint=adjoint) + variables.global_variables_initializer().run() self.run_op_benchmark( sess, control_flow_ops.group(inv), min_iters=25, - name="matrix_inverse_gpu_{size}_{adjoint}".format( - size=size, adjoint="adjoint" if adjoint else "noadjoint")) + name="matrix_inverse_gpu_{shape}_adjoint_{adjoint}".format( + shape=shape, adjoint=adjoint)) if __name__ == "__main__": From 513def0bb27e4a7c29f6ff533d8ca150b2ab78b4 Mon Sep 
17 00:00:00 2001 From: Benoit Steiner Date: Thu, 17 Aug 2017 17:48:53 -0700 Subject: [PATCH 09/70] Fixed BuildOpInfoWithoutDevice PiperOrigin-RevId: 165653933 --- tensorflow/core/grappler/costs/BUILD | 20 ++- tensorflow/core/grappler/costs/utils.cc | 25 +--- tensorflow/core/grappler/costs/utils_test.cc | 150 +++++++++++++++++++ 3 files changed, 177 insertions(+), 18 deletions(-) create mode 100644 tensorflow/core/grappler/costs/utils_test.cc diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD index ea1990c0b19..f2c13d2b132 100644 --- a/tensorflow/core/grappler/costs/BUILD +++ b/tensorflow/core/grappler/costs/BUILD @@ -141,6 +141,24 @@ tf_cuda_library( ], ) +cc_test( + name = "utils_test", + srcs = ["utils_test.cc"], + visibility = ["//visibility:public"], + deps = [ + ":utils", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:all_kernels", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensor_testutil", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "cost_estimator", hdrs = ["cost_estimator.h"], @@ -170,7 +188,7 @@ cc_test( srcs = ["virtual_placer_test.cc"], deps = [ ":virtual_placer", - "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:core_cpu", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc index 4135d9b3313..db36f97500e 100644 --- a/tensorflow/core/grappler/costs/utils.cc +++ b/tensorflow/core/grappler/costs/utils.cc @@ -70,11 +70,12 @@ static std::vector ExtractTensors(const AttrValue& attr_value) { return tensors; } +// Annotate the op_info inputs with extra information when possible (e.g. the +// input value if it's known statically). static void ExtractExtraProperties( const NodeDef& node, const std::unordered_map& name_to_node, - std::vector* extra_inputs, - protobuf::Map* attr_map) { + OpInfo* op_info) { OpRegistry* op_registry = OpRegistry::Global(); const OpDef* op_def = nullptr; auto s = op_registry->LookUpOpDef(node.op(), &op_def); @@ -102,11 +103,8 @@ static void ExtractExtraProperties( if (tensors.empty()) continue; const TensorProto& t = tensors[0]; - OpInfo::TensorProperties input; - input.set_dtype(t.dtype()); - *(input.mutable_shape()) = t.tensor_shape(); - *(input.mutable_value()) = t; - extra_inputs->push_back(input); + OpInfo::TensorProperties* input = op_info->mutable_inputs(i); + *(input->mutable_value()) = t; // For filename input, the file size can also be useful. if (op_def && i < op_def->input_arg_size() && @@ -129,7 +127,7 @@ static void ExtractExtraProperties( AttrValue attr; attr.set_i(stat.length); string attr_key = strings::StrCat("input_", i, "_filesize"); - (*attr_map)[attr_key] = attr; + (*op_info->mutable_attr())[attr_key] = attr; } } @@ -140,7 +138,7 @@ static void ExtractExtraProperties( string new_key = strings::StrCat("parent_", i, "_op"); AttrValue attr; attr.set_s(input_node->op()); - (*attr_map)[new_key] = attr; + (*op_info->mutable_attr())[new_key] = attr; // TODO(yuefengz): Only parent node's op name is copied. Copy inputs // and attributes when necessary. 
} @@ -212,14 +210,7 @@ OpInfo BuildOpInfoWithoutDevice( for (auto& input : inputs) { *op_info.add_inputs() = input; } - - std::vector extra_inputs; - ExtractExtraProperties(node, name_to_node, &extra_inputs, - op_info.mutable_attr()); - for (auto& input : extra_inputs) { - *op_info.add_inputs() = input; - } - + ExtractExtraProperties(node, name_to_node, &op_info); return op_info; } diff --git a/tensorflow/core/grappler/costs/utils_test.cc b/tensorflow/core/grappler/costs/utils_test.cc new file mode 100644 index 00000000000..bdcb156c4e3 --- /dev/null +++ b/tensorflow/core/grappler/costs/utils_test.cc @@ -0,0 +1,150 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/grappler/costs/utils.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace grappler { + +class UtilsTest : public ::testing::Test { + public: + void CreateConstOp(const string& name, std::initializer_list dims, + NodeDef* node) { + Tensor tensor(DT_FLOAT, TensorShape(dims)); + for (int64 i = 0; i < tensor.NumElements(); ++i) { + tensor.flat()(i) = i / 10.0f; + } + TF_CHECK_OK(NodeDefBuilder(name, "Const") + .Attr("dtype", DT_FLOAT) + .Attr("value", tensor) + .Finalize(node)); + } + + void CreateConstSizesOp(const string& name, const std::vector& sizes, + NodeDef* node) { + TensorShape shape; + shape.AddDim(sizes.size()); + Tensor tensor(DT_INT32, shape); + for (int64 i = 0; i < tensor.NumElements(); ++i) { + tensor.flat()(i) = sizes[i]; + } + TF_CHECK_OK(NodeDefBuilder(name, "Const") + .Attr("dtype", DT_INT32) + .Attr("value", tensor) + .Finalize(node)); + } +}; + +TEST_F(UtilsTest, ConvOpInfo) { + int batch = 32; + int rows = 7; + int cols = 9; + int filter_rows = 3; + int filter_cols = 3; + int out_rows = 7; + int out_cols = 9; + int in_depth = 3; + int out_depth = 5; + int stride = 1; + + std::unordered_map name_to_node; + GraphDef graph; + NodeDef* input = graph.add_node(); + name_to_node["input"] = input; + CreateConstOp("input", {batch, rows, cols, in_depth}, input); + NodeDef* filter = graph.add_node(); + name_to_node["filter"] = filter; + CreateConstOp("filter", {filter_rows, filter_cols, in_depth, out_depth}, + filter); + NodeDef* output_backprop = graph.add_node(); + name_to_node["output_backprop"] = output_backprop; + CreateConstOp("output_backprop", {batch, out_rows, out_cols, out_depth}, + output_backprop); + NodeDef* input_sizes = graph.add_node(); + name_to_node["input_sizes"] = input; + CreateConstSizesOp("input_sizes", + std::vector({batch, rows, cols, in_depth}), + input_sizes); + 
NodeDef* filter_sizes = graph.add_node(); + name_to_node["filter_sizes"] = filter_sizes; + CreateConstSizesOp( + "filter_sizes", + std::vector({filter_rows, filter_cols, in_depth, out_depth}), + filter_sizes); + + TensorShape paddings_shape({4, 2}); + Tensor paddings_tensor(DT_INT32, paddings_shape); + for (int64 i = 0; i < paddings_tensor.NumElements(); ++i) { + paddings_tensor.flat()(i) = 0; + } + TF_CHECK_OK(NodeDefBuilder("paddings", "Const") + .Attr("dtype", DT_INT32) + .Attr("value", paddings_tensor) + .Finalize(graph.add_node())); + + // Now add the convolution op + NodeDef* conv = graph.add_node(); + TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2D") + .Input("input", 0, DT_FLOAT) + .Input("filter", 0, DT_FLOAT) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", "SAME") + .Finalize(conv)); + + NodeDef* conv_bp_in = graph.add_node(); + TF_CHECK_OK(NodeDefBuilder("conv2d_bp_in", "Conv2DBackpropInput") + .Input("input_sizes", 0, DT_INT32) + .Input("filter", 0, DT_FLOAT) + .Input("output_backprop", 0, DT_FLOAT) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", "SAME") + .Finalize(conv_bp_in)); + + NodeDef* conv_bp_filter = graph.add_node(); + TF_CHECK_OK(NodeDefBuilder("conv2d_bp_filter", "Conv2DBackpropFilter") + .Input("input", 0, DT_FLOAT) + .Input("filter_sizes", 0, DT_INT32) + .Input("output_backprop", 0, DT_FLOAT) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", "SAME") + .Finalize(conv_bp_filter)); + + for (const auto& node : graph.node()) { + if (node.name().find("conv2d") != 0) { + continue; + } + std::vector inputs; + inputs.resize(node.input_size()); + OpInfo info = BuildOpInfoWithoutDevice(node, name_to_node, inputs); + if (node.name() == "conv2d") { + EXPECT_EQ(2, info.inputs_size()); + } else if (node.name() == "conv2dbp_in") { + EXPECT_EQ(3, info.inputs_size()); + } else if (node.name() == "conv2d_bp_filter") { + EXPECT_EQ(3, info.inputs_size()); + } + } +} + +} // end namespace grappler +} // end namespace tensorflow From a1225879cdedae7f2de24030a9c072a516d97040 Mon Sep 17 00:00:00 2001 From: Chris Leary Date: Thu, 17 Aug 2017 17:55:08 -0700 Subject: [PATCH 10/70] [XLA] Propagate error code in computation replay tool. 
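Previously the tool always returned 0, so scripts and test harnesses could
not detect replay failures. The change accumulates any per-file failure into
the process exit status while still replaying the remaining inputs. The same
pattern in a standalone sketch (Python for brevity; `replay` is a
hypothetical stand-in for ReplayComputation):

    import sys

    def replay(path):
      with open(path, "rb"):  # fails loudly on unreadable inputs
        pass

    def main(paths):
      exit_status = 0
      for path in paths:
        try:
          replay(path)
        except OSError as e:
          print("%s: error: %s" % (path, e), file=sys.stderr)
          exit_status = 1  # remember the failure, but keep going
      return exit_status

    if __name__ == "__main__":
      sys.exit(main(sys.argv[1:]))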
PiperOrigin-RevId: 165654497
---
 tensorflow/compiler/xla/tools/replay_computation.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 6228ca34c08..735c66e2d3e 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -82,9 +82,10 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   return client->ExecuteAndTransfer(computation, execute_arguments);
 }

-void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool use_fake_data) {
+int RealMain(tensorflow::gtl::ArraySlice<char*> args, bool use_fake_data) {
   Client* client = ClientLibrary::LocalClientOrDie();
   tensorflow::Env* env = tensorflow::Env::Default();
+  int exit_status = EXIT_SUCCESS;
   for (char* arg : args) {
     SessionModule module;
     TF_CHECK_OK(tensorflow::ReadBinaryProto(env, arg, &module));
@@ -93,6 +94,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool use_fake_data) {
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", arg,
               result_status.status().ToString().c_str());
+      exit_status = EXIT_FAILURE;
       continue;
     }
     std::unique_ptr<Literal> result = result_status.ConsumeValueOrDie();
@@ -105,6 +107,7 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool use_fake_data) {
               Literal(module.result()).ToString().c_str());
     }
   }
+  return exit_status;
 }

 }  // namespace tools
@@ -126,6 +129,5 @@ int main(int argc, char** argv) {
   tensorflow::gtl::ArraySlice<char*> args(argv, argc);
   args.pop_front();  // Pop off the binary name, argv[0]

-  xla::tools::RealMain(args, use_fake_data);
-  return 0;
+  return xla::tools::RealMain(args, use_fake_data);
 }

From f0da8bf56ba1b625d53b760683bc44f67e204199 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 17 Aug 2017 17:56:51 -0700
Subject: [PATCH 11/70] [Rematerialization] Reconsider rematerializing
 operations with control dependencies

We added conservative logic to not rematerialize operations with control
dependencies, since the rematerialized operations could result in undesired
ordering. However, when we rematerialize an operation we now also copy its
control dependencies, which guarantees that the rematerialized operation is
subject to the same constraints as the original operation.

PiperOrigin-RevId: 165654629
---
 .../xla/service/hlo_rematerialization.cc      | 41 ++++++++++++++-----
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 9f65f1b8512..a0e5bb7911b 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -55,16 +55,6 @@ namespace {

 // Returns true if the given instruction is rematerializable.
 bool IsRematerializable(const HloInstruction* instruction) {
-  // Conservatively, don't rematerialize instructions with control
-  // dependencies. For one, control dependencies are added to prevent
-  // interference of aliased buffers (say, in while bodies) and
-  // rematerialization is ignorant of liveness and may break the intended
-  // ordering.
-  if (!instruction->control_predecessors().empty() ||
-      !instruction->control_successors().empty()) {
-    return false;
-  }
-
   // Don't rematerialize instructions with side effects or instructions which
   // cannot be cloned safely.
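+  // Note that control dependencies alone no longer disqualify an
+  // instruction: when an instruction is rematerialized its control edges
+  // are copied to the clone, and candidates whose control successors are
+  // already placed are skipped (see PickRematerializationCandidate).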
   switch (instruction->opcode()) {
@@ -906,6 +896,19 @@ Item* PickRematerializationCandidate(const MemoryUsageTracker& memory_tracker,
       continue;
     }

+    // If any of the candidate's control successors has been placed, we need
+    // to skip this candidate; otherwise we would violate a control
+    // dependency.
+    bool control_successor_placed =
+        std::any_of(candidate->control_successors().begin(),
+                    candidate->control_successors().end(),
+                    [&memory_tracker](const HloInstruction* inst) {
+                      return memory_tracker.IsPlaced(inst);
+                    });
+
+    if (control_successor_placed) {
+      continue;
+    }
+
     const int64 memory_reduced =
         memory_tracker.MemoryReducedIfRematerialized(item);

@@ -1047,6 +1050,15 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
     HloInstruction* remat =
         computation->AddInstruction(best->Clone(/*suffix=*/"remat"));
+
+    // Add control dependencies to the new operation.
+    for (auto successor : best->control_successors()) {
+      TF_RETURN_IF_ERROR(remat->AddControlDependencyTo(successor));
+    }
+    for (auto predecessor : best->control_predecessors()) {
+      TF_RETURN_IF_ERROR(predecessor->AddControlDependencyTo(remat));
+    }
+
     Item* remat_item = instruction_list.CreateItem(remat);

     // Replace each remaining use of 'best' with the rematerialization.
@@ -1082,6 +1094,15 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         }
       }
     }
+    // Insert the rematerialized instruction before any of its control
+    // successors to preserve ordering with respect to control dependencies.
+    for (auto successor : remat->control_successors()) {
+      Item* successor_item = instruction_list.GetItem(successor);
+      // Assert to make sure we never remat an operation with a control
+      // successor that is already placed.
+      CHECK(!successor_item->placed);
+      place_before.push_back(successor_item);
+    }
     instruction_list.InsertBeforeInstructions(remat_item, place_before);

     // If the rematerialized instruction is dead then rematerialization is

From 7359fec792e4efec1670a12332bb524a5608b215 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 17 Aug 2017 18:04:58 -0700
Subject: [PATCH 12/70] Implement Batchnorm Inference by expanding it into
 smaller ops.

1. Add batch norm inference support in batchnorm_rewriter
2.
Connect xla's batchnorm inference to tf's FusedBatchNorm RELNOTES: n/a PiperOrigin-RevId: 165655351 --- .../compiler/tests/fused_batchnorm_test.py | 33 ++++ .../compiler/tf2xla/kernels/batch_norm_op.cc | 42 ++--- .../xla/client/computation_builder.cc | 26 +++- .../xla/service/batchnorm_rewriter.cc | 98 +++++++++++- .../compiler/xla/service/batchnorm_rewriter.h | 3 + .../xla/service/batchnorm_rewriter_test.cc | 2 + .../compiler/xla/service/cpu/cpu_compiler.cc | 1 + .../compiler/xla/service/dfs_hlo_visitor.h | 3 + .../service/dfs_hlo_visitor_with_default.h | 4 + .../compiler/xla/service/gpu/gpu_compiler.cc | 1 + .../compiler/xla/service/hlo_cost_analysis.cc | 6 + .../compiler/xla/service/hlo_cost_analysis.h | 1 + .../compiler/xla/service/hlo_graph_dumper.cc | 1 + .../compiler/xla/service/hlo_instruction.cc | 26 ++++ .../compiler/xla/service/hlo_instruction.h | 6 + tensorflow/compiler/xla/service/hlo_opcode.cc | 2 + tensorflow/compiler/xla/service/hlo_opcode.h | 1 + .../xla/service/instruction_fusion.cc | 1 + tensorflow/compiler/xla/service/service.cc | 4 + .../compiler/xla/service/shape_inference.cc | 144 ++++++++++++++++++ .../compiler/xla/service/shape_inference.h | 7 + .../compiler/xla/service/user_computation.cc | 100 ++++++++++++ .../compiler/xla/service/user_computation.h | 4 + .../xla/tests/batch_normalization_test.cc | 103 +++++++++++++ tensorflow/compiler/xla/xla_data.proto | 13 +- 25 files changed, 605 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py index f8e9fc92681..936fcf8b6be 100644 --- a/tensorflow/compiler/tests/fused_batchnorm_test.py +++ b/tensorflow/compiler/tests/fused_batchnorm_test.py @@ -63,6 +63,39 @@ class FusedBatchNormTest(XLATestCase): grad_offset = np.sum(grad_y, axis=(0, 1, 2)) return grad_x, grad_scale, grad_offset + def testInference(self): + x_shape = [2, 2, 6, 2] + scale_shape = [2] + x_val = np.random.random_sample(x_shape).astype(np.float32) + scale_val = np.random.random_sample(scale_shape).astype(np.float32) + + offset_val = np.random.random_sample(scale_shape).astype(np.float32) + data_format = "NHWC" + with self.test_session() as sess, self.test_scope(): + # To avoid constant folding + t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x") + scale = array_ops.placeholder(np.float32, shape=[2], name="scale") + offset = array_ops.placeholder(np.float32, shape=[2], name="offset") + epsilon = 0.001 + y_ref, mean_ref, var_ref = self._reference_training( + x_val, scale_val, offset_val, epsilon, data_format) + y, mean, variance = nn.fused_batch_norm( + t_val, + scale, + offset, + mean=mean_ref, + variance=var_ref, + epsilon=epsilon, + data_format=data_format, + is_training=False) + + y_val, _, _ = sess.run( + [y, mean, + variance], {t_val: x_val, + scale: scale_val, + offset: offset_val}) + self.assertAllClose(y_val, y_ref, atol=1e-3) + def _testLearning(self, use_gradient_checker): x_shape = [2, 2, 6, 2] scale_shape = [2] diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc index 3f23e459b98..9d2703bf952 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc @@ -39,28 +39,36 @@ class FusedBatchNormOp : public XlaOpKernel { errors::InvalidArgument("Not supported format")); feature_index_ = GetTensorFeatureDimIndex(/*num_dims=*/4, tensor_format); } - // TODO(b/62843645): Implement BatchNormInference. 
- OP_REQUIRES( - ctx, is_training_, - errors::InvalidArgument("Fused batch normalization for inference is " - "not supported yet on XLA backend.")); } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle output = ctx->builder()->BatchNormTraining( - ctx->Input(0), ctx->Input(1), ctx->Input(2), epsilon_, feature_index_); + if (is_training_) { + xla::ComputationDataHandle output = ctx->builder()->BatchNormTraining( + ctx->Input(0), ctx->Input(1), ctx->Input(2), epsilon_, + feature_index_); - // In training mode, outputs the normalized value as well as the calculated - // mean and variance. - for (int i = 0; i < 3; i++) { - ctx->SetOutput(i, ctx->builder()->GetTupleElement(output, i)); + // In training mode, outputs the normalized value as well as the + // calculated mean and variance. + for (int i = 0; i < 3; i++) { + ctx->SetOutput(i, ctx->builder()->GetTupleElement(output, i)); + } + // Output 3 and 4 for "FusedBatchNorm" are currently marked as "reserved + // space 1 & 2". They are used to pass the per-batch mean and + // variance to the gradient. Here we maintain the same behavior by setting + // them to the mean and variance calculated by BatchNormTraining. + ctx->SetOutput(3, ctx->builder()->GetTupleElement(output, 1)); + ctx->SetOutput(4, ctx->builder()->GetTupleElement(output, 2)); + } else { + xla::ComputationDataHandle output = ctx->builder()->BatchNormInference( + ctx->Input(0), ctx->Input(1), ctx->Input(2), ctx->Input(3), + ctx->Input(4), epsilon_, feature_index_); + ctx->SetOutput(0, output); + // Directly send input to output as mean and variance in inference mode. + ctx->SetOutput(1, ctx->Input(3)); + ctx->SetOutput(2, ctx->Input(4)); + ctx->SetOutput(3, ctx->Input(3)); + ctx->SetOutput(4, ctx->Input(4)); } - // Output 3 and 4 for "FusedBatchNorm" are currently marked as "reserved - // space 1 & 2". They are used to pass the per-batch mean and - // variance to the gradient. Here we maintain the same behavior by setting - // them to the mean and variance calculated by BatchNormTraining. - ctx->SetOutput(3, ctx->builder()->GetTupleElement(output, 1)); - ctx->SetOutput(4, ctx->builder()->GetTupleElement(output, 2)); } private: diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc index e6ffc4f98de..30afaed7323 100644 --- a/tensorflow/compiler/xla/client/computation_builder.cc +++ b/tensorflow/compiler/xla/client/computation_builder.cc @@ -1477,9 +1477,29 @@ ComputationDataHandle ComputationBuilder::BatchNormInference( const ComputationDataHandle& operand, const ComputationDataHandle& scale, const ComputationDataHandle& offset, const ComputationDataHandle& mean, const ComputationDataHandle& variance, float epsilon, int64 feature_index) { - // TODO(b/62843645): Implement BatchNormInference. 
- NoteError(Unimplemented("BatchNormInference is not implemented yet.")); - return ComputationDataHandle(); + if (!first_error_.ok() || !PrepareComputation().ok()) { + return ComputationDataHandle(); + } + BatchNormInferenceRequest request; + *request.mutable_operand() = operand; + *request.mutable_scale() = scale; + *request.mutable_offset() = offset; + *request.mutable_mean() = mean; + *request.mutable_variance() = variance; + request.set_epsilon(epsilon); + request.set_feature_index(feature_index); + + OpRequest op_request; + *op_request.mutable_batch_norm_inference_request() = request; + *op_request.mutable_computation() = computation_.handle(); + AddOpMetadata(&op_request); + + OpResponse response; + + VLOG(2) << "making BatchNormInference request"; + + Status s = client_->stub()->Op(&op_request, &response); + return ParseOpResponse(s, &response); } ComputationDataHandle ComputationBuilder::BatchNormGrad( diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/batchnorm_rewriter.cc index 721d99301a1..41d32d0c8b1 100644 --- a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc +++ b/tensorflow/compiler/xla/service/batchnorm_rewriter.cc @@ -56,11 +56,14 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault { Status HandleBatchNormTraining(HloInstruction* batch_norm) override; + Status HandleBatchNormInference(HloInstruction* batch_norm) override; + Status HandleBatchNormGrad(HloInstruction* batch_norm) override; // Runs the visitor on a computation. static bool Run(HloComputation* computation, bool rewrite_training_op, - bool rewrite_grad_op, bool use_fusion); + bool rewrite_inference_op, bool rewrite_grad_op, + bool use_fusion); // Returns whether any batch norm ops were rewritten. const bool changed() const { return changed_; } @@ -70,9 +73,11 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault { private: explicit BatchNormRewriterVisitor(HloComputation* computation, bool rewrite_training_op, + bool rewrite_inference_op, bool rewrite_grad_op, bool use_fusion) : computation_(computation), rewrite_training_op_(rewrite_training_op), + rewrite_inference_op_(rewrite_inference_op), rewrite_grad_op_(rewrite_grad_op), use_fusion_(use_fusion) {} @@ -94,6 +99,7 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault { HloComputation* computation_; bool rewrite_training_op_; + bool rewrite_inference_op_; bool rewrite_grad_op_; bool use_fusion_; @@ -126,11 +132,14 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault { bool BatchNormRewriterVisitor::Run(HloComputation* computation, bool rewrite_training_op, + bool rewrite_inference_op, bool rewrite_grad_op, bool use_fusion) { - BatchNormRewriterVisitor visitor(computation, - /*rewrite_training_op=*/rewrite_training_op, - /*rewrite_grad_op=*/rewrite_grad_op, - /*use_fusion=*/use_fusion); + BatchNormRewriterVisitor visitor( + computation, + /*rewrite_training_op=*/rewrite_training_op, + /*rewrite_inference_op=*/rewrite_inference_op, + /*rewrite_grad_op=*/rewrite_grad_op, + /*use_fusion=*/use_fusion); TF_CHECK_OK(computation->Accept(&visitor)); return visitor.changed_; } @@ -268,6 +277,82 @@ Status BatchNormRewriterVisitor::HandleBatchNormTraining( return Status::OK(); } +Status BatchNormRewriterVisitor::HandleBatchNormInference( + HloInstruction* batch_norm) { + if (!rewrite_inference_op_) { + return Status::OK(); + } + // Expand batch norm inference into smaller HLO ops. 
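+  // In scalar terms the expansion below computes, for each feature index f
+  // (an illustrative formula, matching the HLO ops constructed below):
+  //   Y[..., f] = (X[..., f] - mean[f]) * (var[f] + epsilon)^(-1/2)
+  //               * scale[f] + offset[f]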
+ HloInstruction* operand = batch_norm->mutable_operand(0); + const Shape operand_shape = operand->shape(); + int64 feature_index = batch_norm->feature_index(); + + HloInstruction* scale = batch_norm->mutable_operand(1); + HloInstruction* offset = batch_norm->mutable_operand(2); + HloInstruction* mean = batch_norm->mutable_operand(3); + HloInstruction* var = batch_norm->mutable_operand(4); + const Shape feature_shape = scale->shape(); + + auto epsilon = computation_->AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon()))); + + std::vector dimensions_without_feature; + + for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) { + if (i != feature_index) { + dimensions_without_feature.push_back(i); + } + } + + auto scale_broadcasted = computation_->AddInstruction( + HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index})); + + auto offset_broadcasted = computation_->AddInstruction( + HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index})); + + auto mean_broadcasted = computation_->AddInstruction( + HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index})); + + auto var_broadcasted = computation_->AddInstruction( + HloInstruction::CreateBroadcast(operand_shape, var, {feature_index})); + + // Var[X] + epsilon. + auto var_add_epsilon = + computation_->AddInstruction(HloInstruction::CreateBinary( + operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon)); + + auto neg_half = computation_->AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(-0.5f))); + + // 1 / Sqrt[Var[X] + epsilon]. + auto rsqrt_var_add_epsilon = + computation_->AddInstruction(HloInstruction::CreateBinary( + operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half)); + + // X - E[X]. + auto operand_minus_mean = + computation_->AddInstruction(HloInstruction::CreateBinary( + operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted)); + + // (X - E[X]) / Sqrt[Var[X] + epsilon]. + auto normalized = computation_->AddInstruction( + HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply, + operand_minus_mean, rsqrt_var_add_epsilon)); + + // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale. + auto scaled_normalized = + computation_->AddInstruction(HloInstruction::CreateBinary( + operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted)); + + // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset. + auto shifted_normalized = HloInstruction::CreateBinary( + operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted); + + TF_CHECK_OK( + ReplaceWithNewInstruction(batch_norm, std::move(shifted_normalized))); + return Status::OK(); +} + Status BatchNormRewriterVisitor::HandleBatchNormGrad( HloInstruction* batch_norm) { // Use the following formulas to calculate gradients: @@ -457,7 +542,8 @@ StatusOr BatchNormRewriter::Run(HloModule* module) { } for (auto& comp : computations) { if (BatchNormRewriterVisitor::Run(comp, rewrite_training_op_, - rewrite_grad_op_, use_fusion_)) { + rewrite_inference_op_, rewrite_grad_op_, + use_fusion_)) { changed = true; } } diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.h b/tensorflow/compiler/xla/service/batchnorm_rewriter.h index d3ffb31032e..f601741d964 100644 --- a/tensorflow/compiler/xla/service/batchnorm_rewriter.h +++ b/tensorflow/compiler/xla/service/batchnorm_rewriter.h @@ -30,8 +30,10 @@ class BatchNormRewriter : public HloPassInterface { public: // When use_fusion is set, a multi-output fusion node is created. 
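+  // When rewrite_inference_op is set, batch-norm-inference ops are likewise
+  // expanded into smaller HLO ops (see HandleBatchNormInference).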
BatchNormRewriter(bool rewrite_training_op = false, + bool rewrite_inference_op = false, bool rewrite_grad_op = false, bool use_fusion = true) : rewrite_training_op_(rewrite_training_op), + rewrite_inference_op_(rewrite_inference_op), rewrite_grad_op_(rewrite_grad_op), use_fusion_(use_fusion) {} ~BatchNormRewriter() = default; @@ -43,6 +45,7 @@ class BatchNormRewriter : public HloPassInterface { private: bool rewrite_training_op_; + bool rewrite_inference_op_; bool rewrite_grad_op_; bool use_fusion_; }; diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc b/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc index cc8dffcda51..07775623e75 100644 --- a/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc @@ -64,6 +64,7 @@ TEST_F(BatchNormRewriterTest, BatchNormTraining) { HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormTraining); BatchNormRewriter rewriter(/*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); @@ -105,6 +106,7 @@ TEST_F(BatchNormRewriterTest, BatchNormGrad) { HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormGrad); BatchNormRewriter rewriter(/*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index eca9b0f4bef..8a37c8108ea 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -260,6 +260,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module) { pipeline.AddPass>("simplification"); pass.AddPass( /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true, /*use_fusion=*/false); pass.AddPass( diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index e450b31ff18..4baa56658f7 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -228,6 +228,9 @@ class DfsHloVisitor { virtual Status HandleBatchNormTraining(HloInstruction* batchNormTraining) = 0; + virtual Status HandleBatchNormInference( + HloInstruction* batchNormInference) = 0; + virtual Status HandleBatchNormGrad(HloInstruction* batchNormGrad) = 0; // Invoked to inform the visitor that the traversal has completed, and that diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index c447165cecc..10f8ae9b044 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -54,6 +54,10 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor { return DefaultAction(hlo); } + Status HandleBatchNormInference(HloInstruction* hlo) override { + return DefaultAction(hlo); + } + Status HandleBatchNormGrad(HloInstruction* hlo) override { return DefaultAction(hlo); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 2a7486af881..cd913a4b5d6 100644 --- 
a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -135,6 +135,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, // instead. pass.AddPass( /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true, /*use_fusion=*/false); pass.AddPass( diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index d113ca2a76b..9dbde0ec243 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -374,6 +374,12 @@ Status HloCostAnalysis::HandleBatchNormTraining( return Status::OK(); } +Status HloCostAnalysis::HandleBatchNormInference( + HloInstruction* batchNormInference) { + // TODO(b/62294698): Implement cost analysis for batch-norm-inference. + return Status::OK(); +} + Status HloCostAnalysis::HandleBatchNormGrad(HloInstruction* batchNormGrad) { // TODO(b/62294698): Implement cost analysis for batch-norm-grad. return Status::OK(); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index ec48c8a0fd8..6d8fdfa64b5 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -89,6 +89,7 @@ class HloCostAnalysis : public DfsHloVisitor { tensorflow::gtl::ArraySlice dimensions, HloComputation* function_handle) override; Status HandleBatchNormTraining(HloInstruction* batchNormTraining) override; + Status HandleBatchNormInference(HloInstruction* batchNormInference) override; Status HandleBatchNormGrad(HloInstruction* batchNormGrad) override; Status HandleFusion(HloInstruction* fusion) override; Status HandleCall(HloInstruction* call) override; diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index d1c31963665..38b1291d440 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -742,6 +742,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kParameter: return kOrange; case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormInference: case HloOpcode::kBatchNormGrad: case HloOpcode::kReduce: case HloOpcode::kSelectAndScatter: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 825f3f8f60e..fb9dbd64216 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -406,6 +406,23 @@ HloInstruction::CreateBatchNormTraining(const Shape& shape, return instruction; } +/* static */ std::unique_ptr +HloInstruction::CreateBatchNormInference( + const Shape& shape, HloInstruction* operand, HloInstruction* scale, + HloInstruction* offset, HloInstruction* mean, HloInstruction* variance, + float epsilon, int64 feature_index) { + auto instruction = + WrapUnique(new HloInstruction(HloOpcode::kBatchNormInference, shape)); + instruction->AppendOperand(operand); + instruction->AppendOperand(scale); + instruction->AppendOperand(offset); + instruction->AppendOperand(mean); + instruction->AppendOperand(variance); + instruction->epsilon_ = epsilon; + instruction->feature_index_ = feature_index; + return instruction; +} + /* static */ std::unique_ptr HloInstruction::CreateBatchNormGrad(const Shape& shape, HloInstruction* operand, HloInstruction* scale, 
HloInstruction* mean, @@ -1065,6 +1082,12 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( return CreateBatchNormTraining(shape, new_operands[0], new_operands[1], new_operands[2], epsilon(), feature_index()); + + case HloOpcode::kBatchNormInference: + CHECK_EQ(new_operands.size(), 5); + return CreateBatchNormInference( + shape, new_operands[0], new_operands[1], new_operands[2], + new_operands[3], new_operands[4], epsilon(), feature_index()); case HloOpcode::kInfeed: CHECK_EQ(new_operands.size(), 0); return CreateInfeed(shape, infeed_config()); @@ -1355,6 +1378,7 @@ bool HloInstruction::IdenticalSlowPath( ShapeUtil::Compatible(shape(), other.shape()); case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormInference: case HloOpcode::kBatchNormGrad: return feature_index() == other.feature_index() && epsilon() == other.epsilon(); @@ -1952,6 +1976,8 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) { return visitor->HandleAbs(this, operands_[0]); case HloOpcode::kBatchNormTraining: return visitor->HandleBatchNormTraining(this); + case HloOpcode::kBatchNormInference: + return visitor->HandleBatchNormInference(this); case HloOpcode::kBatchNormGrad: return visitor->HandleBatchNormGrad(this); case HloOpcode::kSign: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index f2005380d8e..d246720b3cf 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -224,6 +224,12 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, HloInstruction* scale, HloInstruction* offset, float epsilon, int64 feature_index); + // Creates a batch-norm-inference instruction. + static std::unique_ptr CreateBatchNormInference( + const Shape& shape, HloInstruction* operand, HloInstruction* scale, + HloInstruction* offset, HloInstruction* mean, HloInstruction* variance, + float epsilon, int64 feature_index); + // Creates a batch-norm-grad instruction. static std::unique_ptr CreateBatchNormGrad( const Shape& shape, HloInstruction* operand, HloInstruction* scale, diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc index 3888f757ada..314512d0a8d 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.cc +++ b/tensorflow/compiler/xla/service/hlo_opcode.cc @@ -33,6 +33,8 @@ string HloOpcodeString(HloOpcode opcode) { return "add"; case HloOpcode::kBatchNormTraining: return "batch-norm-training"; + case HloOpcode::kBatchNormInference: + return "batch-norm-inference"; case HloOpcode::kBatchNormGrad: return "batch-norm-grad"; case HloOpcode::kBitcast: diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 8a6376b2d1c..c4d5efad903 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -31,6 +31,7 @@ enum class HloOpcode { kAbs, kAdd, kBatchNormTraining, + kBatchNormInference, kBatchNormGrad, kBitcast, kBroadcast, diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 4333db17e75..edfcb0922d6 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -78,6 +78,7 @@ namespace xla { // Expensive instructions. 
case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormInference: case HloOpcode::kBatchNormGrad: case HloOpcode::kCall: case HloOpcode::kConvolution: diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index ad2d5235f8d..d63d33ecb00 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -1211,6 +1211,10 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) { handle_status = computation->AddBatchNormTrainingInstruction( arg->batch_norm_training_request()); break; + case OpRequest::kBatchNormInferenceRequest: + handle_status = computation->AddBatchNormInferenceInstruction( + arg->batch_norm_inference_request()); + break; case OpRequest::kBatchNormGradRequest: handle_status = computation->AddBatchNormGradInstruction( arg->batch_norm_grad_request()); diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 2c2b0cca5fd..8eeb1cd5d20 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -885,6 +885,150 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( output_shape_for_mean_and_var}); } +/* static */ StatusOr ShapeInference::InferBatchNormInferenceShape( + const Shape& operand_shape, const Shape& offset_shape, + const Shape& scale_shape, const Shape& mean_shape, + const Shape& variance_shape, int64 feature_index) { + TF_RETURN_IF_ERROR( + ExpectNotTupleOrOpaque(operand_shape, "operand of batch norm inference")); + TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( + offset_shape, "offset input of batch norm inference")); + TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque( + scale_shape, "scale input of batch norm inference")); + + TF_RET_CHECK(ShapeUtil::ValidateShape(operand_shape) == + tensorflow::Status::OK()); + TF_RET_CHECK(ShapeUtil::ValidateShape(offset_shape) == + tensorflow::Status::OK()); + TF_RET_CHECK(ShapeUtil::ValidateShape(scale_shape) == + tensorflow::Status::OK()); + TF_RET_CHECK(ShapeUtil::ValidateShape(mean_shape) == + tensorflow::Status::OK()); + TF_RET_CHECK(ShapeUtil::ValidateShape(variance_shape) == + tensorflow::Status::OK()); + + if (feature_index >= ShapeUtil::Rank(operand_shape)) { + return InvalidArgument( + "Expected feature_index of batch-norm-inference to be " + "smaller than the rank of operand_shape; " + "got feature_index %lld, and rank %lld", + feature_index, ShapeUtil::Rank(operand_shape)); + } + + if (feature_index < 0) { + return InvalidArgument( + "Expected feature_index of batch-norm-inference to " + "be a non-negative number, got %lld", + feature_index); + } + + if (ShapeUtil::Rank(operand_shape) < 1) { + return InvalidArgument( + "Expected the rank of operand to " + "batch-norm-inference to be at least 1; got %lld", + ShapeUtil::Rank(operand_shape)); + } + + if (ShapeUtil::Rank(offset_shape) != 1) { + return InvalidArgument( + "Offset input of batch-norm-inference must have" + " rank 1, but has rank %lld.", + ShapeUtil::Rank(offset_shape)); + } + + if (ShapeUtil::Rank(scale_shape) != 1) { + return InvalidArgument( + "Scale input of batch-norm-inference must have" + " rank 1, but has rank %lld.", + ShapeUtil::Rank(scale_shape)); + } + + if (!ShapeUtil::ElementIsFloating(operand_shape)) { + return InvalidArgument( + "The operand to batch-norm-inference must have a floating point " + "element type, but the shape is %s", + PrimitiveType_Name(operand_shape.element_type()).c_str()); + } + + if 
(!ShapeUtil::SameElementType(offset_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for "
+        "batch-norm-inference, "
+        "but the shape of offset factor is %s "
+        "and the shape of operand is %s",
+        PrimitiveType_Name(offset_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(scale_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for "
+        "batch-norm-inference, "
+        "but the shape of scale factor is %s "
+        "and the shape of operand is %s",
+        PrimitiveType_Name(scale_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(mean_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for "
+        "batch-norm-inference, "
+        "but the shape of mean is %s "
+        "and the shape of operand is %s",
+        PrimitiveType_Name(mean_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  if (!ShapeUtil::SameElementType(variance_shape, operand_shape)) {
+    return InvalidArgument(
+        "The inputs should have the same element type for "
+        "batch-norm-inference, "
+        "but the shape of variance is %s "
+        "and the shape of operand is %s",
+        PrimitiveType_Name(variance_shape.element_type()).c_str(),
+        PrimitiveType_Name(operand_shape.element_type()).c_str());
+  }
+
+  const int64 feature_count = operand_shape.dimensions(feature_index);
+  Shape output_shape_for_mean_and_var =
+      ShapeUtil::MakeShape(operand_shape.element_type(), {feature_count});
+
+  if (ShapeUtil::GetDimension(offset_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of offset factor should be the same as feature count, "
+        "but the size of offset factor is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(offset_shape, 0), feature_count);
+  }
+
+  if (ShapeUtil::GetDimension(scale_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of scale factor should be the same as feature count, "
+        "but the size of scale factor is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(scale_shape, 0), feature_count);
+  }
+
+  if (ShapeUtil::GetDimension(mean_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of mean should be the same as feature count, "
+        "but the size of mean is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(mean_shape, 0), feature_count);
+  }
+
+  if (ShapeUtil::GetDimension(variance_shape, 0) != feature_count) {
+    return InvalidArgument(
+        "The size of variance should be the same as feature count, "
+        "but the size of variance is %lld "
+        "and the feature count is %lld",
+        ShapeUtil::GetDimension(variance_shape, 0), feature_count);
+  }
+
+  return operand_shape;
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferBatchNormGradShape(
     const Shape& operand_shape, const Shape& scale_shape,
     const Shape& mean_shape, const Shape& var_shape,

diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index f3f0176a434..5d55df92a91 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -71,6 +71,13 @@ class ShapeInference {
                                                    const Shape& scale_shape,
                                                    int64 feature_index);

+  // Infers the shape produced by InferBatchNormInference with the given
+  // operands.
+ static StatusOr InferBatchNormInferenceShape( + const Shape& operand_shape, const Shape& offset_shape, + const Shape& scale_shape, const Shape& mean_shape, + const Shape& variance_shape, int64 feature_index); + // Infers the shape produced by InferBatchNormGrad with the given operands. static StatusOr InferBatchNormGradShape(const Shape& operand_shape, const Shape& scale_shape, diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc index 3b280c97278..cfa5c98f593 100644 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ b/tensorflow/compiler/xla/service/user_computation.cc @@ -507,6 +507,53 @@ UserComputation::AddBatchNormTrainingInstruction( return handle; } +StatusOr +UserComputation::AddBatchNormInferenceInstruction( + const BatchNormInferenceRequest& batch_norm_inference_request) { + tensorflow::mutex_lock lock(mutex_); + + TF_ASSIGN_OR_RETURN(const OperationRequest* operand, + LookUpRequest(batch_norm_inference_request.operand())); + + TF_ASSIGN_OR_RETURN(const OperationRequest* scale, + LookUpRequest(batch_norm_inference_request.scale())); + + TF_ASSIGN_OR_RETURN(const OperationRequest* offset, + LookUpRequest(batch_norm_inference_request.offset())); + + TF_ASSIGN_OR_RETURN(const OperationRequest* mean, + LookUpRequest(batch_norm_inference_request.mean())); + + TF_ASSIGN_OR_RETURN(const OperationRequest* variance, + LookUpRequest(batch_norm_inference_request.variance())); + + ComputationDataHandle handle = CreateComputationDataHandle(); + + OperationRequest& request = + (*session_computation_.mutable_requests())[handle.handle()]; + + TF_ASSIGN_OR_RETURN(Shape inferred_shape, + ShapeInference::InferBatchNormInferenceShape( + operand->output_shape(), scale->output_shape(), + offset->output_shape(), mean->output_shape(), + variance->output_shape(), + batch_norm_inference_request.feature_index())); + + *request.mutable_output_shape() = inferred_shape; + + *request.mutable_output_handle() = handle; + + *request.mutable_request()->mutable_batch_norm_inference_request() = + batch_norm_inference_request; + + VLOG(1) << "AddBatchNormInferenceInstruction (" + << GetVersionedHandleInternal() << "), data handle " + << handle.handle() << ": " + << batch_norm_inference_request.ShortDebugString(); + + return handle; +} + StatusOr UserComputation::AddBatchNormGradInstruction( const BatchNormGradRequest& batch_norm_grad_request) { tensorflow::mutex_lock lock(mutex_); @@ -1678,6 +1725,25 @@ void ConstantVisitor(const SessionComputation& session_computation, break; } + case OpRequest::kBatchNormInferenceRequest: { + const BatchNormInferenceRequest& batch_norm_inference_request = + request.request().batch_norm_inference_request(); + ConstantVisitor(session_computation, + batch_norm_inference_request.operand(), visited, + is_constant); + ConstantVisitor(session_computation, batch_norm_inference_request.scale(), + visited, is_constant); + ConstantVisitor(session_computation, + batch_norm_inference_request.offset(), visited, + is_constant); + ConstantVisitor(session_computation, batch_norm_inference_request.mean(), + visited, is_constant); + ConstantVisitor(session_computation, + batch_norm_inference_request.variance(), visited, + is_constant); + break; + } + case OpRequest::kBatchNormGradRequest: { const BatchNormGradRequest& batch_norm_grad_request = request.request().batch_norm_grad_request(); @@ -2119,6 +2185,18 @@ static void ForEachOperand( break; } + case OpRequest::kBatchNormInferenceRequest: { + const 
BatchNormInferenceRequest& batch_norm_inference_request = + request.request().batch_norm_inference_request(); + + apply(batch_norm_inference_request.operand()); + apply(batch_norm_inference_request.scale()); + apply(batch_norm_inference_request.offset()); + apply(batch_norm_inference_request.mean()); + apply(batch_norm_inference_request.variance()); + break; + } + case OpRequest::kBatchNormGradRequest: { const BatchNormGradRequest& batch_norm_grad_request = request.request().batch_norm_grad_request(); @@ -2647,6 +2725,28 @@ void ComputationLowerer::Visit( break; } + case OpRequest::kBatchNormInferenceRequest: { + const BatchNormInferenceRequest& batch_norm_inference_request = + request.request().batch_norm_inference_request(); + HloInstruction* operand = + lookup_instruction(batch_norm_inference_request.operand()); + HloInstruction* scale = + lookup_instruction(batch_norm_inference_request.scale()); + HloInstruction* offset = + lookup_instruction(batch_norm_inference_request.offset()); + HloInstruction* mean = + lookup_instruction(batch_norm_inference_request.mean()); + HloInstruction* variance = + lookup_instruction(batch_norm_inference_request.variance()); + + hlo_instruction = + add_instruction(HloInstruction::CreateBatchNormInference( + request.output_shape(), operand, scale, offset, mean, variance, + batch_norm_inference_request.epsilon(), + batch_norm_inference_request.feature_index())); + break; + } + case OpRequest::kBatchNormGradRequest: { const BatchNormGradRequest& batch_norm_grad_request = request.request().batch_norm_grad_request(); diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h index 36b1d34e05d..b779b1f76c8 100644 --- a/tensorflow/compiler/xla/service/user_computation.h +++ b/tensorflow/compiler/xla/service/user_computation.h @@ -89,6 +89,10 @@ class UserComputation { StatusOr AddBatchNormTrainingInstruction( const BatchNormTrainingRequest& batch_norm_training_request); + // Enqueues a batch norm inference instruction onto this user computation. + StatusOr AddBatchNormInferenceInstruction( + const BatchNormInferenceRequest& batch_norm_inference_request); + // Enqueues a batch norm grad instruction onto this user computation. 
StatusOr<ComputationDataHandle> AddBatchNormGradInstruction(
      const BatchNormGradRequest& batch_norm_grad_request);

diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 34b3abb8c75..028d1251b45 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -306,6 +306,109 @@ XLA_TEST_P(BatchNormTest, RandomizedTests) {
       ErrorSpec(0.01, 1));
 }

+XLA_TEST_P(BatchNormTest, RandomizedInferencingTests) {
+  float epsilon = 0.001;
+  ComputationBuilder builder(client_, TestName());
+  const std::vector<int64>& bounds = GetParam().bounds;
+  Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
+  input_array.FillRandom(GetParam().random_value_var,
+                         GetParam().random_value_mean);
+
+  const int64 feature_index = GetParam().feature_index;
+  const int64 num_elements_per_feature =
+      Product(bounds) / bounds[feature_index];
+  const int64 feature_bound = bounds[feature_index];
+  std::vector<float> offset(feature_bound, 1);
+  std::vector<float> scale(feature_bound, 2);
+
+  auto input_squared =
+      ReferenceUtil::MapArray4D(input_array, [](float a) { return a * a; });
+  std::vector<int64> reduce_dims;
+  for (int64 i = 0; i < static_cast<int64>(bounds.size()); ++i) {
+    if (i != feature_index) {
+      reduce_dims.push_back(i);
+    }
+  }
+
+  auto sum =
+      ReferenceUtil::Reduce4DTo1D(input_array, /*init=*/0.0f, reduce_dims,
+                                  [](float a, float b) { return a + b; });
+
+  auto sum_squared =
+      ReferenceUtil::Reduce4DTo1D(*input_squared, /*init=*/0.0f, reduce_dims,
+                                  [](float a, float b) { return a + b; });
+
+  std::vector<float> mean(feature_bound);
+
+  for (int64 i = 0; i < feature_bound; ++i) {
+    mean[i] = sum[i] / num_elements_per_feature;
+  }
+
+  std::vector<float> mean_square(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    mean_square[i] = mean[i] * mean[i];
+  }
+
+  std::vector<float> square_mean(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    square_mean[i] = sum_squared[i] / num_elements_per_feature;
+  }
+
+  std::vector<float> var(feature_bound);
+  for (int64 i = 0; i < feature_bound; ++i) {
+    var[i] = square_mean[i] - mean_square[i];
+  }
+
+  Array4D<float> mean4D =
+      *ReferenceUtil::Broadcast1DTo4D(mean, bounds, feature_index);
+  auto var4D = *ReferenceUtil::Broadcast1DTo4D(var, bounds, feature_index);
+  auto scale4D = *ReferenceUtil::Broadcast1DTo4D(scale, bounds, feature_index);
+  auto offset4D =
+      *ReferenceUtil::Broadcast1DTo4D(offset, bounds, feature_index);
+
+  auto normalized = *ReferenceUtil::BatchNorm4D(input_array, mean4D, var4D,
+                                                scale4D, offset4D, epsilon);
+
+  auto offset_literal = Literal::CreateR1<float>(offset);
+  auto scale_literal = Literal::CreateR1<float>(scale);
+  auto mean_literal = Literal::CreateR1<float>(mean);
+  auto var_literal = Literal::CreateR1<float>(var);
+  auto input_literal = Literal::CreateR4FromArray4D<float>(input_array);
+
+  auto input_activations =
+      builder.Parameter(0, input_literal->shape(), "input");
+  auto scale_activations =
+      builder.Parameter(1, scale_literal->shape(), "scale");
+  auto offset_activations =
+      builder.Parameter(2, offset_literal->shape(), "offset");
+  auto mean_activations = builder.Parameter(3, mean_literal->shape(), "mean");
+  auto variance_activations =
+      builder.Parameter(4, var_literal->shape(), "variance");
+
+  Array4D<float> expected = normalized;
+
+  std::unique_ptr<GlobalData> input_data =
+      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> scale_data =
+      client_->TransferToServer(*scale_literal).ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> offset_data =
client_->TransferToServer(*offset_literal).ConsumeValueOrDie(); + std::unique_ptr mean_data = + client_->TransferToServer(*mean_literal).ConsumeValueOrDie(); + std::unique_ptr variance_data = + client_->TransferToServer(*var_literal).ConsumeValueOrDie(); + + builder.BatchNormInference(input_activations, scale_activations, + offset_activations, mean_activations, + variance_activations, epsilon, feature_index); + + ComputeAndCompareR4( + &builder, expected, + {input_data.get(), scale_data.get(), offset_data.get(), mean_data.get(), + variance_data.get()}, + ErrorSpec(0.01, 1)); +} + XLA_TEST_P(BatchNormTest, RandomizedGradTests) { float epsilon = 0.001; ComputationBuilder builder(client_, TestName()); diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index 38e6675ab7e..185ca7e681c 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -491,6 +491,16 @@ message BatchNormTrainingRequest { int64 feature_index = 5; } +message BatchNormInferenceRequest { + ComputationDataHandle operand = 1; + ComputationDataHandle scale = 2; + ComputationDataHandle offset = 3; + ComputationDataHandle mean = 4; + ComputationDataHandle variance = 5; + float epsilon = 6; + int64 feature_index = 7; +} + message BatchNormGradRequest { ComputationDataHandle operand = 1; ComputationDataHandle scale = 2; @@ -813,7 +823,8 @@ message OpRequest { OutfeedRequest outfeed_request = 32; BatchNormTrainingRequest batch_norm_training_request = 35; BatchNormGradRequest batch_norm_grad_request = 37; - // Next: 38 + BatchNormInferenceRequest batch_norm_inference_request = 38; + // Next: 39 } } From 00594ecdd685a2b1eaebb3bcc6b9764bfd4ae5d6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 17 Aug 2017 19:27:58 -0700 Subject: [PATCH 13/70] New landing page and leftnav for Programmer's Guide. PiperOrigin-RevId: 165660897 --- .../docs_src/programmers_guide/dims_types.md | 69 ----------------- tensorflow/docs_src/programmers_guide/faq.md | 51 +++++-------- .../docs_src/programmers_guide/index.md | 75 ++++++++++--------- .../docs_src/programmers_guide/leftnav_files | 6 +- 4 files changed, 63 insertions(+), 138 deletions(-) delete mode 100644 tensorflow/docs_src/programmers_guide/dims_types.md diff --git a/tensorflow/docs_src/programmers_guide/dims_types.md b/tensorflow/docs_src/programmers_guide/dims_types.md deleted file mode 100644 index 65b748d56ec..00000000000 --- a/tensorflow/docs_src/programmers_guide/dims_types.md +++ /dev/null @@ -1,69 +0,0 @@ -# Tensor Ranks, Shapes, and Types - -TensorFlow programs use a tensor data structure to represent all data. You can -think of a TensorFlow tensor as an n-dimensional array or list. -A tensor has a static type and dynamic dimensions. Only tensors may be passed -between nodes in the computation graph. - -## Rank - -In the TensorFlow system, tensors are described by a unit of dimensionality -known as *rank*. Tensor rank is not the same as matrix rank. Tensor rank -(sometimes referred to as *order* or *degree* or *n-dimension*) is the number -of dimensions of the tensor. For example, the following tensor (defined as a -Python list) has a rank of 2: - - t = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - -A rank two tensor is what we typically think of as a matrix, a rank one tensor -is a vector. For a rank two tensor you can access any element with the syntax -`t[i, j]`. For a rank three tensor you would need to address an element with -`t[i, j, k]`. 
- -Rank | Math entity | Python example ---- | --- | --- -0 | Scalar (magnitude only) | `s = 483` -1 | Vector (magnitude and direction) | `v = [1.1, 2.2, 3.3]` -2 | Matrix (table of numbers) | `m = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]` -3 | 3-Tensor (cube of numbers) | `t = [[[2], [4], [6]], [[8], [10], [12]], [[14], [16], [18]]]` -n | n-Tensor (you get the idea) | `....` - -## Shape - -The TensorFlow documentation uses three notational conventions to describe -tensor dimensionality: rank, shape, and dimension number. The following table -shows how these relate to one another: - -Rank | Shape | Dimension number | Example ---- | --- | --- | --- -0 | [] | 0-D | A 0-D tensor. A scalar. -1 | [D0] | 1-D | A 1-D tensor with shape [5]. -2 | [D0, D1] | 2-D | A 2-D tensor with shape [3, 4]. -3 | [D0, D1, D2] | 3-D | A 3-D tensor with shape [1, 4, 3]. -n | [D0, D1, ... Dn-1] | n-D | A tensor with shape [D0, D1, ... Dn-1]. - -Shapes can be represented via Python lists / tuples of ints, or with the -@{tf.TensorShape}. - -## Data types - -In addition to dimensionality, Tensors have a data type. You can assign any one -of the following data types to a tensor: - -Data type | Python type | Description ---- | --- | --- -`DT_FLOAT` | `tf.float32` | 32 bits floating point. -`DT_DOUBLE` | `tf.float64` | 64 bits floating point. -`DT_INT8` | `tf.int8` | 8 bits signed integer. -`DT_INT16` | `tf.int16` | 16 bits signed integer. -`DT_INT32` | `tf.int32` | 32 bits signed integer. -`DT_INT64` | `tf.int64` | 64 bits signed integer. -`DT_UINT8` | `tf.uint8` | 8 bits unsigned integer. -`DT_UINT16` | `tf.uint16` | 16 bits unsigned integer. -`DT_STRING` | `tf.string` | Variable length byte arrays. Each element of a Tensor is a byte array. -`DT_BOOL` | `tf.bool` | Boolean. -`DT_COMPLEX64` | `tf.complex64` | Complex number made of two 32 bits floating points: real and imaginary parts. -`DT_COMPLEX128` | `tf.complex128` | Complex number made of two 64 bits floating points: real and imaginary parts. -`DT_QINT8` | `tf.qint8` | 8 bits signed integer used in quantized Ops. -`DT_QINT32` | `tf.qint32` | 32 bits signed integer used in quantized Ops. -`DT_QUINT8` | `tf.quint8` | 8 bits unsigned integer used in quantized Ops. diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md index 56486a48b7a..865016dc02d 100644 --- a/tensorflow/docs_src/programmers_guide/faq.md +++ b/tensorflow/docs_src/programmers_guide/faq.md @@ -53,10 +53,6 @@ TensorFlow assigns operations to devices, and the @{$deep_cnn$CIFAR-10 tutorial} for an example model that uses multiple GPUs. -#### What are the different types of tensors that are available? - -TensorFlow supports a variety of different data types and tensor shapes. See the -@{$dims_types$ranks, shapes, and types reference} for more details. ## Running a TensorFlow computation @@ -171,7 +167,8 @@ available. These operations allow you to build sophisticated @{$reading_data$input pipelines}, at the cost of making the TensorFlow computation somewhat more complicated. See the how-to documentation for -@{$reading_data#creating-threads-to-prefetch-using-queuerunner-objects$using `QueueRunner` objects to drive queues and readers} +@{$reading_data#creating-threads-to-prefetch-using-queuerunner-objects$using +`QueueRunner` objects to drive queues and readers} for more information on how to use them. ## Variables @@ -240,11 +237,6 @@ to encode the batch size as a Python constant, but instead to use a symbolic * Use @{tf.reduce_mean} instead of `tf.reduce_sum(...) 
/ batch_size`.
-* If you use
-  @{$reading_data#feeding$placeholders for feeding input},
-  you can specify a variable batch dimension by creating the placeholder with
-  [`tf.placeholder(..., shape=[None, ...])`](../api_docs/python/io_ops.md#placeholder). The
-  `None` element of the shape corresponds to a variable-sized dimension.

 ## TensorBoard

@@ -269,36 +261,33 @@ the flag --host=localhost. This should quiet any security warnings.

 ## Extending TensorFlow

-See also the how-to documentation for
+See the how-to documentation for
 @{$adding_an_op$adding a new operation to TensorFlow}.

 #### My data is in a custom format. How do I read it using TensorFlow?

-There are two main options for dealing with data in a custom format.
+There are three main options for dealing with data in a custom format.

-The easier option is to write parsing code in Python that transforms the data
-into a numpy array, then feed a
-@{tf.placeholder} a tensor with
-that data. See the documentation on
-@{$reading_data#feeding$using placeholders for input} for
-more details. This approach is easy to get up and running, but the parsing can
-be a performance bottleneck.
+The easiest option is to write parsing code in Python that transforms the data
+into a numpy array. Then use @{tf.contrib.data.Dataset.from_tensor_slices} to
+create an input pipeline from the in-memory data.
+
+If your data doesn't fit in memory, try doing the parsing in the Dataset
+pipeline. Start with an appropriate file reader, like
+@{tf.contrib.data.TextLineDataset}. Then convert the dataset by
+@{tf.contrib.data.Dataset.map$mapping} appropriate operations over it.
+Prefer predefined TensorFlow operations such as @{tf.decode_raw},
+@{tf.decode_csv}, @{tf.parse_example}, or @{tf.image.decode_png}.
+
+If your data is not easily parsable with the built-in TensorFlow operations,
+consider converting it, offline, to a format that is easily parsable, such
+as the @{tf.python_io.TFRecordWriter$`TFRecord`} format.
+
+The most efficient way to customize the parsing behavior is to
 @{$adding_an_op$add a new op written in C++} that parses your
-data format. The
-@{$new_data_formats$guide to handling new data formats} has
+data format. The @{$new_data_formats$guide to handling new data formats} has
 more information about the steps for doing this.

-#### How do I define an operation that takes a variable number of inputs?
-
-The TensorFlow op registration mechanism allows you to define inputs that are a
-single tensor, a list of tensors with the same type (for example when adding
-together a variable-length list of tensors), or a list of tensors with different
-types (for example when enqueuing a tuple of tensors to a queue). See the
-how-to documentation for
-@{$adding_an_op#list-inputs-and-outputs$adding an op with a list of inputs or outputs}
-for more details of how to define these different input types.

 ## Miscellaneous

diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index aa2e12504dd..214f3028e07 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -1,38 +1,45 @@
 # Programmer's Guide

 The documents in this unit dive into the details of writing TensorFlow
-code. This section begins with the following guides, each of which
-explains a particular aspect of TensorFlow:
+code. For TensorFlow 1.3, we revised this document extensively.
+The units are now as follows: - * @{$variables$Variables: Creation, Initialization, Saving, Loading, and - Sharing}, which details the mechanics of TensorFlow Variables. - * @{$dims_types$Tensor Ranks, Shapes, and Types}, which explains Tensor - rank (the number of dimensions), shape (the size of each dimension), - and datatypes. - * @{$threading_and_queues$Threading and Queues}, which explains TensorFlow's - rich queuing system. - * @{$reading_data$Reading Data}, which documents three different mechanisms - for getting data into a TensorFlow program. - -The following guide is helpful when training a complex model over multiple -days: - - * @{$supervisor$Supervisor: Training Helper for Days-Long Trainings}, which - explains how to gracefully handle system crashes during a lengthy training - session. - -TensorFlow provides a debugger named `tfdbg`, which is documented in the -following guide: - - * @{$debugger$Debugging TensorFlow Programs}, - which walks you through the use of `tfdbg` within an application. It covers - using `tfdbg` with both the low-level TensorFlow API and the Estimator API. - -To learn about the TensorFlow versioning scheme consult: - - * @{$version_compat$The TensorFlow Version Compatibility Guide}, which explains -TensorFlow's versioning nomenclature and compatibility rules. - -We conclude this section with a FAQ about TensorFlow programming: - - * @{$faq$Frequently Asked Questions} + * @{$programmers_guide/tensors$Tensors}, which explains how to create, + manipulate, and access Tensors--the fundamental object in TensorFlow. + * @{$programmers_guide/variables$Variables}, which details how + to represent shared, persistent state in your program. + * @{$programmers_guide/graphs$Graphs and Sessions}, which explains: + * dataflow graphs, which are TensorFlow's representation of computations + as dependencies between operations. + * sessions, which are TensorFlow's mechanism for running dataflow graphs + across one or more local or remote devices. + If you are programming with the low-level TensorFlow API, this unit + is essential. If you are programming with a high-level TensorFlow API + such as Estimators or Keras, the high-level API creates and manages + graphs and sessions for you, but understanding graphs and sessions + can still be helpful. + * @{$programmers_guide/estimators$Estimators}, which introduces a high-level + TensorFlow API that greatly simplifies ML programming. + * @{$programmers_guide/saved_model$Saving and Restoring}, which + explains how to save and restore variables and models. + * @{$programmers_guide/datasets$Input Pipelines}, which explains how to + set up data pipelines to read data sets into your TensorFlow program. + * @{$programmers_guide/threading_and_queues$Threading and Queues}, which + explains TensorFlow's older system for multi-threaded, queue-based input + pipelines. Beginning with TensorFlow 1.2, we recommend using the + `tf.contrib.data` module instead, which is documented in the + "Input Pipelines" unit. + * @{$programmers_guide/embedding$Embeddings}, which introduces the concept + of embeddings, provides a simple example of training an embedding in + TensorFlow, and explains how to view embeddings with the TensorBoard + Embedding Projector. + * @{$programmers_guide/debugger$Debugging TensorFlow Programs}, which + explains how to use the TensorFlow debugger (tfdbg). 
+ * @{$programmers_guide/supervisor$Supervisor: Training Helper for Days-Long Trainings}, + which explains how to gracefully handle system crashes during lengthy + training sessions. (We have not revised this document for v1.3.) + * @{$programmers_guide/version_compat$TensorFlow Version Compatibility}, + which explains backward compatibility guarantees and non-guarantees. + * @{$programmers_guide/faq$FAQ}, which contains frequently asked + questions about TensorFlow. (We have not revised this document for v1.3, + except to remove some obsolete information.) diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files index 2a58c4647d1..5082e7f36c8 100644 --- a/tensorflow/docs_src/programmers_guide/leftnav_files +++ b/tensorflow/docs_src/programmers_guide/leftnav_files @@ -1,15 +1,13 @@ index.md tensors.md variables.md -dims_types.md graphs.md +estimators.md +saved_model.md datasets.md threading_and_queues.md -reading_data.md embedding.md debugger.md supervisor.md -saved_model.md -meta_graph.md version_compat.md faq.md From 711be6adcffde0688e3bf04b791b517a28fc5045 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Thu, 17 Aug 2017 20:21:45 -0700 Subject: [PATCH 14/70] `Dataset.from_generator()` constructs a dataset from a Python generator. With this change, it becomes possible to use a Python generator as the source dataset for a `tf.contrib.data` input pipeline. This enables easier integration with non-TensorFlow data sources. The generator can yield a nested structure of NumPy arrays, or values convertible to NumPy arrays. This addresses a concern raised in issue #7951. PiperOrigin-RevId: 165663857 --- .../dataset_constructor_op_test.py | 210 ++++++++++++++++++ tensorflow/contrib/data/python/ops/BUILD | 1 + .../contrib/data/python/ops/dataset_ops.py | 170 ++++++++++++++ tensorflow/core/kernels/map_dataset_op.cc | 12 +- 4 files changed, 391 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py index 6a7bc99fa88..1de2f8e4da5 100644 --- a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import threading + import numpy as np from tensorflow.contrib.data.python.ops import dataset_ops @@ -255,6 +257,214 @@ class DatasetConstructorTest(test.TestCase): self.assertEquals(dtypes.int64, get_next.dtype) self.assertEquals([3], get_next.shape) + def _testFromGenerator(self, generator, elem_sequence, num_repeats): + iterator = ( + dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64) + .repeat(num_repeats) + .prefetch(5) + .make_initializable_iterator()) + init_op = iterator.initializer + get_next = iterator.get_next() + + with self.test_session() as sess: + for _ in range(2): # Run twice to test reinitialization. 
+ sess.run(init_op) + for _ in range(num_repeats): + for elem in elem_sequence: + self.assertAllEqual(elem, sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def _testFromGeneratorOneShot(self, generator, elem_sequence, num_repeats): + iterator = ( + dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64) + .repeat(num_repeats) + .prefetch(5) + .make_one_shot_iterator()) + get_next = iterator.get_next() + + with self.test_session() as sess: + for _ in range(num_repeats): + for elem in elem_sequence: + self.assertAllEqual(elem, sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testFromGeneratorUsingFunction(self): + def generator(): + for i in range(1, 100): + yield [i] * i + elem_sequence = list(generator()) + self._testFromGenerator(generator, elem_sequence, 1) + self._testFromGenerator(generator, elem_sequence, 5) + self._testFromGeneratorOneShot(generator, elem_sequence, 1) + self._testFromGeneratorOneShot(generator, elem_sequence, 5) + + def testFromGeneratorUsingList(self): + generator = lambda: [[i] * i for i in range(1, 100)] + elem_sequence = list(generator()) + self._testFromGenerator(generator, elem_sequence, 1) + self._testFromGenerator(generator, elem_sequence, 5) + + def testFromGeneratorUsingNdarray(self): + generator = lambda: np.arange(100, dtype=np.int64) + elem_sequence = list(generator()) + self._testFromGenerator(generator, elem_sequence, 1) + self._testFromGenerator(generator, elem_sequence, 5) + + def testFromGeneratorUsingGeneratorExpression(self): + # NOTE(mrry): Generator *expressions* are not repeatable (or in + # general reusable), because they eagerly evaluate the `for` + # expression as `iter(range(1, 100))` and discard the means of + # reconstructing `range(1, 100)`. Wrapping the generator + # expression in a `lambda` makes it repeatable. + generator = lambda: ([i] * i for i in range(1, 100)) + elem_sequence = list(generator()) + self._testFromGenerator(generator, elem_sequence, 1) + self._testFromGenerator(generator, elem_sequence, 5) + + def testFromMultipleConcurrentGenerators(self): + num_inner_repeats = 5 + num_outer_repeats = 100 + + def generator(): + for i in range(1, 10): + yield ([i] * i, [i, i ** 2, i ** 3]) + input_list = list(generator()) + + # The interleave transformation is essentially a flat map that + # draws from multiple input datasets concurrently (in a cyclic + # fashion). By placing `Dataset.from_generator()` inside an + # interleave, we test its behavior when multiple iterators are + # active at the same time; by additionally prefetching inside the + # interleave, we create the possibility of parallel (modulo GIL) + # invocations to several iterators created by the same dataset.
+ def interleave_fn(_): + return (dataset_ops.Dataset.from_generator( + generator, output_types=(dtypes.int64, dtypes.int64), + output_shapes=([None], [3])) + .repeat(num_inner_repeats).prefetch(5)) + + iterator = ( + dataset_ops.Dataset.range(num_outer_repeats) + .interleave(interleave_fn, cycle_length=10, + block_length=len(input_list)) + .make_initializable_iterator()) + init_op = iterator.initializer + get_next = iterator.get_next() + + with self.test_session() as sess: + sess.run(init_op) + for _ in range(num_inner_repeats * num_outer_repeats): + for elem in input_list: + val0, val1 = sess.run(get_next) + self.assertAllEqual(elem[0], val0) + self.assertAllEqual(elem[1], val1) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testFromGeneratorsRunningInParallel(self): + num_parallel_iterators = 3 + + # Define shared state that multiple iterator instances will access to + # demonstrate their concurrent activity. + lock = threading.Lock() + condition = threading.Condition(lock) + next_ticket = [0] # GUARDED_BY(lock) + + def generator(): + # NOTE(mrry): We yield one element before the barrier, because + # the current implementation of `Dataset.interleave()` must + # fetch one element from each incoming dataset to start the + # prefetching. + yield 0 + + # Define a barrier that `num_parallel_iterators` iterators must enter + # before any can proceed. Demonstrates that multiple iterators may be + # active at the same time. + condition.acquire() + ticket = next_ticket[0] + next_ticket[0] += 1 + if ticket == num_parallel_iterators - 1: + # The last iterator to join the barrier notifies the others. + condition.notify_all() + else: + # Wait until the last iterator enters the barrier. + while next_ticket[0] < num_parallel_iterators: + condition.wait() + condition.release() + + yield 1 + + # As in `testFromMultipleConcurrentGenerators()`, we use a combination of + # `Dataset.interleave()` and `Dataset.prefetch()` to cause multiple + # iterators to be active concurrently. 
+ def interleave_fn(_): + return dataset_ops.Dataset.from_generator( + generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2) + + iterator = ( + dataset_ops.Dataset.range(num_parallel_iterators) + .interleave( + interleave_fn, cycle_length=num_parallel_iterators, block_length=1) + .make_initializable_iterator()) + init_op = iterator.initializer + get_next = iterator.get_next() + + with self.test_session() as sess: + sess.run(init_op) + for elem in [0, 1]: + for _ in range(num_parallel_iterators): + self.assertAllEqual(elem, sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testFromGeneratorTypeError(self): + def generator(): + yield np.array([1, 2, 3], dtype=np.int64) + yield np.array([4, 5, 6], dtype=np.int64) + yield "ERROR" + yield np.array([7, 8, 9], dtype=np.int64) + + iterator = (dataset_ops.Dataset.from_generator( + generator, output_types=dtypes.int64, output_shapes=[3]) + .make_initializable_iterator()) + init_op = iterator.initializer + get_next = iterator.get_next() + + with self.test_session() as sess: + sess.run(init_op) + self.assertAllEqual([1, 2, 3], sess.run(get_next)) + self.assertAllEqual([4, 5, 6], sess.run(get_next)) + with self.assertRaisesOpError(r"element of type .*int64.* was expected"): + sess.run(get_next) + self.assertAllEqual([7, 8, 9], sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testFromGeneratorShapeError(self): + def generator(): + yield np.array([1, 2, 3], dtype=np.int64) + yield np.array([4, 5, 6], dtype=np.int64) + yield np.array([7, 8, 9, 10], dtype=np.int64) + yield np.array([11, 12, 13], dtype=np.int64) + + iterator = (dataset_ops.Dataset.from_generator( + generator, output_types=dtypes.int64, output_shapes=[3]) + .make_initializable_iterator()) + init_op = iterator.initializer + get_next = iterator.get_next() + + with self.test_session() as sess: + sess.run(init_op) + self.assertAllEqual([1, 2, 3], sess.run(get_next)) + self.assertAllEqual([4, 5, 6], sess.run(get_next)) + with self.assertRaisesOpError(r"element of shape \(3,\) was expected"): + sess.run(get_next) + self.assertAllEqual([11, 12, 13], sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index f49350505ae..8afd122d82d 100644 --- a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -24,6 +24,7 @@ py_library( "//tensorflow/python:random_ops", "//tensorflow/python:random_seed", "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:script_ops", "//tensorflow/python:sparse_tensor", "//tensorflow/python:tensor_shape", "//tensorflow/python:tensor_util", diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py index 6ef960037f0..ed3359730c4 100644 --- a/tensorflow/contrib/data/python/ops/dataset_ops.py +++ b/tensorflow/contrib/data/python/ops/dataset_ops.py @@ -18,6 +18,8 @@ from __future__ import division from __future__ import print_function import abc +import collections +import threading import warnings import numpy as np @@ -40,6 +42,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import script_ops from 
tensorflow.python.platform import gfile @@ -559,6 +562,168 @@ class Dataset(object): """ return SparseTensorSliceDataset(sparse_tensor) + class _GeneratorState(object): + """Stores outstanding iterators created from a Python generator. + + This class keeps track of potentially multiple iterators that may have + been created from a generator, e.g. in the case that the dataset is + repeated, or nested within a parallel computation. + """ + + def __init__(self, generator): + self._generator = generator + self._lock = threading.Lock() + self._next_id = 0 # GUARDED_BY(self._lock) + self._iterators = collections.defaultdict(lambda: iter(generator())) + + def get_next_id(self): + with self._lock: + ret = self._next_id + self._next_id += 1 + return ret + + def get_iterator(self, iterator_id): + return self._iterators[iterator_id] + + def iterator_completed(self, iterator_id): + del self._iterators[iterator_id] + + @staticmethod + def from_generator(generator, output_types, output_shapes=None): + """Creates a `Dataset` whose elements are generated by `generator`. + + The `generator` argument must be a callable object that returns + an object that supports the `iter()` protocol (e.g. a generator function). + The elements generated by `generator` must be compatible with the given + `output_types` and (optional) `output_shapes` arguments. + + Args: + generator: A callable object that takes no arguments and returns an + object that supports the `iter()` protocol. + output_types: A nested structure of `tf.DType` objects corresponding to + each component of an element yielded by `generator`. + output_shapes: (Optional.) A nested structure of `tf.TensorShape` + objects corresponding to each component of an element yielded by + `generator`. + + Returns: + A `Dataset`. + """ + if not callable(generator): + raise TypeError("`generator` must be callable.") + if output_shapes is None: + output_shapes = nest.map_structure( + lambda _: tensor_shape.TensorShape(None), output_types) + else: + output_shapes = nest.map_structure_up_to( + output_types, tensor_shape.as_shape, output_shapes) + + flattened_types = nest.flatten(output_types) + flattened_shapes = nest.flatten(output_shapes) + + generator_state = Dataset._GeneratorState(generator) + + def get_iterator_id_map_fn(unused_dummy): + """Creates a unique `iterator_id` for each pass over the dataset. + + The "iterator_id" disambiguates between multiple concurrently + existing iterators. + + Args: + unused_dummy: Ignored value. + + Returns: + A `tf.int64` tensor whose value uniquely identifies an iterator in + `generator_state`. + """ + return script_ops.py_func( + generator_state.get_next_id, [], dtypes.int64, stateful=True) + + def generator_map_fn(iterator_id_t): + """Generates the next element from iterator with ID `iterator_id_t`. + + We map this function across an infinite repetition of the + `iterator_id_t`, and raise `StopIteration` to terminate the iteration. + + Args: + iterator_id_t: A `tf.int64` tensor whose value uniquely identifies + the iterator in `generator_state` from which to generate an element. + + Returns: + A nested structure of tensors representing an element from the iterator.
+ """ + def generator_py_func(iterator_id): + """A `py_func` that will be called to invoke the iterator.""" + try: + values = next(generator_state.get_iterator(iterator_id)) + except StopIteration: + generator_state.iterator_completed(iterator_id) + raise StopIteration("Iteration finished.") + + # Use the same _convert function from the py_func() implementation to + # convert the returned values to arrays early, so that we can inspect + # their values. + # pylint: disable=protected-access + ret_arrays = [script_ops.FuncRegistry._convert(ret) + for ret in nest.flatten_up_to(output_types, values)] + # pylint: enable=protected-access + + # Additional type and shape checking to ensure that the components + # of the generated element match the `output_types` and `output_shapes` + # arguments. + for (ret_array, expected_dtype, expected_shape) in zip( + ret_arrays, flattened_types, flattened_shapes): + if ret_array.dtype != expected_dtype.as_numpy_dtype: + raise TypeError( + "`generator` yielded an element of type %s where an element " + "of type %s was expected." + % (ret_array.dtype, expected_dtype.as_numpy_dtype)) + if not expected_shape.is_compatible_with(ret_array.shape): + raise ValueError( + "`generator` yielded an element of shape %s where an element " + "of shape %s was expected." % (ret_array.shape, expected_shape)) + + return ret_arrays + + flat_values = script_ops.py_func( + generator_py_func, [iterator_id_t], flattened_types, stateful=True) + + # The `py_func()` op drops the inferred shapes, so we add them back in + # here. + if output_shapes is not None: + for ret_t, shape in zip(flat_values, flattened_shapes): + ret_t.set_shape(shape) + + return nest.pack_sequence_as(output_types, flat_values) + + # This function associates each traversal of `generator` with a unique + # iterator ID. + def flat_map_fn(iterator_id_t): + # First, generate an infinite dataset containing the iterator ID repeated + # forever. + repeated_id = Dataset.from_tensors(iterator_id_t).repeat(None) + + # The `generator_map_fn` gets the next element from the iterator with the + # relevant ID, and raises StopIteration when that iterator contains no + # more elements. + return repeated_id.map(generator_map_fn) + + # A single-element dataset that, each time it is evaluated, contains a + # freshly-generated and unique (for the returned dataset) int64 + # ID that will be used to identify the appropriate Python state, which + # is encapsulated in `generator_state`, and captured in + # `get_iterator_id_map_fn`. + dummy = 0 + id_dataset = Dataset.from_tensors(dummy).map(get_iterator_id_map_fn) + + # A dataset that contains all of the elements generated by a + # single iterator created from `generator`, identified by the + # iterator ID contained in `id_dataset`. Lifting the iteration + # into a flat_map here enables multiple repetitions and/or nested + # versions of the returned dataset to be created, because it forces + # the generation of a new ID for each version. + return id_dataset.flat_map(flat_map_fn) + @staticmethod def range(*args): """Creates a `Dataset` of a step-separated range of values. @@ -1123,6 +1288,11 @@ class Dataset(object): } ``` + NOTE: The order of elements yielded by this transformation is + deterministic, as long as `map_func` is a pure function. If + `map_func` contains any stateful operations, the order in which + that state is accessed is undefined. 
+ Args: map_func: A function mapping a nested structure of tensors (having shapes and types defined by `self.output_shapes` and `self.output_types`) to a diff --git a/tensorflow/core/kernels/map_dataset_op.cc b/tensorflow/core/kernels/map_dataset_op.cc index 13a1ceaadff..bd6b0bce889 100644 --- a/tensorflow/core/kernels/map_dataset_op.cc +++ b/tensorflow/core/kernels/map_dataset_op.cc @@ -127,8 +127,16 @@ class MapDatasetOp : public UnaryDatasetOpKernel { opts.runner = ctx->runner(); // TODO(mrry): Avoid blocking a threadpool thread. We will need to // stack-rip the iterators and use async kernels. - return dataset()->captured_func_->Run(opts, args, out_tensors, - prefix()); + Status s = + dataset()->captured_func_->Run(opts, args, out_tensors, prefix()); + if (errors::IsOutOfRange(s)) { + // `f` may deliberately raise `errors::OutOfRange` to indicate + // that we should terminate the iteration early. + *end_of_sequence = true; + return Status::OK(); + } else { + return s; + } } private: From 573b303ac8204d626bee266798e1eb3df0fed491 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 18 Aug 2017 03:20:39 -0700 Subject: [PATCH 15/70] BUILD cleanup in tensorflow/core/kernels PiperOrigin-RevId: 165688864 --- tensorflow/core/kernels/BUILD | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 8d96999f3bb..d833ed9e38a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -145,9 +145,7 @@ cc_library( "concat_lib.h", "concat_lib_cpu.h", ], - deps = [ - "//third_party/eigen3", - ], + deps = ["//third_party/eigen3"], ) cc_library( @@ -229,8 +227,11 @@ cc_library( hdrs = ["ops_testutil.h"], deps = [ "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensor_testutil", "//tensorflow/core:test", ], @@ -251,9 +252,7 @@ cc_library( cc_library( name = "ops_util_hdrs", hdrs = ["ops_util.h"], - deps = [ - "//third_party/eigen3", - ], + deps = ["//third_party/eigen3"], ) cc_library( @@ -402,6 +401,7 @@ cc_library( "split_lib.h", ], deps = [ + "//tensorflow/core:framework_lite", "//third_party/eigen3", ], ) @@ -411,6 +411,7 @@ cc_library( hdrs = ["typed_queue.h"], deps = [ ":queue_base", + "//tensorflow/core:framework", ], ) @@ -461,6 +462,8 @@ cc_library( ], visibility = ["//tensorflow:__subpackages__"], deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib", ], ) @@ -488,6 +491,8 @@ cc_library( hdrs = ["image_resizer_state.h"], visibility = ["//visibility:private"], deps = [ + ":bounds_check", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//third_party/eigen3", ], @@ -799,6 +804,7 @@ tf_kernel_library( "tile_functor_gpu.cu.cc", ], prefix = "tile_ops", + textual_hdrs = ["tile_ops_gpu_impl.h"], deps = ARRAY_DEPS, ) @@ -1680,6 +1686,7 @@ cc_library( "conditional_accumulator_base_op.h", ], deps = [ + ":conditional_accumulator_base", ":fill_functor", ":typed_conditional_accumulator_base", ], @@ -3128,6 +3135,7 @@ tf_kernel_library( deps = [ "//tensorflow/core:framework", "//tensorflow/core:lib", + "//third_party/eigen3", ], ) @@ -3547,7 +3555,10 @@ cc_library( "smooth-hinge-loss.h", "squared-loss.h", ], - deps = ["//tensorflow/core:framework_headers_lib"], + deps = [ + "//tensorflow/core:framework_headers_lib", + "//tensorflow/core:lib", + 
], ) cc_test( From a6729325a3534ef4aeb2065be82bb2963b9b03de Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Fri, 18 Aug 2017 07:39:41 -0700 Subject: [PATCH 16/70] Deletes convert_n_to_eager_tensor. Moves convert_to_eager_tensor to constant_op. PiperOrigin-RevId: 165704074 --- tensorflow/python/eager/ops_test.py | 4 +- .../python/eager/python_eager_op_gen.cc | 4 +- tensorflow/python/eager/tensor.py | 2 - tensorflow/python/framework/constant_op.py | 23 +++++++-- tensorflow/python/framework/ops.py | 49 +++++-------------- 5 files changed, 34 insertions(+), 48 deletions(-) diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py index dee339f7f19..78ff2f67771 100644 --- a/tensorflow/python/eager/ops_test.py +++ b/tensorflow/python/eager/ops_test.py @@ -272,9 +272,7 @@ class TargetTest(test_util.TensorFlowTestCase): def testInvalidInputDataType(self): # Fill requires the first input to be an int32 tensor. - with self.assertRaisesRegexp( - TypeError, - 'Expected tensor with type tf.int32 not tf.int64'): + with self.assertRaisesRegexp(ValueError, 'int64'): array_ops.fill(tensor.Tensor([2], dtype=dtypes.int64), tensor.Tensor(1)) def testOutputOnHostMemory(self): diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc index 511ce82eeba..c46a3d8db37 100644 --- a/tensorflow/python/eager/python_eager_op_gen.cc +++ b/tensorflow/python/eager/python_eager_op_gen.cc @@ -624,8 +624,8 @@ void GenEagerPythonOp::AddEagerInputCasts() { const string fn = arg.number_attr().empty() ? "" : "n_"; const string dtype = python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes."); - strings::StrAppend(&result_, " ", param, " = _tensor.convert_", fn, - "to_eager_tensor(", param, ", ", dtype, ")\n"); + strings::StrAppend(&result_, " ", param, " = _ops.convert_", fn, + "to_tensor(", param, ", ", dtype, ")\n"); } } diff --git a/tensorflow/python/eager/tensor.py b/tensorflow/python/eager/tensor.py index 1c2f4d74c7c..69269d1975f 100644 --- a/tensorflow/python/eager/tensor.py +++ b/tensorflow/python/eager/tensor.py @@ -24,8 +24,6 @@ import numpy as np # ops.py. 
# pylint: disable=unused-import from tensorflow.python.framework.ops import _tensor_from_handle -from tensorflow.python.framework.ops import convert_n_to_eager_tensor -from tensorflow.python.framework.ops import convert_to_eager_tensor from tensorflow.python.framework.ops import EagerTensor as Tensor # pylint: enable=unused-import diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py index af3be7230c2..9de63607e12 100644 --- a/tensorflow/python/framework/constant_op.py +++ b/tensorflow/python/framework/constant_op.py @@ -41,6 +41,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from autograd import core as ag_core import numpy as np from tensorflow.core.framework import attr_value_pb2 @@ -66,13 +67,29 @@ def _eager_reshape(tensor, shape): def _eager_fill(dims, value): """Eager-only version of Fill op; requires value is an eager Tensor.""" attr_t = value.dtype.as_datatype_enum - dims = ops.convert_to_eager_tensor(dims, dtypes.int32) + dims = convert_to_eager_tensor(dims, dtypes.int32) inputs_flat = [dims, value] attrs = ("T", attr_t) result, = execute.execute("Fill", 1, inputs=inputs_flat, attrs=attrs) return result +def convert_to_eager_tensor(t, dtype=None): + """Converts the given `value` to an `EagerTensor`.""" + if isinstance(ag_core.getval(t), ops.EagerTensor): + if dtype is not None and t.dtype != dtype: + raise TypeError("Expected tensor with type %r not %r" % (dtype, t.dtype)) + return t + # Handle converting ResourceVariable to Tensor. + # TODO(josh11b): get rid of this explicit ugly conversion once we have a more + # general scheme in place. + try: + return t._dense_var_to_tensor(dtype=dtype, as_ref=False) # pylint: disable=protected-access + except AttributeError: + pass + return ops.EagerTensor(t, dtype=dtype) + + def constant(value, dtype=None, shape=None, name="Const", verify_shape=False): """Creates a constant tensor. @@ -123,8 +140,8 @@ def constant(value, dtype=None, shape=None, name="Const", verify_shape=False): """ if not context.in_graph_mode(): if shape is None: - return ops.convert_to_eager_tensor(value, dtype) - t = ops.convert_to_eager_tensor(value, dtype) + return convert_to_eager_tensor(value, dtype) + t = convert_to_eager_tensor(value, dtype) shape = tensor_shape.as_shape(shape) if shape == t.shape: return t diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 862dd706f41..6f1954537ec 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -876,29 +876,6 @@ class EagerTensor(Tensor): raise NotImplementedError("eval not supported for Eager Tensors.") -# TODO(josh11b): Support other cases like converting TensorShape, lists/tuples and -# other custom conversion functions. -def convert_to_eager_tensor(t, dtype=None): - """Converts the given `value` to an `EagerTensor`.""" - if isinstance(ag_core.getval(t), EagerTensor): - if dtype is not None and t.dtype != dtype: - raise TypeError("Expected tensor with type %r not %r" % (dtype, t.dtype)) - return t - # Handle converting ResourceVariable to Tensor. - # TODO(josh11b): get rid of this explicit ugly conversion once we have a more - # general scheme in place. 
- try: - return t._dense_var_to_tensor(dtype=dtype, as_ref=False) # pylint: disable=protected-access - except AttributeError: - pass - return EagerTensor(t, dtype=dtype) - - -def convert_n_to_eager_tensor(values, dtype): - """Converts the given `values` to a list of `EagerTensor`.""" - return [convert_to_eager_tensor(t, dtype) for t in values] - - def _tensor_from_handle(handle): """'Private' constructor for the Tensor object. @@ -1112,21 +1089,17 @@ def internal_convert_n_to_tensor(values, """ if not isinstance(values, collections.Sequence): raise TypeError("values must be a list.") - if context.in_graph_mode(): - ret = [] - for i, value in enumerate(values): - n = None if name is None else "%s_%d" % (name, i) - ret.append( - internal_convert_to_tensor( - value, - dtype=dtype, - name=n, - as_ref=as_ref, - preferred_dtype=preferred_dtype)) - return ret - else: - # TODO(josh11b): handle preferred_dtype, as_ref - return convert_n_to_eager_tensor(values, dtype=dtype) + ret = [] + for i, value in enumerate(values): + n = None if name is None else "%s_%d" % (name, i) + ret.append( + internal_convert_to_tensor( + value, + dtype=dtype, + name=n, + as_ref=as_ref, + preferred_dtype=preferred_dtype)) + return ret def convert_n_to_tensor(values, dtype=None, name=None, preferred_dtype=None): From 7d01f89cc3a05fbd4d79dd5713b9856a8e2764e1 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Fri, 18 Aug 2017 09:32:30 -0700 Subject: [PATCH 17/70] Android demo app for speech recognition PiperOrigin-RevId: 165714459 --- WORKSPACE | 10 + tensorflow/contrib/makefile/Makefile | 19 +- .../contrib/makefile/download_dependencies.sh | 2 + tensorflow/contrib/makefile/tf_op_files.txt | 8 + tensorflow/core/BUILD | 2 + tensorflow/core/kernels/BUILD | 14 + .../docs_src/tutorials/audio_recognition.md | 47 ++- .../examples/android/AndroidManifest.xml | 10 + tensorflow/examples/android/BUILD | 1 + tensorflow/examples/android/README.md | 116 +++--- .../examples/android/download-models.gradle | 3 +- .../examples/android/res/drawable/border.xml | 19 + .../android/res/layout/activity_speech.xml | 55 +++ .../android/res/layout/list_text_item.xml | 25 ++ .../android/res/values/base-strings.xml | 1 + .../tensorflow/demo/RecognizeCommands.java | 186 +++++++++ .../org/tensorflow/demo/SpeechActivity.java | 353 ++++++++++++++++++ 17 files changed, 807 insertions(+), 64 deletions(-) create mode 100644 tensorflow/examples/android/res/drawable/border.xml create mode 100644 tensorflow/examples/android/res/layout/activity_speech.xml create mode 100644 tensorflow/examples/android/res/layout/list_text_item.xml create mode 100644 tensorflow/examples/android/src/org/tensorflow/demo/RecognizeCommands.java create mode 100644 tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java diff --git a/WORKSPACE b/WORKSPACE index 959587387ee..5e9b991fcca 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -80,3 +80,13 @@ new_http_archive( "http://download.tensorflow.org/models/stylize_v1.zip", ], ) + +new_http_archive( + name = "speech_commands", + build_file = "models.BUILD", + sha256 = "c3ec4fea3158eb111f1d932336351edfe8bd515bb6e87aad4f25dbad0a600d0c", + urls = [ + "http://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip", + "http://download.tensorflow.org/models/speech_commands_v0.01.zip", + ], +) diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index a4f7453ed5c..f8837e3f586 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -73,8 
+73,9 @@ HOST_INCLUDES := \ -I. \ -I$(MAKEFILE_DIR)/downloads/ \ -I$(MAKEFILE_DIR)/downloads/eigen \ - -I$(MAKEFILE_DIR)/downloads/gemmlowp \ +-I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ +-I$(MAKEFILE_DIR)/downloads/fft2d \ -I$(HOST_GENDIR) ifeq ($(HAS_GEN_HOST_PROTOC),true) HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include @@ -156,6 +157,7 @@ INCLUDES := \ -I$(MAKEFILE_DIR)/downloads/eigen \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ +-I$(MAKEFILE_DIR)/downloads/fft2d \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) ifeq ($(HAS_GEN_HOST_PROTOC),true) @@ -237,6 +239,7 @@ ifeq ($(TARGET),ANDROID) $(error "NDK_ROOT is not defined.") endif CXX := $(CC_PREFIX) $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/$(OS_PATH)-x86_64/bin/arm-linux-androideabi-g++ + CC := $(CC_PREFIX) $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/$(OS_PATH)-x86_64/bin/arm-linux-androideabi-gcc CXXFLAGS +=\ --sysroot $(NDK_ROOT)/platforms/android-21/arch-arm \ -Wno-narrowing \ @@ -244,7 +247,6 @@ ifeq ($(TARGET),ANDROID) -mfloat-abi=softfp \ -mfpu=neon \ -fPIE - INCLUDES = \ -I$(NDK_ROOT)/sources/android/support/include \ -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/include \ @@ -254,6 +256,7 @@ ifeq ($(TARGET),ANDROID) -I$(MAKEFILE_DIR)/downloads/eigen \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ +-I$(MAKEFILE_DIR)/downloads/fft2d \ -I$(MAKEFILE_DIR)/gen/protobuf/include \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) @@ -507,6 +510,7 @@ $(wildcard tensorflow/core/grappler/clusters/single_machine.*) TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS)) # Add in any extra files that don't fit the patterns easily TF_CC_SRCS += tensorflow/core/platform/default/gpu_tracer.cc +TF_CC_SRCS += tensorflow/contrib/makefile/downloads/fft2d/fftsg.c # Also include the op and kernel definitions. TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt) PBT_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_pb_text_files.txt) @@ -529,7 +533,8 @@ tensorflow/core/kernels/hexagon/hexagon_remote_fused_graph_executor_build.cc endif # File names of the intermediate files target compilation generates. -TF_CC_OBJS := $(addprefix $(OBJDIR), $(TF_CC_SRCS:.cc=.o)) +TF_CC_OBJS := $(addprefix $(OBJDIR), \ +$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(TF_CC_SRCS)))) PBT_GEN_FILES := $(addprefix $(PBTGENDIR), $(PBT_CC_SRCS)) PBT_OBJS := $(addprefix $(OBJDIR), $(PBT_CC_SRCS:.cc=.o)) PROTO_CC_SRCS := $(addprefix $(PROTOGENDIR), $(PROTO_SRCS:.proto=.pb.cc)) @@ -567,6 +572,14 @@ $(OBJDIR)%.o: %.cc | $(PBT_GEN_FILES) $(CXX) $(CXXFLAGS) $(DEPFLAGS) $(INCLUDES) -c $< -o $@ @mv -f $(DEPDIR)/$*.Td $(DEPDIR)/$*.d +# Matches on plain C files. +$(OBJDIR)%.o: %.c + @mkdir -p $(dir $@) + @mkdir -p $(dir $(DEPDIR)$*) + $(CXX) $(patsubst --std=c++11,--std=c99, $(CXXFLAGS)) -x c $(DEPFLAGS) \ +$(INCLUDES) -c $< -o $@ + @mv -f $(DEPDIR)/$*.Td $(DEPDIR)/$*.d + # Compiles C++ source files that have been generated by protoc. 
$(OBJDIR)%.pb.o: $(PROTOGENDIR)%.pb.cc @mkdir -p $(dir $@) diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index bb30a3b5a7b..1e9958584c9 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -25,6 +25,7 @@ GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.g NSYNC_URL="$(grep -o 'http.*github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" PROTOBUF_URL="$(grep -o 'http.*github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" RE2_URL="$(grep -o 'http.*github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" +FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64, # so work around it by patching the source. @@ -60,6 +61,7 @@ download_and_extract "${GOOGLETEST_URL}" "${DOWNLOADS_DIR}/googletest" download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync" download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf" download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2" +download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d" replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \ "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h" diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index 9132a4344bf..a7f2be9790d 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -38,6 +38,8 @@ tensorflow/core/kernels/stack_ops.cc tensorflow/core/kernels/split_op.cc tensorflow/core/kernels/split_v_op.cc tensorflow/core/kernels/split_lib_cpu.cc +tensorflow/core/kernels/spectrogram_op.cc +tensorflow/core/kernels/spectrogram.cc tensorflow/core/kernels/sparse_to_dense_op.cc tensorflow/core/kernels/sparse_matmul_op.cc tensorflow/core/kernels/softsign_op.cc @@ -100,6 +102,10 @@ tensorflow/core/kernels/mirror_pad_op_cpu_impl_2.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_3.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_4.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_5.cc +tensorflow/core/kernels/mfcc_op.cc +tensorflow/core/kernels/mfcc_mel_filterbank.cc +tensorflow/core/kernels/mfcc_dct.cc +tensorflow/core/kernels/mfcc.cc tensorflow/core/kernels/maxpooling_op.cc tensorflow/core/kernels/matmul_op.cc tensorflow/core/kernels/lrn_op.cc @@ -117,6 +123,7 @@ tensorflow/core/kernels/fill_functor.cc tensorflow/core/kernels/fifo_queue.cc tensorflow/core/kernels/fake_quant_ops.cc tensorflow/core/kernels/example_parsing_ops.cc +tensorflow/core/kernels/encode_wav_op.cc tensorflow/core/kernels/dynamic_stitch_op.cc tensorflow/core/kernels/dynamic_partition_op.cc tensorflow/core/kernels/decode_bmp_op.cc @@ -124,6 +131,7 @@ tensorflow/core/kernels/depthtospace_op.cc tensorflow/core/kernels/spacetodepth_op.cc tensorflow/core/kernels/dense_update_ops.cc tensorflow/core/kernels/deep_conv2d.cc +tensorflow/core/kernels/decode_wav_op.cc tensorflow/core/kernels/xsmm_conv2d.cc tensorflow/core/kernels/cwise_ops_common.cc tensorflow/core/kernels/cwise_op_tanh.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index f7b79e82e16..54f2ff7e132 100644 --- a/tensorflow/core/BUILD +++ 
b/tensorflow/core/BUILD @@ -981,6 +981,8 @@ cc_library( deps = [ ":protos_cc", "//third_party/eigen3", + "//third_party/fft2d:fft2d_headers", + "@fft2d//:fft2d", "@gemmlowp//:gemmlowp", "@nsync//:nsync_cpp", ], diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index d833ed9e38a..9f638eebee4 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -4322,6 +4322,9 @@ filegroup( "gemm_functors.h", "image_resizer_state.h", "maxpooling_op.h", + "mfcc.h", + "mfcc_dct.h", + "mfcc_mel_filterbank.h", "mirror_pad_op.h", "mirror_pad_op_cpu_impl.h", "pad_op.h", @@ -4338,6 +4341,7 @@ filegroup( "softsign_op.h", "spacetobatch_functor.h", "spacetodepth_op.h", + "spectrogram.h", "tensor_array.h", "tile_functor.h", "tile_ops_cpu_impl.h", @@ -4411,10 +4415,12 @@ filegroup( "cwise_op_squared_difference.cc", "cwise_op_sub.cc", "cwise_op_tanh.cc", + "decode_wav_op.cc", "deep_conv2d.cc", "deep_conv2d.h", "depthwise_conv_op.cc", "dynamic_partition_op.cc", + "encode_wav_op.cc", "fake_quant_ops.cc", "fifo_queue.cc", "fused_batch_norm_op.cc", @@ -4443,6 +4449,10 @@ filegroup( "logging_ops.cc", "lrn_op.cc", "maxpooling_op.cc", + "mfcc.cc", + "mfcc_dct.cc", + "mfcc_mel_filterbank.cc", + "mfcc_op.cc", "mirror_pad_op.cc", "mirror_pad_op_cpu_impl_1.cc", "mirror_pad_op_cpu_impl_2.cc", @@ -4478,6 +4488,8 @@ filegroup( "spacetobatch_op.cc", "spacetodepth_op.cc", "sparse_to_dense_op.cc", + "spectrogram.cc", + "spectrogram_op.cc", "stack_ops.cc", "string_join_op.cc", "summary_op.cc", @@ -4614,6 +4626,8 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", "//tensorflow/core:protos_cc", "//third_party/eigen3", + "//third_party/fft2d:fft2d_headers", + "@fft2d//:fft2d", "@gemmlowp//:gemmlowp", ], alwayslink = 1, diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md index 57d3ebb9968..2caa3ec0d2d 100644 --- a/tensorflow/docs_src/tutorials/audio_recognition.md +++ b/tensorflow/docs_src/tutorials/audio_recognition.md @@ -214,6 +214,41 @@ of the other .wav files in that same folder to see how well it does. The scores are between zero and one, and higher values mean the model is more confident in its prediction. +## Running the Model in an Android App + +The easiest way to see how this model works in a real application is to download +[the prebuilt Android demo +applications](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#prebuilt-components) +and install them on your phone. You'll see 'TF Speech' appear in your app list, +and opening it will show you the same list of action words we've just trained +our model on, starting with "Yes" and "No". Once you've given the app permission +to use the microphone, you should be able to try saying those words and see them +highlighted in the UI when the model recognizes one of them. + +You can also build this application yourself, since it's open source and +[available as part of the TensorFlow repository on +github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#building-in-android-studio-using-the-tensorflow-aar-from-jcenter). +By default it downloads [a pretrained model from +tensorflow.org](http://download.tensorflow.org/models/speech_commands_v0.01.zip), +but you can easily [replace it with a model you've trained +yourself](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-model-files-optional). 
+If you do this, you'll need to make sure that the constants in [the main +SpeechActivity Java source +file](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java) +like `SAMPLE_RATE` and `SAMPLE_DURATION` match any changes you've made to the +defaults while training. You'll also see that there's a [Java version of the +RecognizeCommands +module](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android/src/org/tensorflow/demo/RecognizeCommands.java) +that's very similar to the C++ version in this tutorial. If you've tweaked +parameters for that, you can also update them in SpeechActivity to get the same +results as in your server testing. + +The demo app updates its UI list of results automatically based on the labels +text file you copy into assets alongside your frozen graph, which means you can +easily try out different models without needing to make any code changes. You +will need to update `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files +you've added if you change the paths though. + ## How does this Model Work? The architecture used in this tutorial is based on some described in the paper @@ -341,13 +376,14 @@ aren't detected (high precision). The numbers from the tool give you an idea of how your model will perform in an application, and you can try tweaking the signal averaging parameters to tune it to give the kind of performance you want. To understand what the right parameters are for your application, you can look -at generating an [ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) -to help you understand the tradeoffs. +at generating an [ROC +curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) to help +you understand the tradeoffs. ## RecognizeCommands -The streaming accuracy tool uses a simple decoder contained in a small -C++ class called +The streaming accuracy tool uses a simple decoder contained in a small C++ class +called [RecognizeCommands](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands/recognize_commands.h). This class is fed the output of running the TensorFlow model over time, it averages the signals, and returns information about a label when it has enough @@ -480,7 +516,8 @@ variations in starting time in the training data, and is controlled with the `--time_shift_ms` flag, which defaults to 100ms. Increasing this value will provide more variation, but at the risk of cutting off important parts of the audio. A related way of augmenting the data with realistic distortions is by -using [time stretching and pitch scaling](https://en.wikipedia.org/wiki/Audio_time_stretching_and_pitch_scaling), +using [time stretching and pitch +scaling](https://en.wikipedia.org/wiki/Audio_time_stretching_and_pitch_scaling), but that's outside the scope of this tutorial.
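As a rough sketch of tuning the time-shift augmentation discussed above (assuming the tutorial's `train.py` script; the 200ms value is an arbitrary example):

```bash
# Widen the random time-shift window from the default 100ms to 200ms
# for more variation in where each training sample starts.
python tensorflow/examples/speech_commands/train.py --time_shift_ms=200
```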
## Customizing the Model diff --git a/tensorflow/examples/android/AndroidManifest.xml b/tensorflow/examples/android/AndroidManifest.xml index 9f229d8b9d4..bb75431a1f8 100644 --- a/tensorflow/examples/android/AndroidManifest.xml +++ b/tensorflow/examples/android/AndroidManifest.xml @@ -22,6 +22,7 @@ + + + + + + + + diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD index 2d3b0911fce..2347e6b0231 100644 --- a/tensorflow/examples/android/BUILD +++ b/tensorflow/examples/android/BUILD @@ -93,6 +93,7 @@ filegroup( srcs = [ "@inception5h//:model_files", "@mobile_ssd//:model_files", + "@speech_commands//:model_files", "@stylize//:model_files", ], ) diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md index f9881287cdf..883f8e664fd 100644 --- a/tensorflow/examples/android/README.md +++ b/tensorflow/examples/android/README.md @@ -8,10 +8,11 @@ devices. The demos in this folder are designed to give straightforward samples of using TensorFlow in mobile applications. -Inference is done using the [TensorFlow Android Inference Interface](../../../tensorflow/contrib/android), -which may be built separately if you want a standalone library to drop into your -existing application. Object tracking and efficient YUV -> RGB conversion are -handled by `libtensorflow_demo.so`. +Inference is done using the [TensorFlow Android Inference +Interface](../../../tensorflow/contrib/android), which may be built separately +if you want a standalone library to drop into your existing application. Object +tracking and efficient YUV -> RGB conversion are handled by +`libtensorflow_demo.so`. A device running Android 5.0 (API 21) or higher is required to run the demo due to the use of the camera2 API, although the native libraries themselves can run @@ -33,6 +34,12 @@ on API >= 14 devices. Uses a model based on [A Learned Representation For Artistic Style](https://arxiv.org/abs/1610.07629) to restyle the camera preview image to that of a number of different artists. +4. [TF + Speech](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/SpeechActivity.java): + Runs a simple speech recognition model built by the [audio training + tutorial](https://www.tensorflow.org/tutorials/image_retraining). Listens + for a small set of words, and highlights them in the UI when they are + recognized. @@ -51,20 +58,22 @@ for more details. ## Running the Demo -Once the app is installed it can be started via the "TF Classify", "TF Detect" -and "TF Stylize" icons, which have the orange TensorFlow logo as their icon. +Once the app is installed it can be started via the "TF Classify", "TF Detect", +"TF Stylize", and "TF Speech" icons, which have the orange TensorFlow logo as +their icon. While running the activities, pressing the volume keys on your device will -toggle debug visualizations on/off, rendering additional info to the screen -that may be useful for development purposes. +toggle debug visualizations on/off, rendering additional info to the screen that +may be useful for development purposes. ## Building in Android Studio using the TensorFlow AAR from JCenter The simplest way to compile the demo app yourself, and try out changes to the -project code is to use AndroidStudio. Simply set this `android` directory as the project root. +project code is to use AndroidStudio. Simply set this `android` directory as the +project root. 
-Then edit the `build.gradle` file and change the value of `nativeBuildSystem` -to `'none'` so that the project is built in the simplest way possible: +Then edit the `build.gradle` file and change the value of `nativeBuildSystem` to +`'none'` so that the project is built in the simplest way possible: ```None def nativeBuildSystem = 'none' @@ -77,8 +86,8 @@ Note: Currently, in this build mode, YUV -> RGB is done using a less efficient Java implementation, and object tracking is not available in the "TF Detect" activity. Setting the build system to `'cmake'` currently only builds `libtensorflow_demo.so`, which provides fast YUV -> RGB conversion and object -tracking, while still acquiring TensorFlow support via the downloaded AAR, so -it may be a lightweight way to enable these features. +tracking, while still acquiring TensorFlow support via the downloaded AAR, so it +may be a lightweight way to enable these features. For any project that does not include custom low level TensorFlow code, this is likely sufficient. @@ -104,50 +113,51 @@ protobuf compilation. NOTE: Bazel does not currently support building for Android on Windows. Full support for gradle/cmake builds is coming soon, but in the meantime we suggest -that Windows users download the -[prebuilt binaries](https://ci.tensorflow.org/view/Nightly/job/nightly-android/) -instead. +that Windows users download the [prebuilt +binaries](https://ci.tensorflow.org/view/Nightly/job/nightly-android/) instead. ##### Install Bazel and Android Prerequisites -Bazel is the primary build system for TensorFlow. To build with Bazel, -it and the Android NDK and SDK must be installed on your system. +Bazel is the primary build system for TensorFlow. To build with Bazel, it and +the Android NDK and SDK must be installed on your system. -1. Install the latest version of Bazel as per the instructions [on the Bazel website](https://bazel.build/versions/master/docs/install.html). -2. The Android NDK is required to build the native (C/C++) TensorFlow code. - The current recommended version is 12b, which may be found - [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-12b-downloads). -3. The Android SDK and build tools may be obtained - [here](https://developer.android.com/tools/revisions/build-tools.html), - or alternatively as part of - [Android Studio](https://developer.android.com/studio/index.html). Build - tools API >= 23 is required to build the TF Android demo (though it will - run on API >= 21 devices). +1. Install the latest version of Bazel as per the instructions [on the Bazel + website](https://bazel.build/versions/master/docs/install.html). +2. The Android NDK is required to build the native (C/C++) TensorFlow code. The + current recommended version is 12b, which may be found + [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-12b-downloads). +3. The Android SDK and build tools may be obtained + [here](https://developer.android.com/tools/revisions/build-tools.html), or + alternatively as part of [Android + Studio](https://developer.android.com/studio/index.html). Build tools API >= + 23 is required to build the TF Android demo (though it will run on API >= 21 + devices). ##### Edit WORKSPACE -The Android entries in [`/WORKSPACE`](../../../WORKSPACE#L19-L36) -must be uncommented with the paths filled in appropriately depending on where -you installed the NDK and SDK. Otherwise an error such as: -"The external label '//external:android/sdk' is not bound to anything" will -be reported. 
+The Android entries in +[`/WORKSPACE`](../../../WORKSPACE#L19-L36) must be uncommented +with the paths filled in appropriately depending on where you installed the NDK +and SDK. Otherwise an error such as: "The external label +'//external:android/sdk' is not bound to anything" will be reported. -Also edit the API levels for the SDK in WORKSPACE to the highest level you -have installed in your SDK. This must be >= 23 (this is completely independent -of the API level of the demo, which is defined in AndroidManifest.xml). -The NDK API level may remain at 14. +Also edit the API levels for the SDK in WORKSPACE to the highest level you have +installed in your SDK. This must be >= 23 (this is completely independent of the +API level of the demo, which is defined in AndroidManifest.xml). The NDK API +level may remain at 14. ##### Install Model Files (optional) -The TensorFlow `GraphDef`s that contain the model definitions and weights -are not packaged in the repo because of their size. They are downloaded +The TensorFlow `GraphDef`s that contain the model definitions and weights are +not packaged in the repo because of their size. They are downloaded automatically and packaged with the APK by Bazel via a new_http_archive defined -in `WORKSPACE` during the build process, and by Gradle via download-models.gradle. +in `WORKSPACE` during the build process, and by Gradle via +download-models.gradle. -**Optional**: If you wish to place the models in your assets manually, -remove all of the `model_files` entries from the `assets` -list in `tensorflow_demo` found in the `[BUILD](BUILD)` file. Then download -and extract the archives yourself to the `assets` directory in the source tree: +**Optional**: If you wish to place the models in your assets manually, remove +all of the `model_files` entries from the `assets` list in `tensorflow_demo` +found in the `[BUILD](BUILD)` file. Then download and extract the archives +yourself to the `assets` directory in the source tree: ```bash BASE_URL=https://storage.googleapis.com/download.tensorflow.org/models @@ -162,27 +172,23 @@ This will extract the models and their associated metadata files to the local assets/ directory. If you are using Gradle, make sure to remove download-models.gradle reference -from build.gradle after your manually download models; otherwise gradle -might download models again and overwrite your models. +from build.gradle after you manually download models; otherwise gradle might +download models again and overwrite your models. ##### Build -After editing your WORKSPACE file to update the SDK/NDK configuration, -you may build the APK. Run this from your workspace root: +After editing your WORKSPACE file to update the SDK/NDK configuration, you may +build the APK. Run this from your workspace root: ```bash bazel build -c opt //tensorflow/examples/android:tensorflow_demo ``` -If you get build errors about protocol buffers, run -`git submodule update --init` and make sure that you've modified your WORKSPACE -file as instructed, then try building again.
- ##### Install -Make sure that adb debugging is enabled on your Android 5.0 (API 21) or -later device, then after building use the following command from your workspace -root to install the APK: +Make sure that adb debugging is enabled on your Android 5.0 (API 21) or later +device, then after building use the following command from your workspace root +to install the APK: ```bash adb install -r bazel-bin/tensorflow/examples/android/tensorflow_demo.apk diff --git a/tensorflow/examples/android/download-models.gradle b/tensorflow/examples/android/download-models.gradle index a19ca36d7f6..0e2cf65f538 100644 --- a/tensorflow/examples/android/download-models.gradle +++ b/tensorflow/examples/android/download-models.gradle @@ -11,7 +11,8 @@ // LINT.IfChange def models = ['inception5h.zip', 'object_detection/ssd_mobilenet_v1_android_export.zip', - 'stylize_v1.zip'] + 'stylize_v1.zip', + 'speech_commands_conv_actions.zip'] // LINT.ThenChange(//tensorflow/examples/android/BUILD) // Root URL for model archives diff --git a/tensorflow/examples/android/res/drawable/border.xml b/tensorflow/examples/android/res/drawable/border.xml new file mode 100644 index 00000000000..dd1d64d1d61 --- /dev/null +++ b/tensorflow/examples/android/res/drawable/border.xml @@ -0,0 +1,19 @@ + + + + + diff --git a/tensorflow/examples/android/res/layout/activity_speech.xml b/tensorflow/examples/android/res/layout/activity_speech.xml new file mode 100644 index 00000000000..2fe1338da57 --- /dev/null +++ b/tensorflow/examples/android/res/layout/activity_speech.xml @@ -0,0 +1,55 @@ + + + + + + + +