From ff88f4f51577c52bf3eb208837a674a45d7a92c9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 24 Jul 2017 17:37:55 -0700 Subject: [PATCH 01/56] Update ops-related pbtxt files. PiperOrigin-RevId: 163014080 --- tensorflow/core/ops/ops.pbtxt | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 1822fc1133d..ab984d2547f 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -499,7 +499,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -601,7 +601,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -1691,7 +1691,7 @@ op { } input_arg { name: "dimension" - description: "int32 or int64, 0 <= dimension < rank(input). Describes\nwhich dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0." + description: "int32 or int64, must be in the range `[-rank(input), rank(input))`.\nDescribes which dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0." type_attr: "Tidx" } output_arg { @@ -1757,7 +1757,7 @@ op { } input_arg { name: "dimension" - description: "int32 or int64, 0 <= dimension < rank(input). Describes\nwhich dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0." + description: "int32 or int64, must be in the range `[-rank(input), rank(input))`.\nDescribes which dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0." type_attr: "Tidx" } output_arg { @@ -5656,10 +5656,12 @@ op { name: "Cumprod" input_arg { name: "x" + description: "A `Tensor`. Must be one of the following types: `float32`, `float64`,\n`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,\n`complex128`, `qint8`, `quint8`, `qint32`, `half`." type_attr: "T" } input_arg { name: "axis" + description: "A `Tensor` of type `int32` (default: 0). Must be in the range\n`[-rank(x), rank(x))`." type_attr: "Tidx" } output_arg { @@ -5672,6 +5674,7 @@ op { default_value { b: false } + description: "If `True`, perform exclusive cumprod." } attr { name: "reverse" @@ -5679,6 +5682,7 @@ op { default_value { b: false } + description: "A `bool` (default: False)." } attr { name: "T" @@ -5722,10 +5726,12 @@ op { name: "Cumsum" input_arg { name: "x" + description: "A `Tensor`. Must be one of the following types: `float32`, `float64`,\n`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,\n`complex128`, `qint8`, `quint8`, `qint32`, `half`." type_attr: "T" } input_arg { name: "axis" + description: "A `Tensor` of type `int32` (default: 0). Must be in the range\n`[-rank(x), rank(x))`." type_attr: "Tidx" } output_arg { @@ -5738,6 +5744,7 @@ op { default_value { b: false } + description: "If `True`, perform exclusive cumsum." } attr { name: "reverse" @@ -5745,6 +5752,7 @@ op { default_value { b: false } + description: "A `bool` (default: False)." } attr { name: "T" @@ -7615,7 +7623,7 @@ op { } input_arg { name: "dim" - description: "0-D (scalar). Specifies the dimension index at which to\nexpand the shape of `input`." + description: "0-D (scalar). Specifies the dimension index at which to\nexpand the shape of `input`. 
Must be in the range\n`[-rank(input) - 1, rank(input)]`." type_attr: "Tdim" } output_arg { @@ -12325,7 +12333,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -13102,7 +13110,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -13293,7 +13301,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -15662,7 +15670,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { @@ -21708,7 +21716,7 @@ op { } input_arg { name: "axis" - description: "1-D. The indices of the dimensions to reverse." + description: "1-D. The indices of the dimensions to reverse. Must be in the range\n`[-rank(tensor), rank(tensor))`." type_attr: "Tidx" } output_arg { @@ -27260,7 +27268,7 @@ op { list { } } - description: "If specified, only squeezes the dimensions listed. The dimension\nindex starts at 0. It is an error to squeeze a dimension that is not 1." + description: "If specified, only squeezes the dimensions listed. The dimension\nindex starts at 0. It is an error to squeeze a dimension that is not 1. Must\nbe in the range `[-rank(input), rank(input))`." has_minimum: true } summary: "Removes dimensions of size 1 from the shape of a tensor." @@ -28250,7 +28258,7 @@ op { } input_arg { name: "reduction_indices" - description: "The dimensions to reduce." + description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`." type_attr: "Tidx" } output_arg { From 2c2277c9c08794655ec0e2ff08356693032dafaa Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 24 Jul 2017 17:45:35 -0700 Subject: [PATCH 02/56] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 163014834 --- tensorflow/go/op/wrappers.go | 60 +++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 10388509a9a..61c924ac22b 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -1130,7 +1130,8 @@ type SqueezeAttr func(optionalAttr) // SqueezeSqueezeDims sets the optional squeeze_dims attribute to value. // // value: If specified, only squeezes the dimensions listed. The dimension -// index starts at 0. It is an error to squeeze a dimension that is not 1. +// index starts at 0. It is an error to squeeze a dimension that is not 1. Must +// be in the range `[-rank(input), rank(input))`. // If not specified, defaults to <> // // REQUIRES: len(value) >= 0 @@ -12819,7 +12820,8 @@ func ReciprocalGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) { // // Arguments: // tensor: Up to 8-D. -// axis: 1-D. The indices of the dimensions to reverse. +// axis: 1-D. The indices of the dimensions to reverse. Must be in the range +// `[-rank(tensor), rank(tensor))`. // // Returns The same shape as `tensor`. 
func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) { @@ -16147,6 +16149,8 @@ func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf type CumprodAttr func(optionalAttr) // CumprodExclusive sets the optional exclusive attribute to value. +// +// value: If `True`, perform exclusive cumprod. // If not specified, defaults to false func CumprodExclusive(value bool) CumprodAttr { return func(m optionalAttr) { @@ -16155,6 +16159,8 @@ func CumprodExclusive(value bool) CumprodAttr { } // CumprodReverse sets the optional reverse attribute to value. +// +// value: A `bool` (default: False). // If not specified, defaults to false func CumprodReverse(value bool) CumprodAttr { return func(m optionalAttr) { @@ -16192,6 +16198,13 @@ func CumprodReverse(value bool) CumprodAttr { // ```python // tf.cumprod([a, b, c], exclusive=True, reverse=True) # => [b * c, c, 1] // ``` +// +// Arguments: +// x: A `Tensor`. Must be one of the following types: `float32`, `float64`, +// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, +// `complex128`, `qint8`, `quint8`, `qint32`, `half`. +// axis: A `Tensor` of type `int32` (default: 0). Must be in the range +// `[-rank(x), rank(x))`. func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) { if scope.Err() != nil { return @@ -16420,6 +16433,8 @@ func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, ma type CumsumAttr func(optionalAttr) // CumsumExclusive sets the optional exclusive attribute to value. +// +// value: If `True`, perform exclusive cumsum. // If not specified, defaults to false func CumsumExclusive(value bool) CumsumAttr { return func(m optionalAttr) { @@ -16428,6 +16443,8 @@ func CumsumExclusive(value bool) CumsumAttr { } // CumsumReverse sets the optional reverse attribute to value. +// +// value: A `bool` (default: False). // If not specified, defaults to false func CumsumReverse(value bool) CumsumAttr { return func(m optionalAttr) { @@ -16465,6 +16482,13 @@ func CumsumReverse(value bool) CumsumAttr { // ```python // tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] // ``` +// +// Arguments: +// x: A `Tensor`. Must be one of the following types: `float32`, `float64`, +// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, +// `complex128`, `qint8`, `quint8`, `qint32`, `half`. +// axis: A `Tensor` of type `int32` (default: 0). Must be in the range +// `[-rank(x), rank(x))`. func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) { if scope.Err() != nil { return @@ -18063,7 +18087,8 @@ func AnyKeepDims(value bool) AnyAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) { @@ -19213,7 +19238,8 @@ func ProdKeepDims(value bool) ProdAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. 
func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) { @@ -20258,7 +20284,8 @@ func MaxKeepDims(value bool) MaxAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Max(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MaxAttr) (output tf.Output) { @@ -20583,7 +20610,8 @@ func Sqrt(scope *Scope, x tf.Output) (y tf.Output) { // Arguments: // // dim: 0-D (scalar). Specifies the dimension index at which to -// expand the shape of `input`. +// expand the shape of `input`. Must be in the range +// `[-rank(input) - 1, rank(input)]`. // // Returns Contains the same data as `input`, but its shape has an additional // dimension of size 1 added. @@ -20623,7 +20651,8 @@ func AllKeepDims(value bool) AllAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AllAttr) (output tf.Output) { @@ -21665,8 +21694,8 @@ func ArgMinOutputType(value tf.DataType) ArgMinAttr { // // Arguments: // -// dimension: int32 or int64, 0 <= dimension < rank(input). Describes -// which dimension of the input Tensor to reduce across. For vectors, +// dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`. +// Describes which dimension of the input Tensor to reduce across. For vectors, // use dimension = 0. func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) { if scope.Err() != nil { @@ -22716,7 +22745,8 @@ func MeanKeepDims(value bool) MeanAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MeanAttr) (output tf.Output) { @@ -22856,7 +22886,8 @@ func MinKeepDims(value bool) MinAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. // // Returns The reduced tensor. func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MinAttr) (output tf.Output) { @@ -22914,8 +22945,8 @@ func ArgMaxOutputType(value tf.DataType) ArgMaxAttr { // // Arguments: // -// dimension: int32 or int64, 0 <= dimension < rank(input). Describes -// which dimension of the input Tensor to reduce across. For vectors, +// dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`. +// Describes which dimension of the input Tensor to reduce across. For vectors, // use dimension = 0. func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) { if scope.Err() != nil { @@ -24724,7 +24755,8 @@ func SumKeepDims(value bool) SumAttr { // // Arguments: // input: The tensor to reduce. -// reduction_indices: The dimensions to reduce. +// reduction_indices: The dimensions to reduce. Must be in the range +// `[-rank(input), rank(input))`. 
// // Returns The reduced tensor. func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...SumAttr) (output tf.Output) { From 73b120ea3b517b6af2267ca078bf571f966fd606 Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Mon, 24 Jul 2017 18:34:01 -0700 Subject: [PATCH 03/56] Removing session reset since destroying the session object would delete its variables as well. Resetting session might unintentionally close other sessions in the same process. PiperOrigin-RevId: 163019166 --- tensorflow/core/grappler/clusters/BUILD | 2 + .../core/grappler/clusters/single_machine.cc | 8 +- .../grappler/clusters/single_machine_test.cc | 121 ++++++++++++++++++ 3 files changed, 125 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD index 667023845cd..e7230b37543 100644 --- a/tensorflow/core/grappler/clusters/BUILD +++ b/tensorflow/core/grappler/clusters/BUILD @@ -114,7 +114,9 @@ cc_test( deps = [ ":single_machine", "//tensorflow/cc:cc_ops", + "//tensorflow/cc:resource_variable_ops", "//tensorflow/cc:scope", + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc index a1531f1cfcf..3481b2b158d 100644 --- a/tensorflow/core/grappler/clusters/single_machine.cc +++ b/tensorflow/core/grappler/clusters/single_machine.cc @@ -73,8 +73,6 @@ SingleMachine::~SingleMachine() { // when we delete the session. thread_pool_.reset(); - Reset(options_, {}).IgnoreError(); - CHECK(already_created); already_created = false; } @@ -277,11 +275,9 @@ Status SingleMachine::ResetSession() { // Make sure the session is properly closed TF_RETURN_IF_ERROR(Shutdown()); - // We need to Reset the session to ensure that all the variables are - // deleted. But first we need to delete the session since Reset() - // deletes some of the containers referenced by the session. + // Destroying the object deletes all its varibles as well. This is only true + // for DirectSession. session_.reset(); - TF_RETURN_IF_ERROR(Reset(options_, {})); } LOG(INFO) << "Starting new session"; diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc index b73b084793e..d7e2827afc9 100644 --- a/tensorflow/core/grappler/clusters/single_machine_test.cc +++ b/tensorflow/core/grappler/clusters/single_machine_test.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/core/grappler/clusters/single_machine.h" #include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/framework/cost_graph.pb.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -24,6 +27,7 @@ limitations under the License. 
#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/queue_runner.pb.h" namespace tensorflow { namespace grappler { @@ -349,6 +353,7 @@ TEST_F(SingleMachineTest, InitializationMemory) { } namespace { + template inline void SetNodeAttr(const string& key, const T& value, NodeDef* node) { AttrValue attr_value; @@ -463,6 +468,122 @@ TEST_F(SingleMachineTest, PersistentMemory) { EXPECT_TRUE(found_hashtable); } +namespace { + +SessionOptions GetSessionOption(int num_cpu_cores, int num_gpus) { + SessionOptions options; + // Copied from single_machine.h + (*options.config.mutable_device_count())["CPU"] = 1; + if (num_gpus > 0) { + (*options.config.mutable_device_count())["GPU"] = num_gpus; + } + CHECK_GE(num_cpu_cores, 1); + options.config.set_intra_op_parallelism_threads(num_cpu_cores); + options.config.add_session_inter_op_thread_pool()->set_num_threads( + num_cpu_cores); + return options; +} + +Status GetDeviceMemoryStats( + const SessionOptions& session_option, + std::unordered_map* allocator_stats_by_device) { + std::vector devices; + TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(session_option, + "" /* name_prefix */, &devices)); + allocator_stats_by_device->clear(); + for (Device* device : devices) { + AllocatorStats stats; + auto* allocator = device->GetAllocator(AllocatorAttributes()); + if (!allocator->TracksAllocationSizes()) { + return Status(error::INVALID_ARGUMENT, + "Tracking allocation is not enabled."); + } + allocator->GetStats(&stats); + (*allocator_stats_by_device)[device->name()] = stats; + } + return Status::OK(); +} + +} // namespace + +TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + // Add a variable and initializer. + Output a = ops::Variable(s.WithOpName("a"), TensorShape({128, 256}), + DataType::DT_FLOAT); + Output a_init = + ops::RandomNormal(s.WithOpName("a/init"), {128, 256}, DataType::DT_FLOAT); + Output a_init_assign = ops::Assign(s.WithOpName("a/init/assign"), a, a_init); + + // Add a resource variable. + Output b = + ops::VarHandleOp(s.WithOpName("b"), DataType::DT_FLOAT, {256, 512}); + Output b_read = + ops::ReadVariableOp(s.WithOpName("b/read"), b, DataType::DT_FLOAT); + Output b_init = + ops::RandomNormal(s.WithOpName("b/init"), {256, 512}, DataType::DT_FLOAT); + auto b_init_assign = + ops::AssignVariableOp(s.WithOpName("b/init/assign"), b, b_init); + + // Add a queue. + ops::FIFOQueue queue(s.WithOpName("queue"), {DataType::DT_STRING}); + Output some_string = + ops::Const(s.WithOpName("some_string"), string("nothing")); + ops::QueueEnqueue enqueue(s.WithOpName("enqueue"), queue, {some_string}); + ops::QueueDequeue dequeue(s.WithOpName("dequeue"), queue, + {DataType::DT_STRING}); + + // Add a IdentityReader. 
+ ops::IdentityReader reader(s.WithOpName("identity_reader")); + ops::ReaderRead read(s.WithOpName("read_from_queue"), reader, queue); + + Output var_mul = ops::MatMul(s.WithOpName("var_matmul"), a, b_read); + + GrapplerItem item; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + + QueueRunnerDef queue_runner; + queue_runner.set_queue_name("queue"); + *queue_runner.add_enqueue_op_name() = "enqueue"; + item.queue_runners.push_back(queue_runner); + + item.init_ops.push_back("a/init/assign"); + item.init_ops.push_back("b/init/assign"); + item.fetch.push_back("var_matmul"); + item.fetch.push_back("dequeue"); + + // Run the graph + TF_CHECK_OK(cluster_->Initialize(item)); + EnableCPUAllocatorStats(true); + + SessionOptions options = + GetSessionOption(3 /* cpu cores */, 0 /* num gpus */); + std::unordered_map device_memory_before; + TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_before)); + EXPECT_EQ(device_memory_before.size(), 1); + + RunMetadata metadata; + TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata)); + + // Check there is memory that is not released. + std::unordered_map device_memory; + TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory)); + EXPECT_EQ(device_memory.size(), 1); + EXPECT_GT(device_memory.begin()->second.bytes_in_use, 0); + + // Reset cluster_ would release all memory. + cluster_.reset(); + std::unordered_map device_memory_after; + TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_after)); + + // Check memory used by resources are released after cluster destruction. + EXPECT_EQ(device_memory_before.size(), 1); + EXPECT_EQ(device_memory_after.size(), 1); + EXPECT_EQ(device_memory_before.begin()->second.bytes_in_use, 0); + EXPECT_EQ(device_memory_after.begin()->second.bytes_in_use, 0); +} + } // namespace } // namespace grappler } // namespace tensorflow From d1a9ea61ef8271b3d2fe273a68ff5940fcba7ccd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 00:05:41 -0700 Subject: [PATCH 04/56] [XLA] Teach CPU and GPU compilers to optionally invoke the HLO insert-reduce-precision-operations pass. This also required a few additions and fixups. We add pieces to ReducePrecisionInsertion to translate between the protocol-buffer representation of the pass options and the predicate-function actually used in the pass. To facilitate this translation, we also add a function to HloOpcode to return the number of opcodes so that we can iterate over the whole set easily. 
PiperOrigin-RevId: 163037250 --- tensorflow/compiler/xla/service/BUILD | 1 + tensorflow/compiler/xla/service/cpu/BUILD | 1 + .../compiler/xla/service/cpu/cpu_compiler.cc | 18 ++++ tensorflow/compiler/xla/service/gpu/BUILD | 1 + .../compiler/xla/service/gpu/gpu_compiler.cc | 31 ++++++- tensorflow/compiler/xla/service/hlo_opcode.h | 5 ++ .../xla/service/reduce_precision_insertion.cc | 39 +++++++- .../xla/service/reduce_precision_insertion.h | 22 ++++- .../xla/tests/reduce_precision_test.cc | 90 ++++++++++++++++++- tensorflow/compiler/xla/xla.proto | 23 +++++ 10 files changed, 221 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 696dc285640..a4612bb6c12 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1945,6 +1945,7 @@ cc_library( ":buffer_liveness", ":hlo", ":hlo_pass", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 7248cb5f4c0..2ca4af67cd5 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -72,6 +72,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:inliner", + "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", # fixdeps: keep diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 6d819355c4c..b86342d0b3e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -74,6 +74,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/inliner.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -253,6 +254,14 @@ Status CpuCompiler::RunHloPasses(HloModule* module) { HloPassPipeline pipeline("CPU"); pipeline.AddInvariantChecker(); + for (const auto& reduce_precision_options : + module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::BEFORE_OP_FUSION) { + pipeline.AddPass(reduce_precision_options); + } + } + // TODO(b/35786417): Re-enable inliner pass after fixing the bug and deciding // where we will take this pass in future. 
// pipeline.AddPass(); @@ -278,6 +287,15 @@ Status CpuCompiler::RunHloPasses(HloModule* module) { TransposeFolding::NeverFoldTranspose); pipeline.AddPass(/*is_layout_sensitive=*/false); pipeline.AddPass(); + + for (const auto& reduce_precision_options : + module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::AFTER_OP_FUSION) { + pipeline.AddPass(reduce_precision_options); + } + } + pipeline.AddPass( module->mutable_entry_computation_layout()); // The LayoutAssignment pass may leave behind kCopy instructions which are diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index fa95e234992..cdd7c8187c9 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -432,6 +432,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_proto_util", "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index d60c45a5c3a..2acf95084a9 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -56,6 +56,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -123,6 +124,15 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, { HloPassPipeline pipeline("optimization"); pipeline.AddInvariantChecker(); + + for (const auto& reduce_precision_options : + hlo_module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::BEFORE_OP_FUSION) { + pipeline.AddPass(reduce_precision_options); + } + } + { auto& pass = pipeline.AddPass>("simplification"); @@ -149,8 +159,27 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, fusion.AddPass(/*may_duplicate=*/false); fusion.AddPass(/*may_duplicate=*/true); fusion.AddPass(); - return fusion.Run(hlo_module).status(); + TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + + HloPassPipeline reduce_pipeline("reduce-precision"); + for (const auto& reduce_precision_options : + hlo_module->config().debug_options().hlo_reduce_precision_options()) { + if (reduce_precision_options.pass_timing() == + HloReducePrecisionOptions::AFTER_OP_FUSION) { + reduce_pipeline.AddPass( + reduce_precision_options); + } + } + StatusOr reduce_result = reduce_pipeline.Run(hlo_module); + TF_RETURN_IF_ERROR(reduce_result.status()); + + if (reduce_result.ValueOrDie()) { + // Do another fusion pass, with the expectation that we may be able to + // fuse the new ReducePrecision operations. 
+ TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); + } } + return tensorflow::Status::OK(); } // Modifies the given HLO module so that it will be accepted by IrEmitter. diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 358e611d57f..8a6376b2d1c 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -112,6 +112,11 @@ bool HloOpcodeIsComparison(HloOpcode opcode); // Returns true iff the given opcode has variadic operands. bool HloOpcodeIsVariadic(HloOpcode opcode); +// Returns the number of HloOpcode values. +inline const uint32_t HloOpcodeCount() { + return static_cast(HloOpcode::kWhile) + 1; +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_OPCODE_H_ diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc index dafefdc4910..e083226b14d 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -30,14 +31,15 @@ StatusOr ReducePrecisionInsertion::Run(HloModule* module) { for (auto& instruction : computation->instructions()) { VLOG(3) << "Visited instruction: " << instruction->ToString(); - // For now, ReducePrecision is only implemented for F32 data, so this + // For now, ReducePrecision is only implemented for F32 arrays, so this // ignore instructions that produce other data. In particular, this // currently ignores instructions producing tuples, even if those tuples - // contain F32 data inside them. The assumption is that in most cases + // contain F32 arrays inside them. The assumption is that in most cases // equivalent behavior can be obtained by adding ReducePrecision - // instructions after the instructions that pull the F32 data out of the - // tuples. + // instructions after the instructions that pull the F32 arrays out of + // the tuples. if (instruction->shape().element_type() == PrimitiveType::F32 && + !ShapeUtil::IsScalar(instruction->shape()) && should_reduce_output_precision_(instruction->opcode())) { instructions_to_suffix.push_back(instruction.get()); } @@ -58,4 +60,33 @@ StatusOr ReducePrecisionInsertion::Run(HloModule* module) { return changed; } +ReducePrecisionInsertion::OpcodeFilterFunction +ReducePrecisionInsertion::make_filter_function( + const HloReducePrecisionOptions& reduce_precision_options) { + // Implement the filter function with a lookup table. 
+ std::vector filter(HloOpcodeCount(), false); + for (const auto& opcode : reduce_precision_options.opcodes_to_suffix()) { + filter[opcode] = true; + } + return [filter](const HloOpcode opcode) { + return filter[static_cast(opcode)]; + }; +} + +HloReducePrecisionOptions ReducePrecisionInsertion::make_options_proto( + const HloReducePrecisionOptions::PassTiming pass_timing, + const int exponent_bits, const int mantissa_bits, + const OpcodeFilterFunction& should_reduce_output_precision) { + HloReducePrecisionOptions options; + options.set_pass_timing(pass_timing); + options.set_exponent_bits(exponent_bits); + options.set_mantissa_bits(mantissa_bits); + for (uint32_t opcode = 0; opcode < HloOpcodeCount(); opcode++) { + if (should_reduce_output_precision(static_cast(opcode))) { + options.add_opcodes_to_suffix(opcode); + } + } + return options; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h index e9c8bba0313..34b865b9ced 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h @@ -42,6 +42,17 @@ class ReducePrecisionInsertion : public HloPassInterface { : exponent_bits_(exponent_bits), mantissa_bits_(mantissa_bits), should_reduce_output_precision_(should_reduce_output_precision) {} + + // Version of the constructor that takes an HloReducePrecisionOptions proto + // rather than explicitly-enumerated parameters, for convenience when + // creating passes based on DebugOptions. + explicit ReducePrecisionInsertion( + const HloReducePrecisionOptions& reduce_precision_options) + : exponent_bits_(reduce_precision_options.exponent_bits()), + mantissa_bits_(reduce_precision_options.mantissa_bits()), + should_reduce_output_precision_( + make_filter_function(reduce_precision_options)) {} + ~ReducePrecisionInsertion() override{}; tensorflow::StringPiece name() const override { @@ -52,6 +63,15 @@ class ReducePrecisionInsertion : public HloPassInterface { // (reduce-precision instructions were inserted). StatusOr Run(HloModule* module) override; + // Convert between the (inconvenient) xla.proto HloReducePrecisionOptions + // representation and OpcodeFilterFunction functions. + static OpcodeFilterFunction make_filter_function( + const HloReducePrecisionOptions& reduce_precision_options); + static HloReducePrecisionOptions make_options_proto( + const HloReducePrecisionOptions::PassTiming pass_timing, + const int exponent_bits, const int mantissa_bits, + const OpcodeFilterFunction& should_reduce_output_precision); + private: // Parameters for the precision reduction to be added. const int exponent_bits_; @@ -59,7 +79,7 @@ class ReducePrecisionInsertion : public HloPassInterface { // Function to determine (from the opcode) whether a given instruction should // have a reduce-precision instruction inserted in its output stream. - const OpcodeFilterFunction& should_reduce_output_precision_; + const OpcodeFilterFunction should_reduce_output_precision_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc index 48212dc7d18..527205bbb0d 100644 --- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -39,8 +40,11 @@ limitations under the License. namespace xla { namespace { -class ReducePrecisionTest : public ClientLibraryTestBase, - public ::testing::WithParamInterface {}; +// Tests to confirm that the ReducePrecision operation produces the expected +// numerical values. +class ReducePrecisionAccuracyTest : public ClientLibraryTestBase, + public ::testing::WithParamInterface { +}; // For reduction to IEEE-f16, we want to test the following cases, in both // positive and negative variants. (Note: IEEE-f16 is 5 exponent bits and 10 @@ -201,7 +205,7 @@ static const uint32_t test_values[][4] = { FPVAL(11111111, 1111111111, 1111111111111) // NaN }}; -XLA_TEST_P(ReducePrecisionTest, ReducePrecisionF32) { +XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) { int index = GetParam(); int exponent_bits = exponent_sizes[index]; int mantissa_bits = mantissa_sizes[index]; @@ -238,9 +242,87 @@ XLA_TEST_P(ReducePrecisionTest, ReducePrecisionF32) { ComputeAndCompareR1(&builder, expected_values, {a_data.get()}); } -INSTANTIATE_TEST_CASE_P(ReducePrecisionTest, ReducePrecisionTest, +INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest, + ReducePrecisionAccuracyTest, ::testing::Values(0, 1, 2, 3), TestDataToString); +// Tests to confirm that the compiler optimization functions add the expected +// ReducePrecisionInsertion passes. +class ReducePrecisionInsertionTest : public ClientLibraryTestBase {}; + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // Abs doesn't affect resolution. + auto abs = builder.Abs(a); + + // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the + // reduce-precision operation showed up in the correct place in the + // graph. + auto log = builder.Log(abs); + + // Insert precision-reduction after the Abs(x) operation, rounding that + // result to exactly 1.0f. + auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::BEFORE_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; }); + + ComputeAndCompareR1(&builder, {0.0f}, {a_data.get()}); +} + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // These two operations should be fused by any reasonable backend. + auto abs = builder.Abs(a); + auto neg = builder.Neg(abs); + + // Add a pass after operation fusion, suffixing kAbs operations. This + // should not see into the fusion nodes and thus should not affect the + // result. 
+ auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; }); + + ComputeAndCompareR1(&builder, {-1.00001f}, {a_data.get()}); +} + +XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) { + ComputationBuilder builder(client_, TestName()); + + std::unique_ptr a_literal = Literal::CreateR1({1.00001}); + std::unique_ptr a_data = + client_->TransferToServer(*a_literal).ConsumeValueOrDie(); + auto a = builder.Parameter(0, a_literal->shape(), "a"); + + // These two operations should be fused by any reasonable backend. + auto abs = builder.Abs(a); + auto neg = builder.Neg(abs); + + // Add a pass after operation fusion, suffixing kFusion operations. + auto reduce_precision_pass = execution_options_.mutable_debug_options() + ->add_hlo_reduce_precision_options(); + *reduce_precision_pass = ReducePrecisionInsertion::make_options_proto( + HloReducePrecisionOptions::AFTER_OP_FUSION, 5, 10, + [](const HloOpcode opcode) { return opcode == HloOpcode::kFusion; }); + + ComputeAndCompareR1(&builder, {-1.0f}, {a_data.get()}); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 00fb7f12b85..be4e00f63cc 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -20,6 +20,24 @@ import "tensorflow/compiler/xla/service/session.proto"; package xla; +// Options for the HLO insert-reduce-precision-operations pass. +message HloReducePrecisionOptions { + // When to run the pass. + enum PassTiming { + BEFORE_OP_FUSION = 0; + AFTER_OP_FUSION = 1; + } + PassTiming pass_timing = 1; + + // Exponent and mantissa bit counts for the reduced precision. + uint32 exponent_bits = 2; + uint32 mantissa_bits = 3; + + // Opcodes for operations that should be suffixed with reduced-precision + // operations. + repeated uint32 opcodes_to_suffix = 4; +} + // Debugging options for XLA. These options may change at any time - there are // no guarantees about backward or forward compatibility for these fields. message DebugOptions { @@ -112,6 +130,11 @@ message DebugOptions { // the generated IR. bool xla_llvm_enable_invariant_load_metadata = 72; + // Options for inserting reduce-precision operations for numerical + // experimentation. This is a repeated field, as we may want to have + // multiple passes with different parameters. + repeated HloReducePrecisionOptions hlo_reduce_precision_options = 80; + // This is used by ClientLibraryTestBase::ComputeAndCompare*. If true, the // computation will run n! times with all permunations of layouts for the // output shape in rank n. For example, with a 3D shape, all permutations of From 6610b3ec6bdb1a8843070a9fea6e4612681b9318 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 25 Jul 2017 00:31:16 -0700 Subject: [PATCH 05/56] Refactor HLO graph dumping. This also makes a few minor cosmetic changes, like moving the fusion type out of the fusion node and into the out-of-line computation and adjusting the arrow labels that we use to indicate operand numbers. 
PiperOrigin-RevId: 163038795 --- .../compiler/xla/service/hlo_graph_dumper.cc | 932 +++++++++--------- 1 file changed, 488 insertions(+), 444 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index acd26c4e31c..fcad1188a7f 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -48,36 +48,36 @@ using ::tensorflow::Env; using ::tensorflow::gtl::nullopt; using ::tensorflow::gtl::optional; using ::tensorflow::io::JoinPath; -using ::tensorflow::strings::Appendf; -using ::tensorflow::strings::Printf; using ::tensorflow::strings::StrAppend; using ::tensorflow::strings::StrCat; using ::tensorflow::str_util::Join; +using ::tensorflow::str_util::StringReplace; using ::tensorflow::WriteStringToFile; namespace xla { namespace hlo_graph_dumper { namespace { -// Node color schemes, used by NodeColorAttributes. -enum ColorScheme { - kBlue, - kBrown, - kDarkBlue, - kDarkGreen, - kDarkRed, - kGray, - kGreen, - kOrange, - kPurple, - kRed, - kWhite, - kYellow, - - // Causes the node's border to be a dashed line, and its content to be gray - // text on a white background, suggesting that this is an "unimportant" node. - kDashedBorder, +// Helpers for Printf and Appendf. +template +struct PrintfConvert { + const T& operator()(const T& t) const { return t; } }; +template <> +struct PrintfConvert { + const char* operator()(const string& s) const { return s.c_str(); } +}; + +// Like tensorflow::strings::Printf/Appendf, but you don't need to call c_str() +// on strings. +template +string Printf(const char* fmt, const Ts&... ts) { + return tensorflow::strings::Printf(fmt, PrintfConvert()(ts)...); +} +template +void Appendf(string* s, const char* fmt, const Ts&... ts) { + tensorflow::strings::Appendf(s, fmt, PrintfConvert()(ts)...); +} // Used to indicate how we should treat a given HLOInstruction in the graph. // should we treat it like normal, hide it, and so on? @@ -123,6 +123,26 @@ class NodeFilter { std::function filter_; }; +// Node color schemes, used by NodeColorAttributes. +enum ColorScheme { + kBlue, + kBrown, + kDarkBlue, + kDarkGreen, + kDarkRed, + kGray, + kGreen, + kOrange, + kPurple, + kRed, + kWhite, + kYellow, + + // Causes the node's border to be a dashed line, and its content to be gray + // text on a white background, suggesting that this is an "unimportant" node. + kDashedBorder, +}; + // Given a ColorScheme, returns an attribute string for a node of that color. // Sets the node's style and fill/stroke/text colors. // @@ -170,19 +190,8 @@ string NodeColorAttributes(ColorScheme color) { // Replaces <> with <>, so that this string is safe(er) for use in a // graphviz HTML-like string. string HtmlLikeStringSanitize(tensorflow::StringPiece s) { - return tensorflow::str_util::StringReplace( - tensorflow::str_util::StringReplace(s, "<", "<", /*replace_all=*/true), - ">", ">", /*replace_all=*/true); -} - -// Returns the dot graph identifier for the given instruction. -string InstructionId(const HloInstruction* instruction) { - return Printf("%lld", reinterpret_cast(instruction)); -} - -// Returns the dot graph identifier for the given computation. 
-string ComputationId(const HloComputation* computation) { - return Printf("%lld", reinterpret_cast(computation)); + return StringReplace(StringReplace(s, "<", "<", /*replace_all=*/true), ">", + ">", /*replace_all=*/true); } // Tries to generates a human-readable one-word description of the given @@ -249,439 +258,471 @@ optional MatchTrivialComputation(const HloComputation* computation) { } } -// Returns the dot graph edges and nodes for the given instruction sequence. -// Edges which extend between computations are added to the vector -// intercomputation_edges. This is necessary because graphviz does not render -// the graph properly unless these inter-computation edges appear after all -// subgraph statements. -string InstructionSequenceGraph( - const std::list>& instructions, - bool show_addresses, bool show_layouts, - std::vector* intercomputation_edges, - const HloExecutionProfile* hlo_execution_profile, - const NodeFilter& filter) { - string graph_body; +class HloDotDumper { + public: + HloDotDumper(const HloComputation* computation, tensorflow::StringPiece label, + bool show_addresses, bool show_layouts, + const HloExecutionProfile* profile, NodeFilter filter) + : computation_(computation), + label_(label.ToString()), + show_addresses_(show_addresses), + show_layouts_(show_layouts), + profile_(profile), + filter_(std::move(filter)) {} - for (auto& instruction : instructions) { - if (!filter.Show(instruction.get())) { - continue; - } + string Dump(); - // We don't display constants as separate nodes; they're merged into their - // users. - if (instruction->opcode() == HloOpcode::kConstant) { - continue; - } - - ColorScheme color = kYellow; - string shape = "box"; - - // Build the first line or two of the node, containing its name and opcode - // (if the opcode isn't redundant with the name). - string name; - if (instruction->opcode() == HloOpcode::kParameter) { - // If we have a parameter, put the param number in the name. - name = StrCat("Parameter ", instruction->parameter_number(), - "
", HtmlLikeStringSanitize(instruction->name())); - } else if (tensorflow::StringPiece(instruction->name()) - .starts_with( - StrCat("%", instruction->ExtendedOpcodeStr()))) { - // The HLO instruction name contains usually the opcode, e.g. "%add.42" is - // an add instruction. In this case we render just the name. - name = StrCat("", HtmlLikeStringSanitize(instruction->name()), ""); - } else if (instruction->opcode() == HloOpcode::kFusion && - tensorflow::StringPiece(instruction->name()) - .starts_with( - StrCat("%", HloOpcodeString(instruction->opcode())))) { - // Fusion nodes are usually named e.g. "%fusion.5". We render these as - // e.g. "%fusion.5
input fusion". - name = StrCat("", HtmlLikeStringSanitize(instruction->name()), - "
", - HtmlLikeStringSanitize(instruction->ToCategory())); - } else { - // If the name does not contain the opcode, render both. - name = StrCat("", - HtmlLikeStringSanitize(instruction->ExtendedOpcodeStr()), - "
", HtmlLikeStringSanitize(instruction->name())); - } - - if (HloOpcode::kConvolution == instruction->opcode()) { - StrAppend( - &name, "
", - HtmlLikeStringSanitize( - instruction->ConvolutionDimensionNumbersToString()), - "
", - HtmlLikeStringSanitize(window_util::ToString(instruction->window()))); - } - - if (!instruction->metadata().op_name().empty()) { - StrAppend(&name, "
", - HtmlLikeStringSanitize(instruction->metadata().op_name())); - } - if (!instruction->metadata().source_file().empty() && - instruction->metadata().source_line() != 0) { - StrAppend(&name, "
", instruction->metadata().source_file(), ":", - instruction->metadata().source_line()); - } - - // Pick different colors or shapes for instructions which are particularly - // expensive (eg, dot) and those which are unusual in some way or unique - // (eg, parameter). - switch (instruction->opcode()) { - // "Normal" instructions. Mostly cheap and elementwise. No call to - // embedded computations. In this case, use default color, shape and - // label. - case HloOpcode::kAbs: - case HloOpcode::kAdd: - case HloOpcode::kCeil: - case HloOpcode::kClamp: - case HloOpcode::kConvert: - case HloOpcode::kCos: - case HloOpcode::kDivide: - case HloOpcode::kEq: - case HloOpcode::kExp: - case HloOpcode::kFloor: - case HloOpcode::kGe: - case HloOpcode::kGt: - case HloOpcode::kIndex: - case HloOpcode::kIsFinite: - case HloOpcode::kLe: - case HloOpcode::kLog: - case HloOpcode::kLogicalAnd: - case HloOpcode::kLogicalNot: - case HloOpcode::kLogicalOr: - case HloOpcode::kLt: - case HloOpcode::kMaximum: - case HloOpcode::kMinimum: - case HloOpcode::kMultiply: - case HloOpcode::kNe: - case HloOpcode::kNegate: - case HloOpcode::kPower: - case HloOpcode::kRemainder: - case HloOpcode::kSelect: - case HloOpcode::kSign: - case HloOpcode::kSin: - case HloOpcode::kSlice: - case HloOpcode::kSort: - case HloOpcode::kSubtract: - case HloOpcode::kTanh: - break; - case HloOpcode::kRng: - StrAppend(&name, "
", - RandomDistribution_Name(instruction->random_distribution())); - break; - case HloOpcode::kBroadcast: - case HloOpcode::kTranspose: - StrAppend(&name, "
", "dims={", - Join(instruction->dimensions(), ","), "}"); - break; - case HloOpcode::kBitcast: - case HloOpcode::kTuple: - case HloOpcode::kTrace: - color = kWhite; - break; - case HloOpcode::kGetTupleElement: - color = kWhite; - StrAppend(&name, "
index=", instruction->tuple_index()); - break; - case HloOpcode::kConcatenate: - case HloOpcode::kCopy: - case HloOpcode::kDynamicSlice: - case HloOpcode::kDynamicUpdateSlice: - case HloOpcode::kPad: - case HloOpcode::kReshape: - case HloOpcode::kReverse: - case HloOpcode::kUpdate: - color = kGreen; - break; - case HloOpcode::kConvolution: - case HloOpcode::kDot: - color = kDarkBlue; - break; - case HloOpcode::kParameter: - color = kOrange; - break; - case HloOpcode::kBatchNormTraining: - StrAppend(&name, " feature_index=", instruction->feature_index()); - color = kPurple; - break; - case HloOpcode::kBatchNormGrad: - StrAppend(&name, " feature_index=", instruction->feature_index()); - color = kPurple; - break; - case HloOpcode::kReduce: - StrAppend(&name, " dims=", Join(instruction->dimensions(), ",")); - color = kPurple; - break; - case HloOpcode::kSelectAndScatter: - case HloOpcode::kReduceWindow: - color = kPurple; - break; - case HloOpcode::kWhile: - shape = "ellipse"; - color = kDarkGreen; - break; - case HloOpcode::kMap: - case HloOpcode::kFusion: - color = kGray; - break; - case HloOpcode::kSend: - case HloOpcode::kRecv: - case HloOpcode::kInfeed: - case HloOpcode::kOutfeed: - case HloOpcode::kCrossReplicaSum: - color = kBrown; - break; - case HloOpcode::kCall: - color = kDarkGreen; - break; - case HloOpcode::kCustomCall: - color = kDarkGreen; - StrAppend(&name, "
", - "custom_call_target=", instruction->custom_call_target()); - break; - case HloOpcode::kReducePrecision: - // Make ReducePrecision ops a bit more visible, since typically they - // will be inserted as modifications to an existing graph. - color = kRed; - break; - case HloOpcode::kConstant: - LOG(FATAL) << "Constants don't get their own nodes in the graph."; - } - - // Create instruction node with appropriate label, shape, and color. - // label is interpreted as an HTML-like string, so newlines must be - // delimited with
<br/>, rather than \n. string label = StrCat(name, "<br/>
", ShapeUtil::HumanString(instruction->shape())); if (show_addresses) { Appendf(&label, "<br/>
[%p]", instruction.get()); - } - if (show_layouts && LayoutUtil::HasLayout(instruction->shape())) { - string layout_string; - if (ShapeUtil::IsTuple(instruction->shape())) { - // For tuples, emit the full shape because the layout of a tuple is not - // represented in a single Layout field. - layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape()); - } else { - layout_string = - Join(instruction->shape().layout().minor_to_major(), ","); - } - StrAppend(&label, "
layout={", layout_string, "}"); - } - if (hlo_execution_profile != nullptr) { - auto hlo_cycles_executed = - hlo_execution_profile->GetProfileResult(*instruction); - auto total_cycles_executed = - hlo_execution_profile->total_cycles_executed(*instruction->parent()); - if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { - Appendf(&label, "
%% of cycles executed=%.2f", - (static_cast(hlo_cycles_executed) / - static_cast(total_cycles_executed)) * - 100); - } - } - - // If this node's operands are omitted, style it accordingly. - if (filter.SomeOrAllOperandsOmitted(instruction.get())) { - color = kDashedBorder; - } - - // If this node is highlighted, override its formatting. - if (filter.Highlight(instruction.get())) { - shape = "diamond"; - color = kDarkRed; - } - - // Create edges from the instruction's operands to the instruction. - if (!filter.OmitOperands(instruction.get())) { - int64 operand_number = 0; - for (auto* operand : instruction->operands()) { - if (!filter.Show(operand) || - operand->opcode() == HloOpcode::kConstant) { - ++operand_number; - continue; - } - Appendf(&graph_body, "%s -> %s", InstructionId(operand).c_str(), - InstructionId(instruction.get()).c_str()); - if (instruction->operand_count() > 1) { - Appendf(&graph_body, " [headlabel=\"%lld\",labeldistance=2]", - operand_number); - } - StrAppend(&graph_body, ";\n"); - ++operand_number; - } - - // Fusion nodes are handled specially because they contain nested - // expressions. - if (instruction->opcode() == HloOpcode::kFusion) { - string cluster_name = - StrCat("cluster_", InstructionId(instruction.get())); - StrAppend(&graph_body, "subgraph ", cluster_name, " {\n"); - StrAppend(&graph_body, "label=", - HtmlLikeStringSanitize(instruction->name()), - ">;\nstyle=\"rounded,filled\";\n" - "color=lightgrey;\n"); - StrAppend(&graph_body, - InstructionSequenceGraph(instruction->fused_instructions(), - show_addresses, show_layouts, - intercomputation_edges, - hlo_execution_profile, NodeFilter()), - "}\n"); - string fusion_edge = StrCat( - InstructionId(instruction->fused_expression_root()), " -> ", - InstructionId(instruction.get()), - " [ style = \"dotted\", arrowsize=0.0, ltail=", cluster_name, - " ];\n"); - intercomputation_edges->push_back(fusion_edge); - } else { - // If instruction has just one computation and it's trivial (e.g. - // "return param0 + param1"), put the trivial computation type (e.g. - // "add") into instruction's label. Otherwise, add a dotted edge - // between the instruction and its subcomputations. - const auto& subcomputations = instruction->called_computations(); - - bool trivial_subcomputation = false; - if (subcomputations.size() == 1) { - optional computation_type = - MatchTrivialComputation(subcomputations.front()); - if (computation_type) { - trivial_subcomputation = true; - StrAppend(&label, "
Subcomputation: ", *computation_type, - ""); - } - } - - if (!trivial_subcomputation) { - for (const HloComputation* computation : - instruction->called_computations()) { - string cluster_name = - StrCat("cluster_", ComputationId(computation)); - string call_edge = Printf( - "%s -> %s [ style=dashed; ltail=%s ];\n", - InstructionId(computation->root_instruction()).c_str(), - InstructionId(instruction.get()).c_str(), cluster_name.c_str()); - intercomputation_edges->push_back(call_edge); - } - } - } - } - - // Inline constant operands into the node. - for (int64 i = 0; i < instruction->operand_count(); ++i) { - const HloInstruction* operand = instruction->operand(i); - if (operand->opcode() != HloOpcode::kConstant) { - continue; - } - - StrAppend(&label, "
operand ", i, " = "); - if (ShapeUtil::IsEffectiveScalar(operand->shape())) { - auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex( - operand->shape(), /*linear_index=*/0); - StrAppend(&label, ShapeUtil::HumanString(operand->shape()), "{", - operand->literal().GetAsString(elem_idx), "}"); - } else { - if (tensorflow::StringPiece(operand->name()).starts_with("%constant")) { - StrAppend(&label, operand->name()); - } else { - StrAppend(&label, "constant ", operand->name()); - } - } - } - - Appendf(&graph_body, "%s [label=<%s>, shape=%s, %s];\n", - InstructionId(instruction.get()).c_str(), label.c_str(), - shape.c_str(), NodeColorAttributes(color).c_str()); + private: + // Returns the dot graph identifier for the given instruction. + string InstructionId(const HloInstruction* instruction) { + return StrCat(reinterpret_cast(instruction)); } - return graph_body; + + // Returns the dot graph identifier for the given computation. + string SubcomputationId(const HloComputation* computation) { + return StrCat("cluster_", reinterpret_cast(computation)); + } + + string Header(); + string Footer(); + + // Maps HloComputations we should dump to their parent instruction in the + // outer computation. + std::unordered_map + SubcomputationsToDump(); + + string DumpSubcomputation(const HloComputation* subcomp, + const HloInstruction* parent_instr); + string DumpComputation(const HloComputation* comp, const NodeFilter& filter); + string DumpInstruction(const HloInstruction* instr, const NodeFilter& filter); + ColorScheme GetInstructionColor(const HloInstruction* instr); + string GetInstructionNodeShape(const HloInstruction* instr); + string GetInstructionNodeLabel(const HloInstruction* instr); + string GetInstructionNodeExtraInfo(const HloInstruction* instr); + string GetInstructionIncomingEdges(const HloInstruction* instr, + const NodeFilter& filter); + string GetInstructionNodeInlinedConstants(const HloInstruction* instr); + + // If instr has just one computation and it's trivial (e.g. "return param0 + + // param1"), returns a string you can put into the node's body that names the + // subcomputation, e.g. "Subcomputation: add". + string GetInstructionTrivialComputationStr(const HloInstruction* instr); + + const HloComputation* computation_; // never null + const string label_; // overall name for the graph + const bool show_addresses_; + const bool show_layouts_; + const HloExecutionProfile* profile_; // may be null + const NodeFilter filter_; +}; + +string HloDotDumper::Dump() { + string g = Header(); + for (const auto& kv : SubcomputationsToDump()) { + const HloComputation* subcomp = kv.first; + const HloInstruction* parent = kv.second; + StrAppend(&g, DumpSubcomputation(subcomp, parent)); + } + StrAppend(&g, DumpComputation(computation_, filter_)); + StrAppend(&g, Footer()); + return g; } -// DOT graphs accept a stylesheet as a URL. So naturally, an inline stylesheet -// is a data URI! -// -// We don't perform any escaping on this string, so be careful not to use double -// quotes inside. -static const char* dot_stylesheet = R"( -data:text/css, -@import url(https://fonts.googleapis.com/css?family=Roboto:400,700); -svg text { - font-family: 'Roboto'; - font-size: 12px; -} +string HloDotDumper::Header() { + // DOT graphs accept a stylesheet as a URI. So naturally, an inline + // stylesheet is a data URI! 
+ const char* fmt = R"(digraph G { +rankdir = TB; +compound = true; +label = <%s>; +labelloc = t; +stylesheet=" + data:text/css, + @import url(https://fonts.googleapis.com/css?family=Roboto:400,700); + svg text { + font-family: 'Roboto'; + font-size: 12px; + } +" + )"; -string ComputationToDotGraph(const HloComputation& computation, - const string& label, bool show_addresses, - bool show_layouts, - const HloExecutionProfile* hlo_execution_profile, - const NodeFilter& filter) { - string graph_label = StrCat(label, "
", computation.name()); - if (hlo_execution_profile != nullptr) { - auto cycles = hlo_execution_profile->total_cycles_executed(computation); + string graph_label = StrCat(label_, "
", computation_->name()); + if (profile_ != nullptr) { + auto cycles = profile_->total_cycles_executed(*computation_); Appendf(&graph_label, "
total cycles = %lld (%s)", cycles, - tensorflow::strings::HumanReadableNum(cycles).c_str()); + tensorflow::strings::HumanReadableNum(cycles)); } - string graph = Printf( - R"(digraph G { -rankdir=TB; -compound=true; -label=<%s>; -labelloc=t; -stylesheet="%s" -)", - graph_label.c_str(), dot_stylesheet); + return Printf(fmt, graph_label); +} +string HloDotDumper::Footer() { return "}\n"; } + +std::unordered_map +HloDotDumper::SubcomputationsToDump() { // Dump the subcomputations of each instruction that's shown and doesn't have // its operands omitted. If an instruction has just one subcomputation and // it's trivial, omit it: We'll display that subcomputation inlined into the // instruction's node when we draw it. - std::unordered_set computations_to_dump; - for (const auto& instr : computation.instructions()) { - if (!filter.Show(instr.get()) || filter.OmitOperands(instr.get())) { + std::unordered_map to_dump; + for (const auto& instr : computation_->instructions()) { + if (!filter_.Show(instr.get()) || + filter_.SomeOrAllOperandsOmitted(instr.get())) { continue; } if (instr->opcode() == HloOpcode::kFusion) { - computations_to_dump.insert(instr->fused_instructions_computation()); + to_dump[instr->fused_instructions_computation()] = instr.get(); } const auto& subcomputations = instr->called_computations(); if (subcomputations.size() != 1 || !MatchTrivialComputation(subcomputations.front())) { - for (const HloComputation* computation : instr->called_computations()) { - computations_to_dump.insert(computation); + for (const HloComputation* comp : instr->called_computations()) { + to_dump[comp] = instr.get(); } } } + return to_dump; +} - // Emit embedded computations as subgraph clusters. - std::vector intercomputation_edges; - for (const HloComputation* embedded : - computation.MakeEmbeddedComputationsList()) { - if (!computations_to_dump.count(embedded)) { +string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp, + const HloInstruction* parent_instr) { + const char* computation_fmt = R"(subgraph %s { +style = "%s"; +color = "%s"; +label = <%s>; +labelloc = t; +%s +} // %s + +)"; + + const char* edge_fmt = R"(%s -> %s [ltail="%s", %s];)"; + + string id = SubcomputationId(subcomp); + + string subcomp_label, style, edge_or_bg_color, edge_attrs; + if (parent_instr->opcode() == HloOpcode::kFusion) { + subcomp_label = Printf("Fused expression for %s
%s", + HtmlLikeStringSanitize(parent_instr->name()), + HtmlLikeStringSanitize(parent_instr->ToCategory())); + style = "rounded,filled"; + edge_or_bg_color = "lightgray"; + edge_attrs = "style=dotted, arrowsize=0"; + } else { + subcomp_label = Printf("Subcomputation for %s
%s", + HtmlLikeStringSanitize(parent_instr->name()), + HtmlLikeStringSanitize(subcomp->name())); + style = "rounded"; + edge_or_bg_color = "black"; + edge_attrs = "style=dashed"; + } + + // Pass an empty filter to DumpComputation -- we always dump the entirety of a + // subcomputation. + string comp_body = DumpComputation(subcomp, NodeFilter()); + string computation = Printf(computation_fmt, id, style, edge_or_bg_color, + subcomp_label, comp_body, id); + string edge = Printf(edge_fmt, InstructionId(subcomp->root_instruction()), + InstructionId(parent_instr), SubcomputationId(subcomp), + edge_attrs); + return StrCat(computation, "\n", edge, "\n"); +} + +string HloDotDumper::DumpComputation(const HloComputation* comp, + const NodeFilter& filter) { + string g; + for (const auto& instr : comp->instructions()) { + if (!filter.Show(instr.get())) { continue; } - // Don't pass our filter down into the subcomputation -- always render the - // whole thing. - string graph_body = InstructionSequenceGraph( - embedded->instructions(), show_addresses, show_layouts, - &intercomputation_edges, hlo_execution_profile, NodeFilter()); - Appendf(&graph, - "subgraph cluster_%s " - "{\nstyle=rounded;label=<%s>;labelloc=t;\n%s}\n", - ComputationId(embedded).c_str(), embedded->name().c_str(), - graph_body.c_str()); + StrAppend(&g, DumpInstruction(instr.get(), filter)); } - StrAppend(&graph, - InstructionSequenceGraph(computation.instructions(), show_addresses, - show_layouts, &intercomputation_edges, - hlo_execution_profile, filter)); + return g; +} - // Edges between computations (subgraph clusters) must be emitted last for the - // graph to be rendered properly for some reason. - StrAppend(&graph, Join(intercomputation_edges, "\n"), "}\n"); +string HloDotDumper::DumpInstruction(const HloInstruction* instr, + const NodeFilter& filter) { + // We don't display constants as separate nodes; they're merged into their + // users. + if (instr->opcode() == HloOpcode::kConstant) { + return ""; + } - return graph; + ColorScheme color = GetInstructionColor(instr); + string node_shape = GetInstructionNodeShape(instr); + string node_label = GetInstructionNodeLabel(instr); + string extra_info = GetInstructionNodeExtraInfo(instr); + string inlined_constants = GetInstructionNodeInlinedConstants(instr); + string trivial_subcomputation = GetInstructionTrivialComputationStr(instr); + + string in_edges = GetInstructionIncomingEdges(instr, filter); + + // Override the node's styling if it should be (de-)emphasized. + if (filter.SomeOrAllOperandsOmitted(instr)) { + color = kDashedBorder; + } + if (filter.Highlight(instr)) { + node_shape = "diamond"; + color = kDarkRed; + } + + // Build the text that will be displayed inside the node. + string node_body = node_label; + for (const string& s : + {trivial_subcomputation, extra_info, inlined_constants}) { + if (!s.empty()) { + StrAppend(&node_body, "
", s); + } + } + + string node = Printf("%s [label=<%s>, shape=%s, %s];", InstructionId(instr), + node_body, node_shape, NodeColorAttributes(color)); + return StrCat(node, "\n", in_edges); +} + +string HloDotDumper::GetInstructionNodeInlinedConstants( + const HloInstruction* instr) { + std::vector lines; + for (int64 i = 0; i < instr->operand_count(); ++i) { + const HloInstruction* operand = instr->operand(i); + if (operand->opcode() != HloOpcode::kConstant) { + continue; + } + + string line = Printf("operand %lld = ", i); + if (ShapeUtil::IsEffectiveScalar(operand->shape())) { + auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex( + operand->shape(), /*linear_index=*/0); + Appendf(&line, "%s{%s}", ShapeUtil::HumanString(operand->shape()), + operand->literal().GetAsString(elem_idx)); + } else { + if (tensorflow::StringPiece(operand->name()).starts_with("%constant")) { + StrAppend(&line, operand->name()); + } else { + StrAppend(&line, "constant ", operand->name()); + } + } + lines.push_back(line); + } + return Join(lines, "
"); +} + +ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { + // Pick different colors or shapes for instructions which are particularly + // expensive (eg, dot) and those which are unusual in some way or unique + // (eg, parameter). + switch (instr->opcode()) { + case HloOpcode::kAbs: + case HloOpcode::kAdd: + case HloOpcode::kCeil: + case HloOpcode::kClamp: + case HloOpcode::kConvert: + case HloOpcode::kCos: + case HloOpcode::kDivide: + case HloOpcode::kEq: + case HloOpcode::kExp: + case HloOpcode::kFloor: + case HloOpcode::kGe: + case HloOpcode::kGt: + case HloOpcode::kIndex: + case HloOpcode::kIsFinite: + case HloOpcode::kLe: + case HloOpcode::kLog: + case HloOpcode::kLogicalAnd: + case HloOpcode::kLogicalNot: + case HloOpcode::kLogicalOr: + case HloOpcode::kLt: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kMultiply: + case HloOpcode::kNe: + case HloOpcode::kNegate: + case HloOpcode::kPower: + case HloOpcode::kRemainder: + case HloOpcode::kSelect: + case HloOpcode::kSign: + case HloOpcode::kSin: + case HloOpcode::kSlice: + case HloOpcode::kSort: + case HloOpcode::kSubtract: + case HloOpcode::kTanh: + case HloOpcode::kRng: + case HloOpcode::kBroadcast: + case HloOpcode::kTranspose: + return kYellow; + case HloOpcode::kBitcast: + case HloOpcode::kTuple: + case HloOpcode::kTrace: + case HloOpcode::kGetTupleElement: + return kWhite; + case HloOpcode::kConcatenate: + case HloOpcode::kCopy: + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kPad: + case HloOpcode::kReshape: + case HloOpcode::kReverse: + case HloOpcode::kUpdate: + return kGreen; + case HloOpcode::kConvolution: + case HloOpcode::kDot: + return kDarkBlue; + case HloOpcode::kReducePrecision: + return kRed; + case HloOpcode::kParameter: + return kOrange; + case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormGrad: + case HloOpcode::kReduce: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kReduceWindow: + return kPurple; + case HloOpcode::kMap: + case HloOpcode::kFusion: + return kGray; + case HloOpcode::kSend: + case HloOpcode::kRecv: + case HloOpcode::kInfeed: + case HloOpcode::kOutfeed: + case HloOpcode::kCrossReplicaSum: + return kBrown; + case HloOpcode::kCustomCall: + case HloOpcode::kWhile: + case HloOpcode::kCall: + return kDarkGreen; + case HloOpcode::kConstant: + LOG(FATAL) << "Constants don't get their own nodes in the graph."; + } +} + +string HloDotDumper::GetInstructionNodeShape(const HloInstruction* instr) { + // Give while loops a different shape so they're easier to pick out. + switch (instr->opcode()) { + case HloOpcode::kWhile: + return "ellipse"; + default: + return "rect"; + } +} + +string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) { + // If we have a parameter, put the param number in the name. + if (instr->opcode() == HloOpcode::kParameter) { + return Printf("Parameter %lld
%s", instr->parameter_number(), + HtmlLikeStringSanitize(instr->name())); + } + + // The HLO instruction name contains usually the opcode, e.g. "%add.42" is + // an add instruction. In this case we render just the name. + if (tensorflow::StringPiece(instr->name()) + .starts_with(StrCat("%", HloOpcodeString(instr->opcode())))) { + return Printf("%s", HtmlLikeStringSanitize(instr->name())); + } + + // If the name does not contain the opcode, render both. + return Printf("%s
%s", + HtmlLikeStringSanitize(instr->ExtendedOpcodeStr()), + HtmlLikeStringSanitize(instr->name())); +} + +string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { + string opcode_specific_info = [&]() -> string { + switch (instr->opcode()) { + case HloOpcode::kRng: + return RandomDistribution_Name(instr->random_distribution()); + case HloOpcode::kConvolution: + return StrCat( + HtmlLikeStringSanitize( + instr->ConvolutionDimensionNumbersToString()), + "
", + HtmlLikeStringSanitize(window_util::ToString(instr->window()))); + case HloOpcode::kBroadcast: + case HloOpcode::kTranspose: + case HloOpcode::kReduce: + return Printf("dims={%s}", Join(instr->dimensions(), ",")); + case HloOpcode::kGetTupleElement: + return Printf("index=%lld", instr->tuple_index()); + case HloOpcode::kBatchNormTraining: + case HloOpcode::kBatchNormGrad: + return Printf("feature_index=%lld", instr->feature_index()); + case HloOpcode::kCustomCall: + return Printf("custom_call_target=%s", instr->custom_call_target()); + default: + return ""; + } + }(); + + std::vector lines; + if (!opcode_specific_info.empty()) { + lines.push_back(opcode_specific_info); + } + lines.push_back(ShapeUtil::HumanString(instr->shape())); + if (show_addresses_) { + lines.push_back(Printf("[%p]", instr)); + } + if (show_layouts_ && LayoutUtil::HasLayout(instr->shape())) { + string layout_str; + if (ShapeUtil::IsTuple(instr->shape())) { + // For tuples, emit the full shape because the layout of a tuple is not + // represented in a single Layout field. + layout_str = ShapeUtil::HumanStringWithLayout(instr->shape()); + } else { + layout_str = Join(instr->shape().layout().minor_to_major(), ","); + } + lines.push_back(Printf("layout={%s}", layout_str)); + } + if (profile_ != nullptr) { + double hlo_cycles_executed = profile_->GetProfileResult(*instr); + double total_cycles_executed = + profile_->total_cycles_executed(*instr->parent()); + if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { + lines.push_back( + Printf("%% of cycles executed=%.2f", + 100 * hlo_cycles_executed / total_cycles_executed)); + } + } + return Join(lines, "
"); +} + +string HloDotDumper::GetInstructionIncomingEdges(const HloInstruction* instr, + const NodeFilter& filter) { + std::vector edges; + for (int64 i = 0; i < instr->operand_count(); ++i) { + const HloInstruction* operand = instr->operand(i); + if (!filter.Show(operand) || operand->opcode() == HloOpcode::kConstant) { + continue; + } + string edge = + Printf("%s -> %s", InstructionId(operand), InstructionId(instr)); + if (instr->operand_count() > 1) { + Appendf(&edge, R"( [headlabel="%lld",labeldistance=2])", i); + } + StrAppend(&edge, ";"); + edges.push_back(edge); + } + return Join(edges, "\n"); +} + +string HloDotDumper::GetInstructionTrivialComputationStr( + const HloInstruction* instr) { + // called_computations() on a fusion node "inherits" any called computations + // of the fused root, which isn't what we want. Just ignore fusion nodes + // here; they're handled separately. + if (instr->opcode() == HloOpcode::kFusion) { + return ""; + } + + const auto& subcomps = instr->called_computations(); + if (subcomps.size() != 1) { + return ""; + } + optional computation_type = MatchTrivialComputation(subcomps.front()); + if (!computation_type) { + return ""; + } + return Printf("Subcomputation: %s", + HtmlLikeStringSanitize(*computation_type)); } tensorflow::mutex& RendererMutex() { @@ -886,10 +927,12 @@ string DumpGraph(const HloComputation& computation, const string& label, graph_url = FileGraphRenderer().RenderGraph( graph, GraphRendererInterface::TF_GRAPHDEF, debug_options); } else { - graph = ComputationToDotGraph(computation, label, - debug_options.xla_hlo_graph_addresses(), - debug_options.xla_hlo_graph_layout(), - hlo_execution_profile, NodeFilter()); + graph = + HloDotDumper(&computation, label, + /*show_addresses=*/debug_options.xla_hlo_graph_addresses(), + /*show_layouts=*/debug_options.xla_hlo_graph_layout(), + hlo_execution_profile, NodeFilter()) + .Dump(); graph_url = GetGraphRenderer()->RenderGraph( graph, GraphRendererInterface::DOT_GRAPH, debug_options); } @@ -903,11 +946,12 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius) { string label = StrCat("Neighborhood of ", radius, " nodes around ", node.name()); NodeFilter filter = MakeNodeFilter(&node, radius); - string graph = ComputationToDotGraph( - *node.parent(), label, - /*show_addresses=*/debug_options.xla_hlo_graph_addresses(), - /*show_layouts=*/debug_options.xla_hlo_graph_layout(), - /*hlo_execution_profile=*/nullptr, filter); + string graph = + HloDotDumper(node.parent(), label, + /*show_addresses=*/debug_options.xla_hlo_graph_addresses(), + /*show_layouts=*/debug_options.xla_hlo_graph_layout(), + /*profile=*/nullptr, filter) + .Dump(); return GetGraphRenderer()->RenderGraph( graph, GraphRendererInterface::DOT_GRAPH, debug_options); } From e07ddbe64c944f1ea69f23687803d70d450f2ac3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 05:19:15 -0700 Subject: [PATCH 06/56] Use correct order of arguments in call of valid_bitcast_callback_. There are platforms where bitcasts are not symmetric. I.e. there are shapes A and B so that A->B is a bitcast, but B->A not. So we have to consider the correct order when calling valid_bitcast_callback_. 
PiperOrigin-RevId: 163058665 --- .../compiler/xla/service/algebraic_simplifier.cc | 6 +++--- .../compiler/xla/service/algebraic_simplifier.h | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index b351861425d..4837402c15b 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1488,9 +1488,9 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( // We cannot insert bitcasts if the layouts will not be compatible. // TODO(b/33178038): Consider inserting a transpose if a bitcast would be // invalid. - if (!valid_bitcast_callback_(lhs->shape(), input_shape) || - !valid_bitcast_callback_(rhs->shape(), new_filter_shape) || - !valid_bitcast_callback_(dot_output_shape, convolution_shape)) { + if (!valid_bitcast_callback_(input_shape, lhs->shape()) || + !valid_bitcast_callback_(new_filter_shape, rhs->shape()) || + !valid_bitcast_callback_(convolution_shape, dot_output_shape)) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index f8919f0caad..4295a3227a8 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -26,12 +26,13 @@ namespace xla { // A pass which performs AlgebraicSimplications. class AlgebraicSimplifier : public HloPassInterface { public: - // Given two shapes, determines if it is valid to bitcast between them after - // considering platform dependent effects on layout like alignment - // restrictions. - // Precondition: the two shapes have layouts, the same number of - // elements and ShapeUtil::ReshapeIsBitcast returns true. - using ValidBitcastCallback = std::function; + // Given shapes 'from_shape' and 'to_shape', determines if it is valid to + // bitcast from 'from_shape' to 'to_shape' after considering platform + // dependent effects on layout like alignment restrictions. Precondition: the + // two shapes have layouts, the same number of elements and + // ShapeUtil::ReshapeIsBitcast returns true. + using ValidBitcastCallback = + std::function; // If is_layout_sensitive is true, then the simplifier preserves layout during // transformation. Otherwise, layout is ignored. If valid_bitcast_callback From 18fef3435a3512e26d273bc95af24f08d59792f4 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Tue, 25 Jul 2017 06:58:00 -0700 Subject: [PATCH 07/56] Two improvements to pip.sh 1. Distinguish between passed and skipped tests. 2. 
Allow skipping the smoke test of tensorflow install in clean virtualenv with NO_TEST_ON_INSTALL=1 PiperOrigin-RevId: 163065599 --- .../tools/ci_build/builds/builds_common.sh | 1 + tensorflow/tools/ci_build/builds/pip.sh | 27 ++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/ci_build/builds/builds_common.sh b/tensorflow/tools/ci_build/builds/builds_common.sh index 9323a96e74c..e3b58d038a7 100644 --- a/tensorflow/tools/ci_build/builds/builds_common.sh +++ b/tensorflow/tools/ci_build/builds/builds_common.sh @@ -17,6 +17,7 @@ # Common Bash functions used by build scripts COLOR_NC='\033[0m' +COLOR_LIGHT_GRAY='\033[0;37m' COLOR_GREEN='\033[0;32m' COLOR_RED='\033[0;31m' diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh index db011a6badd..112dab3a733 100755 --- a/tensorflow/tools/ci_build/builds/pip.sh +++ b/tensorflow/tools/ci_build/builds/pip.sh @@ -73,6 +73,9 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/builds_common.sh" +SKIP_RETURN_CODE=112 + + # Get the command line arguments CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' ) shift @@ -310,6 +313,13 @@ create_activate_virtualenv_and_install_tensorflow() { # Smoke test of tensorflow install in clean virtualenv ################################################################################ do_clean_virtualenv_smoke_test() { + if [[ -n "${NO_TEST_ON_INSTALL}" ]] && + [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then + echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:" + echo " Skipping smoke test of tensorflow install in clean virtualenv" + return ${SKIP_RETURN_CODE} + fi + CLEAN_VENV_DIR="${PIP_TEST_ROOT}/venv_clean" create_activate_virtualenv_and_install_tensorflow --clean \ "${CLEAN_VENV_DIR}" "${WHL_PATH}" @@ -361,6 +371,7 @@ do_virtualenv_pip_test() { [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:" echo " Skipping ALL Python unit tests on install" + return ${SKIP_RETURN_CODE} else # Call run_pip_tests.sh to perform test-on-install "${SCRIPT_DIR}/run_pip_tests.sh" --virtualenv ${GPU_FLAG} ${MAC_FLAG} @@ -379,6 +390,7 @@ do_virtualenv_oss_serial_pip_test() { [[ "${NO_TEST_ON_INSTALL}" != "0" ]]; then echo "NO_TEST_ON_INSTALL=${NO_TEST_ON_INSTALL}:" echo " Skipping Python unit tests on install tagged with oss_serial" + return ${SKIP_RETURN_CODE} else # Call run_pip_tests.sh to perform test-on-install "${SCRIPT_DIR}/run_pip_tests.sh" \ @@ -402,6 +414,7 @@ do_test_user_ops() { fi else echo "Skipping user-op test-on-install due to DO_TEST_USER_OPS = ${DO_TEST_USER_OPS}" + return ${SKIP_RETURN_CODE} fi } @@ -424,6 +437,7 @@ do_test_tfdbg_binaries() { popd else echo "Skipping test of tfdbg binaries due to DO_TEST_TFDBG_BINARIES = ${DO_TEST_TFDBG_BINARIES}" + return ${SKIP_RETURN_CODE} fi } @@ -439,6 +453,7 @@ do_test_tutorials() { fi else echo "Skipping tutorial tests-on-install due to DO_TEST_TUTORIALS = ${DO_TEST_TUTORIALS}" + return ${SKIP_RETURN_CODE} fi } @@ -455,6 +470,7 @@ do_ffmpeg_integration_test() { fi else echo "Skipping ffmpeg integration due to DO_INTEGRATION_TESTS = ${DO_INTEGRATION_TESTS}" + return ${SKIP_RETURN_CODE} fi } @@ -468,6 +484,7 @@ PIP_TASKS_DESC=("Smoke test of pip install in clean virtualenv" "PIP tests in vi COUNTER=0 FAIL_COUNTER=0 PASS_COUNTER=0 +SKIP_COUNTER=0 while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do INDEX=COUNTER ((INDEX++)) @@ -480,7 +497,9 @@ while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do ${PIP_TASKS[COUNTER]} RESULT=$? 
- if [[ ${RESULT} != "0" ]]; then + if [[ ${RESULT} == ${SKIP_RETURN_CODE} ]]; then + ((SKIP_COUNTER++)) + elif [[ ${RESULT} != "0" ]]; then ((FAIL_COUNTER++)) else ((PASS_COUNTER++)) @@ -503,7 +522,9 @@ while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do ((INDEX++)) echo "${INDEX}. ${PIP_TASKS[COUNTER]}: ${PIP_TASKS_DESC[COUNTER]}" - if [[ ${STEP_EXIT_CODES[COUNTER]} == "0" ]]; then + if [[ ${STEP_EXIT_CODES[COUNTER]} == ${SKIP_RETURN_CODE} ]]; then + printf " ${COLOR_LIGHT_GRAY}SKIP${COLOR_NC}\n" + elif [[ ${STEP_EXIT_CODES[COUNTER]} == "0" ]]; then printf " ${COLOR_GREEN}PASS${COLOR_NC}\n" else printf " ${COLOR_RED}FAIL${COLOR_NC}\n" @@ -513,7 +534,7 @@ while [[ ${COUNTER} -lt "${#PIP_TASKS[@]}" ]]; do done echo -echo "${FAIL_COUNTER} failed; ${PASS_COUNTER} passed." +echo "${SKIP_COUNTER} skipped; ${FAIL_COUNTER} failed; ${PASS_COUNTER} passed." echo if [[ ${FAIL_COUNTER} == "0" ]]; then From 4393a1e0299c149e61cd467bc9a0b20daa3cae78 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 08:22:57 -0700 Subject: [PATCH 08/56] [XLA] Update StatusOr implementation to use more nuanced type traits. Previously we would evaluate the is_copy_constructible trait before template parameters were fully defined; e.g. StatusOr, which could lead to surprising effects. Also, previously it was not possible to provide an error status to a StatusOr where T was not default-constructible. PiperOrigin-RevId: 163073057 --- tensorflow/compiler/xla/BUILD | 5 +- tensorflow/compiler/xla/statusor.cc | 22 +- tensorflow/compiler/xla/statusor.h | 333 ++++++++++--------- tensorflow/compiler/xla/statusor_internals.h | 245 ++++++++++++++ tensorflow/compiler/xla/statusor_test.cc | 22 +- 5 files changed, 451 insertions(+), 176 deletions(-) create mode 100644 tensorflow/compiler/xla/statusor_internals.h diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 5eef45b11d4..e0a03a78f1d 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -132,7 +132,10 @@ cc_library( cc_library( name = "statusor", srcs = ["statusor.cc"], - hdrs = ["statusor.h"], + hdrs = [ + "statusor.h", + "statusor_internals.h", + ], visibility = ["//visibility:public"], deps = [ ":status", diff --git a/tensorflow/compiler/xla/statusor.cc b/tensorflow/compiler/xla/statusor.cc index 36f08fc99f4..72ab67ff810 100644 --- a/tensorflow/compiler/xla/statusor.cc +++ b/tensorflow/compiler/xla/statusor.cc @@ -19,28 +19,20 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" namespace xla { -namespace internal { +namespace internal_statusor { -Status StatusOrHelper::HandleInvalidStatusCtorArg() { +void Helper::HandleInvalidStatusCtorArg(Status* status) { const char* kMessage = - "Status::OK is not a valid constructor argument to StatusOr"; + "An OK status is not a valid constructor argument to StatusOr"; LOG(ERROR) << kMessage; - // In optimized builds, we will fall back to tensorflow::error::INTERNAL. - return Status(tensorflow::error::INTERNAL, kMessage); + // Fall back to tensorflow::error::INTERNAL. + *status = ::tensorflow::errors::Internal(kMessage); } -Status StatusOrHelper::HandleNullObjectCtorArg() { - const char* kMessage = - "NULL is not a valid constructor argument to StatusOr"; - LOG(ERROR) << kMessage; - // In optimized builds, we will fall back to tensorflow::error::INTERNAL. 
- return Status(tensorflow::error::INTERNAL, kMessage); -} - -void StatusOrHelper::Crash(const Status& status) { +void Helper::Crash(const Status& status) { LOG(FATAL) << "Attempting to fetch value instead of handling error " << status; } -} // namespace internal +} // namespace internal_statusor } // namespace xla diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h index d8cd736238c..92bcfa0f44d 100644 --- a/tensorflow/compiler/xla/statusor.h +++ b/tensorflow/compiler/xla/statusor.h @@ -72,216 +72,233 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_STATUSOR_H_ #include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/statusor_internals.h" #include "tensorflow/core/platform/macros.h" namespace xla { #if defined(__clang__) // Only clang supports warn_unused_result as a type annotation. -template +template class TF_MUST_USE_RESULT StatusOr; #endif -template ::value> -class StatusOr { - template +template +class StatusOr : private internal_statusor::StatusOrData, + private internal_statusor::TraitsBase< + std::is_copy_constructible::value, + std::is_move_constructible::value> { + template friend class StatusOr; + typedef internal_statusor::StatusOrData Base; + public: typedef T element_type; - // Construct a new StatusOr with Status::UNKNOWN status - StatusOr(); + // Constructs a new StatusOr with Status::UNKNOWN status. This is marked + // 'explicit' to try to catch cases like 'return {};', where people think + // StatusOr> will be initialized with an empty vector, + // instead of a Status::UNKNOWN status. + explicit StatusOr(); - // Construct a new StatusOr with the given non-ok status. After calling + // StatusOr will be copy constructuble/assignable if T is copy + // constructible. + StatusOr(const StatusOr&) = default; + StatusOr& operator=(const StatusOr&) = default; + + // StatusOr will be move constructuble/assignable if T is move + // constructible. + StatusOr(StatusOr&&) = default; + StatusOr& operator=(StatusOr&&) = default; + + // Conversion copy/move constructor, T must be convertible from U. + // TODO(b/62186717): These should not participate in overload resolution if U + // is not convertible to T. + template + StatusOr(const StatusOr& other); + template + StatusOr(StatusOr&& other); + + // Conversion copy/move assignment operator, T must be convertible from U. + template + StatusOr& operator=(const StatusOr& other); + template + StatusOr& operator=(StatusOr&& other); + + // Constructs a new StatusOr with the given value. After calling this + // constructor, calls to ValueOrDie() will succeed, and calls to status() will + // return OK. + // + // NOTE: Not explicit - we want to use StatusOr as a return type + // so it is convenient and sensible to be able to do 'return T()' + // when the return type is StatusOr. + // + // REQUIRES: T is copy constructible. + StatusOr(const T& value); + + // Constructs a new StatusOr with the given non-ok status. After calling // this constructor, calls to ValueOrDie() will CHECK-fail. // // NOTE: Not explicit - we want to use StatusOr as a return // value, so it is convenient and sensible to be able to do 'return // Status()' when the return type is StatusOr. // - // REQUIRES: status != Status::OK. This requirement is DCHECKed. - // In optimized builds, passing Status::OK here will have the effect + // REQUIRES: !status.ok(). This requirement is DCHECKed. 
+ // In optimized builds, passing Status::OK() here will have the effect // of passing tensorflow::error::INTERNAL as a fallback. - StatusOr(Status status); // NOLINT + StatusOr(const Status& status); + StatusOr& operator=(const Status& status); - // Construct a new StatusOr with the given value. If T is a plain pointer, - // value must not be NULL. After calling this constructor, calls to - // ValueOrDie() will succeed, and calls to status() will return OK. + // TODO(b/62186997): Add operator=(T) overloads. + + // Similar to the `const T&` overload. // - // NOTE: Not explicit - we want to use StatusOr as a return type - // so it is convenient and sensible to be able to do 'return T()' - // when the return type is StatusOr. - // - // REQUIRES: if T is a plain pointer, value != NULL. This requirement is - // DCHECKed. In optimized builds, passing a NULL pointer here will have - // the effect of passing tensorflow::error::INTERNAL as a fallback. - StatusOr(const T& value); // NOLINT + // REQUIRES: T is move constructible. + StatusOr(T&& value); - // Copy constructor. - StatusOr(const StatusOr& other) = default; - - // Conversion copy constructor, T must be copy constructible from U - template - StatusOr(const StatusOr& other); - - // Assignment operator. - StatusOr& operator=(const StatusOr& other) = default; - - // Conversion assignment operator, T must be assignable from U - template - StatusOr& operator=(const StatusOr& other); - - // Move constructor and move-assignment operator. - StatusOr(StatusOr&& other) = default; - StatusOr& operator=(StatusOr&& other) = default; - - // Rvalue-reference overloads of the other constructors and assignment - // operators, to support move-only types and avoid unnecessary copying. - // - // Implementation note: we could avoid all these rvalue-reference overloads - // if the existing lvalue-reference overloads took their arguments by value - // instead. I think this would also let us omit the conversion assignment - // operator altogether, since we'd get the same functionality for free - // from the implicit conversion constructor and ordinary assignment. - // However, this could result in extra copy operations unless we use - // std::move to avoid them, and we can't use std::move because this code - // needs to be portable to C++03. - StatusOr(T&& value); // NOLINT - template - StatusOr(StatusOr&& other); - - // Returns a reference to our status. If this contains a T, then - // returns Status::OK. - const Status& status() const { return status_; } + // RValue versions of the operations declared above. + StatusOr(Status&& status); + StatusOr& operator=(Status&& status); // Returns this->status().ok() - bool ok() const { return status_.ok(); } + bool ok() const { return this->status_.ok(); } + + // Returns a reference to our status. If this contains a T, then + // returns Status::OK(). + const Status& status() const &; + Status status() &&; // Returns a reference to our current value, or CHECK-fails if !this->ok(). 
- const T& ValueOrDie() const; - T& ValueOrDie(); + // + // Note: for value types that are cheap to copy, prefer simple code: + // + // T value = statusor.ValueOrDie(); + // + // Otherwise, if the value type is expensive to copy, but can be left + // in the StatusOr, simply assign to a reference: + // + // T& value = statusor.ValueOrDie(); // or `const T&` + // + // Otherwise, if the value type supports an efficient move, it can be + // used as follows: + // + // T value = std::move(statusor).ValueOrDie(); + // + // The std::move on statusor instead of on the whole expression enables + // warnings about possible uses of the statusor object after the move. + // C++ style guide waiver for ref-qualified overloads granted in cl/143176389 + // See go/ref-qualifiers for more details on such overloads. + const T& ValueOrDie() const &; + T& ValueOrDie() &; + const T&& ValueOrDie() const &&; + T&& ValueOrDie() &&; - // Moves our current value out of this object and returns it, or CHECK-fails - // if !this->ok(). - // Use of this method is discouraged; prefer std::move(statusor.ValueOrDie()) - // instead. T ConsumeValueOrDie() { return std::move(ValueOrDie()); } - private: - Status status_; - T value_; -}; - -// Partial specialization for when T is not copy-constructible. This uses all -// methods from the core implementation, but removes copy assignment and copy -// construction. -template -class StatusOr : public StatusOr { - public: - // Remove copies. - StatusOr(const StatusOr& other) = delete; - StatusOr& operator=(const StatusOr& other) = delete; - template - StatusOr(const StatusOr& other) = delete; - StatusOr(const T& value) = delete; - - // Use the superclass version for other constructors and operators. - StatusOr() = default; - StatusOr(StatusOr&& other) = default; - StatusOr& operator=(StatusOr&& other) = default; - StatusOr(T&& value) // NOLINT - : StatusOr::StatusOr(std::move(value)) {} - StatusOr(Status status) // NOLINT - : StatusOr::StatusOr(std::move(status)) {} - template - StatusOr(StatusOr&& other) // NOLINT - : StatusOr::StatusOr(std::move(other)) {} + // Ignores any errors. This method does nothing except potentially suppress + // complaints from any tools that are checking that errors are not dropped on + // the floor. + void IgnoreError() const; }; //////////////////////////////////////////////////////////////////////////////// // Implementation details for StatusOr -namespace internal { - -class StatusOrHelper { - public: - // Move type-agnostic error handling to the .cc. - static Status HandleInvalidStatusCtorArg(); - static Status HandleNullObjectCtorArg(); - static void Crash(const Status& status); - - // Customized behavior for StatusOr vs. StatusOr - template - struct Specialize; -}; +template +StatusOr::StatusOr() : Base(Status(tensorflow::error::UNKNOWN, "")) {} template -struct StatusOrHelper::Specialize { - // For non-pointer T, a reference can never be NULL. 
- static inline bool IsValueNull(const T& t) { return false; } -}; +StatusOr::StatusOr(const T& value) : Base(value) {} template -struct StatusOrHelper::Specialize { - static inline bool IsValueNull(const T* t) { return t == NULL; } -}; +StatusOr::StatusOr(const Status& status) : Base(status) {} -} // namespace internal - -template -inline StatusOr::StatusOr() - : status_(tensorflow::error::UNKNOWN, "") {} - -template -inline StatusOr::StatusOr(Status status) - : status_(std::move(status)) { - if (status_.ok()) { - status_ = internal::StatusOrHelper::HandleInvalidStatusCtorArg(); - } +template +StatusOr& StatusOr::operator=(const Status& status) { + this->Assign(status); + return *this; } -template -inline StatusOr::StatusOr(const T& value) - : value_(value) { - if (internal::StatusOrHelper::Specialize::IsValueNull(value)) { - status_ = internal::StatusOrHelper::HandleNullObjectCtorArg(); - } +template +StatusOr::StatusOr(T&& value) : Base(std::move(value)) {} + +template +StatusOr::StatusOr(Status&& status) : Base(std::move(status)) {} + +template +StatusOr& StatusOr::operator=(Status&& status) { + this->Assign(std::move(status)); + return *this; } -template +template template -inline StatusOr::StatusOr(const StatusOr& other) - : status_(other.status_), value_(other.value_) {} +inline StatusOr::StatusOr(const StatusOr& other) + : Base(static_cast::Base&>(other)) {} -template -inline StatusOr::StatusOr(T&& value) - : value_(std::move(value)) { - if (internal::StatusOrHelper::Specialize::IsValueNull(value_)) { - status_ = internal::StatusOrHelper::HandleNullObjectCtorArg(); - } -} - -template +template template -inline StatusOr::StatusOr(StatusOr&& other) - : status_(std::move(other.status_)), value_(std::move(other.value_)) {} - -template -inline const T& StatusOr::ValueOrDie() const { - if (!ok()) { - internal::StatusOrHelper::Crash(status()); - } - return value_; +inline StatusOr& StatusOr::operator=(const StatusOr& other) { + if (other.ok()) + this->Assign(other.ValueOrDie()); + else + this->Assign(other.status()); + return *this; } -template -inline T& StatusOr::ValueOrDie() { - if (!status_.ok()) { - internal::StatusOrHelper::Crash(status()); +template +template +inline StatusOr::StatusOr(StatusOr&& other) + : Base(static_cast::Base&&>(other)) {} + +template +template +inline StatusOr& StatusOr::operator=(StatusOr&& other) { + if (other.ok()) { + this->Assign(std::move(other).ValueOrDie()); + } else { + this->Assign(std::move(other).status()); } - return value_; + return *this; +} + +template +const Status& StatusOr::status() const & { + return this->status_; +} +template +Status StatusOr::status() && { + return ok() ? Status::OK() : std::move(this->status_); +} + +template +const T& StatusOr::ValueOrDie() const & { + this->EnsureOk(); + return this->data_; +} + +template +T& StatusOr::ValueOrDie() & { + this->EnsureOk(); + return this->data_; +} + +template +const T&& StatusOr::ValueOrDie() const && { + this->EnsureOk(); + return std::move(this->data_); +} + +template +T&& StatusOr::ValueOrDie() && { + this->EnsureOk(); + return std::move(this->data_); +} + +template +void StatusOr::IgnoreError() const { + // no-op } } // namespace xla diff --git a/tensorflow/compiler/xla/statusor_internals.h b/tensorflow/compiler/xla/statusor_internals.h new file mode 100644 index 00000000000..a2fda5bb3c6 --- /dev/null +++ b/tensorflow/compiler/xla/statusor_internals.h @@ -0,0 +1,245 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_ +#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_ + +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/core/platform/macros.h" + +namespace xla { +namespace internal_statusor { + +class Helper { + public: + // Move type-agnostic error handling to the .cc. + static void HandleInvalidStatusCtorArg(Status*); + TF_ATTRIBUTE_NORETURN static void Crash(const Status& status); +}; + +// Construct an instance of T in `p` through placement new, passing Args... to +// the constructor. +// This abstraction is here mostly for the gcc performance fix. +template +void PlacementNew(void* p, Args&&... args) { +#if defined(__GNUC__) && !defined(__clang__) + // Teach gcc that 'p' cannot be null, fixing code size issues. + if (p == nullptr) __builtin_unreachable(); +#endif + new (p) T(std::forward(args)...); +} + +// Helper base class to hold the data and all operations. +// We move all this to a base class to allow mixing with the appropriate +// TraitsBase specialization. +template +class StatusOrData { + template + friend class StatusOrData; + + public: + StatusOrData() = delete; + + StatusOrData(const StatusOrData& other) { + if (other.ok()) { + MakeValue(other.data_); + MakeStatus(); + } else { + MakeStatus(other.status_); + } + } + + StatusOrData(StatusOrData&& other) noexcept { + if (other.ok()) { + MakeValue(std::move(other.data_)); + MakeStatus(); + } else { + MakeStatus(std::move(other.status_)); + } + } + + template + StatusOrData(const StatusOrData& other) { + if (other.ok()) { + MakeValue(other.data_); + MakeStatus(); + } else { + MakeStatus(other.status_); + } + } + + template + StatusOrData(StatusOrData&& other) { + if (other.ok()) { + MakeValue(std::move(other.data_)); + MakeStatus(); + } else { + MakeStatus(std::move(other.status_)); + } + } + + explicit StatusOrData(const T& value) : data_(value) { MakeStatus(); } + explicit StatusOrData(T&& value) : data_(std::move(value)) { MakeStatus(); } + + explicit StatusOrData(const Status& status) : status_(status) { + EnsureNotOk(); + } + explicit StatusOrData(Status&& status) : status_(std::move(status)) { + EnsureNotOk(); + } + + StatusOrData& operator=(const StatusOrData& other) { + if (this == &other) return *this; + if (other.ok()) + Assign(other.data_); + else + Assign(other.status_); + return *this; + } + + StatusOrData& operator=(StatusOrData&& other) { + if (this == &other) return *this; + if (other.ok()) + Assign(std::move(other.data_)); + else + Assign(std::move(other.status_)); + return *this; + } + + ~StatusOrData() { + if (ok()) { + status_.~Status(); + data_.~T(); + } else { + status_.~Status(); + } + } + + void Assign(const T& value) { + if (ok()) { + data_.~T(); + MakeValue(value); + } else { + MakeValue(value); + status_ = Status::OK(); + } + } + + void Assign(T&& value) { + if (ok()) { + data_.~T(); + 
MakeValue(std::move(value)); + } else { + MakeValue(std::move(value)); + status_ = Status::OK(); + } + } + + void Assign(const Status& status) { + Clear(); + status_ = status; + EnsureNotOk(); + } + + void Assign(Status&& status) { + Clear(); + status_ = std::move(status); + EnsureNotOk(); + } + + bool ok() const { return status_.ok(); } + + protected: + // status_ will always be active after the constructor. + // We make it a union to be able to initialize exactly how we need without + // waste. + // Eg. in the copy constructor we use the default constructor of Status in + // the ok() path to avoid an extra Ref call. + union { + Status status_; + }; + + // data_ is active iff status_.ok()==true + struct Dummy {}; + union { + // When T is const, we need some non-const object we can cast to void* for + // the placement new. dummy_ is that object. + Dummy dummy_; + T data_; + }; + + void Clear() { + if (ok()) data_.~T(); + } + + void EnsureOk() const { + if (!ok()) Helper::Crash(status_); + } + + void EnsureNotOk() { + if (ok()) Helper::HandleInvalidStatusCtorArg(&status_); + } + + // Construct the value (ie. data_) through placement new with the passed + // argument. + template + void MakeValue(Arg&& arg) { + internal_statusor::PlacementNew(&dummy_, std::forward(arg)); + } + + // Construct the status (ie. status_) through placement new with the passed + // argument. + template + void MakeStatus(Args&&... args) { + internal_statusor::PlacementNew(&status_, + std::forward(args)...); + } +}; + +// Helper base class to allow implicitly deleted constructors and assignment +// operations in StatusOr. +// TraitsBase will explicitly delete what it can't support and StatusOr will +// inherit that behavior implicitly. +template +struct TraitsBase { + TraitsBase() = default; + TraitsBase(const TraitsBase&) = default; + TraitsBase(TraitsBase&&) = default; + TraitsBase& operator=(const TraitsBase&) = default; + TraitsBase& operator=(TraitsBase&&) = default; +}; + +template <> +struct TraitsBase { + TraitsBase() = default; + TraitsBase(const TraitsBase&) = delete; + TraitsBase(TraitsBase&&) = default; + TraitsBase& operator=(const TraitsBase&) = delete; + TraitsBase& operator=(TraitsBase&&) = default; +}; + +template <> +struct TraitsBase { + TraitsBase() = default; + TraitsBase(const TraitsBase&) = delete; + TraitsBase(TraitsBase&&) = delete; + TraitsBase& operator=(const TraitsBase&) = delete; + TraitsBase& operator=(TraitsBase&&) = delete; +}; + +} // namespace internal_statusor +} // namespace xla + +#endif // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_ diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc index f8555113f81..5fa2211ac66 100644 --- a/tensorflow/compiler/xla/statusor_test.cc +++ b/tensorflow/compiler/xla/statusor_test.cc @@ -29,8 +29,6 @@ limitations under the License. 
namespace xla { namespace { -using tensorflow::Status; - class Base1 { public: virtual ~Base1() {} @@ -59,6 +57,14 @@ class CopyNoAssign { const CopyNoAssign& operator=(const CopyNoAssign&); }; +class NoDefaultConstructor { + public: + explicit NoDefaultConstructor(int foo); +}; + +static_assert(!std::is_default_constructible(), + "Should not be default-constructible."); + StatusOr> ReturnUniquePtr() { // Uses implicit constructor from T&& return std::unique_ptr(new int(0)); @@ -69,6 +75,18 @@ TEST(StatusOr, ElementType) { static_assert(std::is_same::element_type, char>(), ""); } +TEST(StatusOr, TestNoDefaultConstructorInitialization) { + // Explicitly initialize it with an error code. + StatusOr statusor(tensorflow::errors::Cancelled("")); + EXPECT_FALSE(statusor.ok()); + EXPECT_EQ(statusor.status().code(), tensorflow::error::CANCELLED); + + // Default construction of StatusOr initializes it with an UNKNOWN error code. + StatusOr statusor2; + EXPECT_FALSE(statusor2.ok()); + EXPECT_EQ(statusor2.status().code(), tensorflow::error::UNKNOWN); +} + TEST(StatusOr, TestMoveOnlyInitialization) { StatusOr> thing(ReturnUniquePtr()); ASSERT_TRUE(thing.ok()); From 2f1ff0e90dc3ba80f6bbc3f9850e8028875dcbbf Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 25 Jul 2017 08:24:31 -0700 Subject: [PATCH 09/56] [TF:XLA] Register a no-op kernel for ControlTrigger, but forbid the JIT marking pass from compiling ControlTrigger nodes. CL in preparation for compiling dynamic RNN gradients via XLA. PiperOrigin-RevId: 163073212 --- tensorflow/compiler/jit/mark_for_compilation_pass.cc | 5 +++++ tensorflow/compiler/tf2xla/kernels/no_op.cc | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 7eab7bb28f0..77b45aa11e2 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -257,6 +257,11 @@ Status MarkForCompilationPass::Run( ®istration)) { return false; } + + // Don't compile control trigger nodes. We won't preserve their deadness + // semantics correctly, so it's safest not to compile them. + if (node->IsControlTrigger()) return false; + // If this device requires a JIT, we must say yes. if (registration->requires_compilation) return true; diff --git a/tensorflow/compiler/tf2xla/kernels/no_op.cc b/tensorflow/compiler/tf2xla/kernels/no_op.cc index b8f0c0b9fe6..8c8a9bbe787 100644 --- a/tensorflow/compiler/tf2xla/kernels/no_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/no_op.cc @@ -23,4 +23,9 @@ namespace tensorflow { // dummy operator using CompilationOnly(). REGISTER_XLA_OP(Name("NoOp").CompilationOnly(), NoOp); +// We register ControlTrigger as a no-op. This is correct since nodes seen +// by the XLA compiler are never dead. This may need rethinking when we add +// support for conditionals to XLA. +REGISTER_XLA_OP(Name("ControlTrigger"), NoOp); + } // namespace tensorflow From 7349de8e967c1c943a6b0844718f3933333aa8a3 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 25 Jul 2017 09:20:17 -0700 Subject: [PATCH 10/56] Improve the HLO graph dumper's output. - Truncate long shapes. It's not uncommon to have giant tuples, and displaying the whole thing makes the graph unreadable. - Don't traverse into the users of a node with < 16 users. These are probably not interesting, and traversing into them can quickly blow up the graph, making it un-renderable. - Allow nodes which have multiple trivial subcomputations (e.g. 
select-and-scatter) to have those computations inlined. - Match additional patterns in MatchTrivialComputation PiperOrigin-RevId: 163079329 --- .../compiler/xla/service/hlo_graph_dumper.cc | 127 ++++++++++++++---- 1 file changed, 102 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index fcad1188a7f..f588df001c6 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -92,6 +92,9 @@ enum NodeFilterResult { // Style the node the same as kSomeOperandsOmitted, but also don't connect it // to its operands, even if they're present in the graph. kOmitNodeOperands, + // Same style as kSomeOperandsOmitted, but used to indicate that some of the + // node's *users* have been omitted. + kSomeUsersOmitted, }; // NodeFilter is essentially a map from HloInstruction*s to NodeFilterResult. @@ -118,6 +121,11 @@ class NodeFilter { auto result = filter_(instr); return result == kOmitNodeOperands || result == kSomeOperandsOmitted; } + bool Deemphasized(const HloInstruction* instr) const { + auto result = filter_(instr); + return result == kOmitNodeOperands || result == kSomeOperandsOmitted || + result == kSomeUsersOmitted; + } private: std::function filter_; @@ -203,9 +211,15 @@ string HtmlLikeStringSanitize(tensorflow::StringPiece s) { // "return param0 * param1;" --> "multiply" // "return min(param0, param1);" --> "min" // "return max(param0, param1);" --> "max" +// "return param0 <= param1;" --> "less-or-equal" +// "return param0 >= param1;" --> "greater-or-equal" +// "return param0 > param1;" --> "greater-than" +// "return param0 < param1;" --> "less-than" +// "return param0 == param1;" --> "equal-to" +// "return param0 != param1;" --> "not-equal-to" // -// where param0 and param1 are effective scalars. Since all of the ops above -// are commutative, we also support them with param0 and param1 swapped. +// where param0 and param1 are effective scalars. For the ops that are +// commutative, we also support them with param0 and param1 swapped. // // This is useful primarily for reduce and map nodes. These take a // subcomputation which is almost always one of the four above, and pattern @@ -228,6 +242,7 @@ optional MatchTrivialComputation(const HloComputation* computation) { operand1->opcode() != HloOpcode::kParameter) { return nullopt; } + // Check that the two operands of root are param0 and param1. All of the // opcodes we recognize are commutative, so we're OK with either order. auto n0 = operand0->parameter_number(); @@ -236,6 +251,20 @@ optional MatchTrivialComputation(const HloComputation* computation) { return nullopt; } + // If the params are reversed, check that the operation being performed is + // commutative. + if (n0 == 1) { + switch (root->opcode()) { + case HloOpcode::kLe: + case HloOpcode::kGe: + case HloOpcode::kGt: + case HloOpcode::kLt: + return nullopt; + default: + break; + } + } + // Check that the root and params are all effective scalars. 
if (!ShapeUtil::IsEffectiveScalar(root->shape()) || !ShapeUtil::IsEffectiveScalar(operand0->shape()) || @@ -253,6 +282,18 @@ optional MatchTrivialComputation(const HloComputation* computation) { return "min"; case HloOpcode::kMaximum: return "max"; + case HloOpcode::kLe: + return "less-or-equal"; + case HloOpcode::kGe: + return "greater-or-equal"; + case HloOpcode::kGt: + return "greater-than"; + case HloOpcode::kLt: + return "less-than"; + case HloOpcode::kEq: + return "equal-to"; + case HloOpcode::kNe: + return "not-equal-to"; default: return nullopt; } @@ -374,10 +415,8 @@ HloDotDumper::SubcomputationsToDump() { to_dump[instr->fused_instructions_computation()] = instr.get(); } - const auto& subcomputations = instr->called_computations(); - if (subcomputations.size() != 1 || - !MatchTrivialComputation(subcomputations.front())) { - for (const HloComputation* comp : instr->called_computations()) { + for (const HloComputation* comp : instr->called_computations()) { + if (!MatchTrivialComputation(comp)) { to_dump[comp] = instr.get(); } } @@ -459,7 +498,7 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr, string in_edges = GetInstructionIncomingEdges(instr, filter); // Override the node's styling if it should be (de-)emphasized. - if (filter.SomeOrAllOperandsOmitted(instr)) { + if (filter.Deemphasized(instr)) { color = kDashedBorder; } if (filter.Highlight(instr)) { @@ -657,7 +696,18 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { if (!opcode_specific_info.empty()) { lines.push_back(opcode_specific_info); } - lines.push_back(ShapeUtil::HumanString(instr->shape())); + + // Some instructions have giant tuples as their shapes, so truncate the HLO's + // shape to kMaxShapeLen characters. + constexpr int kMaxShapeLen = 64; + string instr_shape = ShapeUtil::HumanString(instr->shape()); + if (instr_shape.length() > kMaxShapeLen) { + instr_shape = + StrCat(tensorflow::StringPiece(instr_shape).substr(0, kMaxShapeLen - 3), + "..."); + } + lines.push_back(instr_shape); + if (show_addresses_) { lines.push_back(Printf("[%p]", instr)); } @@ -713,16 +763,22 @@ string HloDotDumper::GetInstructionTrivialComputationStr( return ""; } - const auto& subcomps = instr->called_computations(); - if (subcomps.size() != 1) { - return ""; + std::vector lines; + for (int64 i = 0; i < instr->called_computations().size(); ++i) { + optional computation_type = + MatchTrivialComputation(instr->called_computations()[i]); + if (!computation_type) { + continue; + } + if (instr->called_computations().size() == 1) { + lines.push_back(Printf("Subcomputation: %s", + HtmlLikeStringSanitize(*computation_type))); + } else { + lines.push_back(Printf("Subcomputation %lld: %s", i, + HtmlLikeStringSanitize(*computation_type))); + } } - optional computation_type = MatchTrivialComputation(subcomps.front()); - if (!computation_type) { - return ""; - } - return Printf("Subcomputation: %s", - HtmlLikeStringSanitize(*computation_type)); + return Join(lines, "
"); } tensorflow::mutex& RendererMutex() { @@ -829,14 +885,25 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { } } - // If you're looking at node X, it's probably not interesting that node Y - // also happens to use the same constant, so we don't traverse into - // constants' users. - if (instr->opcode() != HloOpcode::kConstant) { - for (const HloInstruction* user : instr->users()) { - if (!nodes.count(user)) { - worklist.push_back({user, depth + 1}); - } + // Traverse into instr's users, unless: + // + // - there are a ton of them, in which case they're probably not + // interesting (and anyway, rendering them all would make the graph + // unreadable), or + // - instr is a constant, in which case its users are probably not + // interesting. + if (instr->opcode() == HloOpcode::kConstant) { + continue; + } + constexpr int kMaxUsersToRender = 16; + if (instr->user_count() > kMaxUsersToRender) { + // If we're going to skip this node's users, style it as such. + nodes[instr] = kSomeUsersOmitted; + continue; + } + for (const HloInstruction* user : instr->users()) { + if (!nodes.count(user)) { + worklist.push_back({user, depth + 1}); } } } @@ -885,6 +952,16 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { } } + // Similarly, promote nodes with type kSomeUsersOmitted to kNormalNode if all + // of their users made it into the graph by other means. + for (auto& kv : nodes) { + const auto& users = kv.first->users(); + if (kv.second == kSomeUsersOmitted && + std::all_of(users.begin(), users.end(), is_displayed)) { + kv.second = kNormalNode; + } + } + // If none of a node's operands appear in nodes, mark it as type // kOmitNodeOperands so it gets styled appropriately. for (auto& kv : nodes) { From 3169f504faae8ede8443525a073567a512095c4f Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 25 Jul 2017 09:44:38 -0700 Subject: [PATCH 11/56] If the value to be forwarded from a loop to its gradient is a constant, clone the constant instead of repeatedly pushing it onto a stack on each iteration. This should never consume more memory than the stack approach (notwithstanding swapping), and frequently should be much better. This change is in preparation for enabling XLA compilation of RNN gradients. PiperOrigin-RevId: 163082165 --- tensorflow/python/ops/control_flow_ops.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 44d6c7e2759..4ba812eaf5d 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -61,6 +61,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_control_flow_ops @@ -983,9 +984,16 @@ class GradLoopState(object): # the right control flow context. real_value = self._grad_context.AddValue(cur_value) break + elif constant_op.is_constant(cur_value): + # If the value to be forwarded is a constant, clone the constant in + # the gradient loop rather than using a stack. + # TODO(phawkins): consider hoisting the constant out of the loop + # instead. 
+ real_value = constant_op.constant( + tensor_util.constant_value(cur_value), dtype=cur_value.dtype) + break else: # Record the history of this value in forward_ctxt. - # TODO(yuanbyu): Avoid recording constants. self._grad_context.Exit() history_value = cur_grad_state.AddForwardAccumulator(cur_value) self._grad_context.Enter() From 78cec04df0f714741f930ff3f234268102b71065 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 25 Jul 2017 09:46:59 -0700 Subject: [PATCH 12/56] [TF:XLA] Make the shape of a TensorArray flow value a scalar. Previously we used an f32[0] value, since the exact flow value does not matter, however this causes problems when a TensorArray computation is placed in a loop since the shape of the flow value is no longer loop invariant. PiperOrigin-RevId: 163082452 --- tensorflow/compiler/tests/tensor_array_ops_test.py | 4 +++- .../compiler/tf2xla/kernels/tensor_array_ops.cc | 14 ++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py index f2773143524..ac039e01623 100644 --- a/tensorflow/compiler/tests/tensor_array_ops_test.py +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -57,11 +57,13 @@ class TensorArrayTest(xla_test.XLATestCase): r0 = w2.read(0) r1 = w2.read(1) r2 = w2.read(2) + flow = w2.flow - d0, d1, d2 = session.run([r0, r1, r2]) + d0, d1, d2, flow_val = session.run([r0, r1, r2, flow]) self.assertAllEqual([[4.0, 5.0]], d0) self.assertAllEqual([[1.0, 3.0]], d1) self.assertAllEqual([[7.0, -8.5]], d2) + self.assertAllEqual([], flow_val.shape) def _testTensorArrayWritePack(self, tf_dtype): with self.test_session(), self.test_scope(): diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index bdd52b7f8e5..34cc8b23159 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -182,7 +182,10 @@ class TensorArrayOp : public XlaOpKernel { dtype_, value, &var)); var->tensor_array_size = size; ctx->SetResourceOutput(0, var); - ctx->SetConstantOutput(1, Tensor(DT_FLOAT)); + + Tensor flow(DT_FLOAT, TensorShape({})); + flow.scalar()() = 0.0f; + ctx->SetConstantOutput(1, flow); } private: @@ -216,6 +219,7 @@ class TensorArrayWriteOp : public XlaOpKernel { xla::ComputationDataHandle ta = resource->value; xla::ComputationDataHandle index = ctx->Input(1); xla::ComputationDataHandle value = ctx->Input(2); + xla::ComputationDataHandle flow = ctx->Input(3); // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. auto start_indices = XlaHelpers::PadWithZeros(b, index, elem_shape.dims()); @@ -228,7 +232,7 @@ class TensorArrayWriteOp : public XlaOpKernel { DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices); resource->value = written; - ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + ctx->SetOutput(0, flow); } private: @@ -369,6 +373,7 @@ class TensorArrayScatterOp : public XlaOpKernel { xla::ComputationDataHandle ta = resource->value; const xla::ComputationDataHandle value = ctx->Input(2); + const xla::ComputationDataHandle flow = ctx->Input(3); auto slice_dims = value_shape.dim_sizes(); slice_dims[0] = 1LL; @@ -394,7 +399,7 @@ class TensorArrayScatterOp : public XlaOpKernel { } resource->value = ta; - ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + ctx->SetOutput(0, flow); } private: @@ -489,6 +494,7 @@ class TensorArraySplitOp : public XlaOpKernel { lengths.size(), " vs. 
", resource->tensor_array_size, ")")); const xla::ComputationDataHandle value = ctx->Input(1); + const xla::ComputationDataHandle flow = ctx->Input(3); OP_REQUIRES(ctx, value_shape.num_elements() == ta_shape.num_elements(), errors::InvalidArgument("mismatched element count ", @@ -497,7 +503,7 @@ class TensorArraySplitOp : public XlaOpKernel { resource->value = b->Add(ta, b->Reshape(value, ta_shape.dim_sizes())); - ctx->SetConstantOutput(0, Tensor(DT_FLOAT)); + ctx->SetOutput(0, flow); } private: From 1e78140f9d6a593294a13ef075de68f84f9d77ae Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Tue, 25 Jul 2017 09:54:52 -0700 Subject: [PATCH 13/56] Automated g4 rollback of changelist 163019166 PiperOrigin-RevId: 163083436 --- tensorflow/core/grappler/clusters/BUILD | 2 - .../core/grappler/clusters/single_machine.cc | 8 +- .../grappler/clusters/single_machine_test.cc | 121 ------------------ 3 files changed, 6 insertions(+), 125 deletions(-) diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD index e7230b37543..667023845cd 100644 --- a/tensorflow/core/grappler/clusters/BUILD +++ b/tensorflow/core/grappler/clusters/BUILD @@ -114,9 +114,7 @@ cc_test( deps = [ ":single_machine", "//tensorflow/cc:cc_ops", - "//tensorflow/cc:resource_variable_ops", "//tensorflow/cc:scope", - "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc index 3481b2b158d..a1531f1cfcf 100644 --- a/tensorflow/core/grappler/clusters/single_machine.cc +++ b/tensorflow/core/grappler/clusters/single_machine.cc @@ -73,6 +73,8 @@ SingleMachine::~SingleMachine() { // when we delete the session. thread_pool_.reset(); + Reset(options_, {}).IgnoreError(); + CHECK(already_created); already_created = false; } @@ -275,9 +277,11 @@ Status SingleMachine::ResetSession() { // Make sure the session is properly closed TF_RETURN_IF_ERROR(Shutdown()); - // Destroying the object deletes all its varibles as well. This is only true - // for DirectSession. + // We need to Reset the session to ensure that all the variables are + // deleted. But first we need to delete the session since Reset() + // deletes some of the containers referenced by the session. session_.reset(); + TF_RETURN_IF_ERROR(Reset(options_, {})); } LOG(INFO) << "Starting new session"; diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc index d7e2827afc9..b73b084793e 100644 --- a/tensorflow/core/grappler/clusters/single_machine_test.cc +++ b/tensorflow/core/grappler/clusters/single_machine_test.cc @@ -15,10 +15,7 @@ limitations under the License. #include "tensorflow/core/grappler/clusters/single_machine.h" #include "tensorflow/cc/framework/scope.h" -#include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/core/common_runtime/device.h" -#include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/framework/cost_graph.pb.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -27,7 +24,6 @@ limitations under the License. 
#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" -#include "tensorflow/core/protobuf/queue_runner.pb.h" namespace tensorflow { namespace grappler { @@ -353,7 +349,6 @@ TEST_F(SingleMachineTest, InitializationMemory) { } namespace { - template inline void SetNodeAttr(const string& key, const T& value, NodeDef* node) { AttrValue attr_value; @@ -468,122 +463,6 @@ TEST_F(SingleMachineTest, PersistentMemory) { EXPECT_TRUE(found_hashtable); } -namespace { - -SessionOptions GetSessionOption(int num_cpu_cores, int num_gpus) { - SessionOptions options; - // Copied from single_machine.h - (*options.config.mutable_device_count())["CPU"] = 1; - if (num_gpus > 0) { - (*options.config.mutable_device_count())["GPU"] = num_gpus; - } - CHECK_GE(num_cpu_cores, 1); - options.config.set_intra_op_parallelism_threads(num_cpu_cores); - options.config.add_session_inter_op_thread_pool()->set_num_threads( - num_cpu_cores); - return options; -} - -Status GetDeviceMemoryStats( - const SessionOptions& session_option, - std::unordered_map* allocator_stats_by_device) { - std::vector devices; - TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(session_option, - "" /* name_prefix */, &devices)); - allocator_stats_by_device->clear(); - for (Device* device : devices) { - AllocatorStats stats; - auto* allocator = device->GetAllocator(AllocatorAttributes()); - if (!allocator->TracksAllocationSizes()) { - return Status(error::INVALID_ARGUMENT, - "Tracking allocation is not enabled."); - } - allocator->GetStats(&stats); - (*allocator_stats_by_device)[device->name()] = stats; - } - return Status::OK(); -} - -} // namespace - -TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) { - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - - // Add a variable and initializer. - Output a = ops::Variable(s.WithOpName("a"), TensorShape({128, 256}), - DataType::DT_FLOAT); - Output a_init = - ops::RandomNormal(s.WithOpName("a/init"), {128, 256}, DataType::DT_FLOAT); - Output a_init_assign = ops::Assign(s.WithOpName("a/init/assign"), a, a_init); - - // Add a resource variable. - Output b = - ops::VarHandleOp(s.WithOpName("b"), DataType::DT_FLOAT, {256, 512}); - Output b_read = - ops::ReadVariableOp(s.WithOpName("b/read"), b, DataType::DT_FLOAT); - Output b_init = - ops::RandomNormal(s.WithOpName("b/init"), {256, 512}, DataType::DT_FLOAT); - auto b_init_assign = - ops::AssignVariableOp(s.WithOpName("b/init/assign"), b, b_init); - - // Add a queue. - ops::FIFOQueue queue(s.WithOpName("queue"), {DataType::DT_STRING}); - Output some_string = - ops::Const(s.WithOpName("some_string"), string("nothing")); - ops::QueueEnqueue enqueue(s.WithOpName("enqueue"), queue, {some_string}); - ops::QueueDequeue dequeue(s.WithOpName("dequeue"), queue, - {DataType::DT_STRING}); - - // Add a IdentityReader. 
- ops::IdentityReader reader(s.WithOpName("identity_reader")); - ops::ReaderRead read(s.WithOpName("read_from_queue"), reader, queue); - - Output var_mul = ops::MatMul(s.WithOpName("var_matmul"), a, b_read); - - GrapplerItem item; - TF_CHECK_OK(s.ToGraphDef(&item.graph)); - - QueueRunnerDef queue_runner; - queue_runner.set_queue_name("queue"); - *queue_runner.add_enqueue_op_name() = "enqueue"; - item.queue_runners.push_back(queue_runner); - - item.init_ops.push_back("a/init/assign"); - item.init_ops.push_back("b/init/assign"); - item.fetch.push_back("var_matmul"); - item.fetch.push_back("dequeue"); - - // Run the graph - TF_CHECK_OK(cluster_->Initialize(item)); - EnableCPUAllocatorStats(true); - - SessionOptions options = - GetSessionOption(3 /* cpu cores */, 0 /* num gpus */); - std::unordered_map device_memory_before; - TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_before)); - EXPECT_EQ(device_memory_before.size(), 1); - - RunMetadata metadata; - TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata)); - - // Check there is memory that is not released. - std::unordered_map device_memory; - TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory)); - EXPECT_EQ(device_memory.size(), 1); - EXPECT_GT(device_memory.begin()->second.bytes_in_use, 0); - - // Reset cluster_ would release all memory. - cluster_.reset(); - std::unordered_map device_memory_after; - TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_after)); - - // Check memory used by resources are released after cluster destruction. - EXPECT_EQ(device_memory_before.size(), 1); - EXPECT_EQ(device_memory_after.size(), 1); - EXPECT_EQ(device_memory_before.begin()->second.bytes_in_use, 0); - EXPECT_EQ(device_memory_after.begin()->second.bytes_in_use, 0); -} - } // namespace } // namespace grappler } // namespace tensorflow From 72e7fdc3a569670d8cdf5d6b9a7387848d1c5d5a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 10:14:47 -0700 Subject: [PATCH 14/56] Automated g4 rollback of changelist 162769374 PiperOrigin-RevId: 163086518 --- tensorflow/core/platform/default/build_config_root.bzl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl index 1a59c471e5a..fa4ac4ba73f 100644 --- a/tensorflow/core/platform/default/build_config_root.bzl +++ b/tensorflow/core/platform/default/build_config_root.bzl @@ -3,10 +3,10 @@ # be separate to avoid cyclic references. def tf_cuda_tests_tags(): - return [] + return ["local"] def tf_sycl_tests_tags(): - return [] + return ["local"] def tf_additional_plugin_deps(): return select({ @@ -28,7 +28,7 @@ def tf_additional_verbs_deps(): "//tensorflow:with_verbs_support": [ "//tensorflow/contrib/verbs:verbs_server_lib", "//tensorflow/contrib/verbs:grpc_verbs_client", - ], + ], "//conditions:default": [], }) From 62ea787c20e7c1ae9848fd5f7f2f0aa8b0d8f515 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 10:28:22 -0700 Subject: [PATCH 15/56] internal change PiperOrigin-RevId: 163088509 --- tensorflow/core/platform/macros.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h index eaf0171e72d..47523c7d2b0 100644 --- a/tensorflow/core/platform/macros.h +++ b/tensorflow/core/platform/macros.h @@ -20,6 +20,7 @@ limitations under the License. 
#if (defined(__GNUC__) || defined(__APPLE__)) && !defined(SWIG) // Compiler supports GCC-style attributes #define TF_ATTRIBUTE_NORETURN __attribute__((noreturn)) +#define TF_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline)) #define TF_ATTRIBUTE_NOINLINE __attribute__((noinline)) #define TF_ATTRIBUTE_UNUSED __attribute__((unused)) #define TF_ATTRIBUTE_COLD __attribute__((cold)) @@ -33,6 +34,7 @@ limitations under the License. #elif defined(COMPILER_MSVC) // Non-GCC equivalents #define TF_ATTRIBUTE_NORETURN __declspec(noreturn) +#define TF_ATTRIBUTE_ALWAYS_INLINE #define TF_ATTRIBUTE_NOINLINE #define TF_ATTRIBUTE_UNUSED #define TF_ATTRIBUTE_COLD @@ -43,6 +45,7 @@ limitations under the License. #else // Non-GCC equivalents #define TF_ATTRIBUTE_NORETURN +#define TF_ATTRIBUTE_ALWAYS_INLINE #define TF_ATTRIBUTE_NOINLINE #define TF_ATTRIBUTE_UNUSED #define TF_ATTRIBUTE_COLD From a2e63e96be4945680a81e44e4b0486cbaa476185 Mon Sep 17 00:00:00 2001 From: Thomas Schumm Date: Tue, 25 Jul 2017 10:34:27 -0700 Subject: [PATCH 16/56] Clarify docstring for tf.rank. PiperOrigin-RevId: 163089480 --- tensorflow/python/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index bb86640eaba..f64c89ac5d2 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -330,7 +330,7 @@ def rank(input, name=None): # pylint: disable=redefined-builtin """Returns the rank of a tensor. - This operation returns an integer representing the rank of `input`. + Returns a 0-D `int32` `Tensor` representing the rank of `input`. For example: From abcc81ddbcf5bb49488667619cb054e269b6d2ab Mon Sep 17 00:00:00 2001 From: RJ Ryan Date: Tue, 25 Jul 2017 10:41:06 -0700 Subject: [PATCH 17/56] Reduce gather_op_test timeouts by reducing the size of testHigherRank. PiperOrigin-RevId: 163090428 --- .../python/kernel_tests/gather_op_test.py | 100 +++++++++--------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index 04d65b88a1c..9a946925693 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -88,58 +88,58 @@ class GatherTest(test.TestCase): def testHigherRank(self): # We check that scalar and empty indices shapes work as well - for shape in (4, 3, 2), (2, 1, 3, 2): - for indices_shape in (), (0,), (3, 0), (3, 5), (5, 2, 3): - for dtype in _TEST_TYPES: - for axis in range(len(shape)): - params = self._buildParams(np.random.randn(*shape), dtype) - indices = np.random.randint(shape[axis], size=indices_shape) - with self.test_session(use_gpu=True) as sess: - tf_params = constant_op.constant(params) - tf_indices = constant_op.constant(indices) - # Check that both positive and negative indices for axis work. 
- tf_axis = constant_op.constant(axis) - tf_negative_axis = constant_op.constant(-len(shape) + axis) - gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis) - gather_negative_axis = array_ops.gather( - tf_params, tf_indices, axis=tf_negative_axis) - gather_value, gather_negative_axis_value = sess.run( - [gather, gather_negative_axis]) - gather_np = np.take(params, indices, axis) - self.assertAllEqual(gather_np, gather_value) - self.assertAllEqual(gather_np, gather_negative_axis_value) - expected_shape = (params.shape[:axis] + indices.shape + - params.shape[axis + 1:]) - self.assertEqual(expected_shape, gather.shape) - self.assertEqual(expected_shape, gather_negative_axis.shape) + shape = (2, 1, 3, 2) + for indices_shape in (), (0,), (2, 0), (2, 3): + for dtype in _TEST_TYPES: + for axis in range(len(shape)): + params = self._buildParams(np.random.randn(*shape), dtype) + indices = np.random.randint(shape[axis], size=indices_shape) + with self.test_session(use_gpu=True) as sess: + tf_params = constant_op.constant(params) + tf_indices = constant_op.constant(indices) + # Check that both positive and negative indices for axis work. + tf_axis = constant_op.constant(axis) + tf_negative_axis = constant_op.constant(-len(shape) + axis) + gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis) + gather_negative_axis = array_ops.gather( + tf_params, tf_indices, axis=tf_negative_axis) + gather_value, gather_negative_axis_value = sess.run( + [gather, gather_negative_axis]) + gather_np = np.take(params, indices, axis) + self.assertAllEqual(gather_np, gather_value) + self.assertAllEqual(gather_np, gather_negative_axis_value) + expected_shape = (params.shape[:axis] + indices.shape + + params.shape[axis + 1:]) + self.assertEqual(expected_shape, gather.shape) + self.assertEqual(expected_shape, gather_negative_axis.shape) - # Test gradients - gather_grad = np.random.randn( - *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype) - if dtype.is_complex: - gather_grad -= 1j * gather_grad - params_grad, indices_grad, axis_grad = gradients_impl.gradients( - gather, [tf_params, tf_indices, tf_axis], gather_grad) - self.assertEqual(indices_grad, None) - self.assertEqual(axis_grad, None) - # For axis 0, we are able to create an efficient IndexedSlices for - # the gradient. - if axis == 0: - self.assertEqual(type(params_grad), ops.IndexedSlices) - params_grad = ops.convert_to_tensor(params_grad) - correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype) - outer_dims = axis - inner_dims = len(shape) - axis - 1 - gather_grad = gather_grad.reshape( - shape[:axis] + (indices.size,) + shape[axis + 1:]) - for source_index, dest_index in enumerate(indices.flat): - dest_slice = ((slice(None),) * outer_dims + (dest_index,) + + # Test gradients + gather_grad = np.random.randn( + *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype) + if dtype.is_complex: + gather_grad -= 1j * gather_grad + params_grad, indices_grad, axis_grad = gradients_impl.gradients( + gather, [tf_params, tf_indices, tf_axis], gather_grad) + self.assertEqual(indices_grad, None) + self.assertEqual(axis_grad, None) + # For axis 0, we are able to create an efficient IndexedSlices for + # the gradient. 
+ if axis == 0: + self.assertEqual(type(params_grad), ops.IndexedSlices) + params_grad = ops.convert_to_tensor(params_grad) + correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype) + outer_dims = axis + inner_dims = len(shape) - axis - 1 + gather_grad = gather_grad.reshape( + shape[:axis] + (indices.size,) + shape[axis + 1:]) + for source_index, dest_index in enumerate(indices.flat): + dest_slice = ((slice(None),) * outer_dims + (dest_index,) + + (slice(None),) * inner_dims) + source_slice = ((slice(None),) * outer_dims + (source_index,) + (slice(None),) * inner_dims) - source_slice = ((slice(None),) * outer_dims + (source_index,) + - (slice(None),) * inner_dims) - correct_params_grad[dest_slice] += gather_grad[source_slice] - self.assertAllClose(correct_params_grad, params_grad.eval(), - atol=2e-6, rtol=2e-6) + correct_params_grad[dest_slice] += gather_grad[source_slice] + self.assertAllClose(correct_params_grad, params_grad.eval(), + atol=2e-6, rtol=2e-6) def testString(self): params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]]) From a7a69520e1b0c0caff6e1213740cc7ad7f2f95b2 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Tue, 25 Jul 2017 10:43:51 -0700 Subject: [PATCH 18/56] Add PopulationCount op (popcnt): element-wise counts the number of "on" bits. PiperOrigin-RevId: 163090921 --- tensorflow/contrib/makefile/tf_op_files.txt | 1 + tensorflow/core/kernels/BUILD | 9 + .../core/kernels/population_count_op.cc | 163 ++++++++++++++++++ tensorflow/core/kernels/population_count_op.h | 38 ++++ .../kernels/population_count_op_gpu.cu.cc | 92 ++++++++++ tensorflow/core/ops/bitwise_ops.cc | 16 ++ tensorflow/python/ops/bitwise_ops.py | 1 + tensorflow/python/ops/bitwise_ops_test.py | 23 +++ 8 files changed, 343 insertions(+) create mode 100644 tensorflow/core/kernels/population_count_op.cc create mode 100644 tensorflow/core/kernels/population_count_op.h create mode 100644 tensorflow/core/kernels/population_count_op_gpu.cu.cc diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index 4a3b3e77628..0511f8a65c9 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -197,6 +197,7 @@ tensorflow/core/kernels/aggregate_ops.cc tensorflow/core/kernels/depthwise_conv_op.cc tensorflow/core/kernels/dequantize_op.cc tensorflow/core/kernels/meta_support.cc +tensorflow/core/kernels/population_count_op.cc tensorflow/core/kernels/quantization_utils.cc tensorflow/core/kernels/quantize_down_and_shrink_range.cc tensorflow/core/kernels/quantize_op.cc diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index fffcb980db2..d35a96a24a4 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2348,6 +2348,7 @@ cc_library( ":cwise_op", ":fft_ops", ":matmul_op", + ":population_count_op", ":reduction_ops", ":scan_ops", ":segment_reduction_ops", @@ -2409,6 +2410,12 @@ tf_kernel_library( deps = MATH_DEPS + ["//tensorflow/core:bitwise_ops_op_lib"], ) +tf_kernel_library( + name = "population_count_op", + prefix = "population_count_op", + deps = MATH_DEPS, +) + tf_kernel_library( name = "fft_ops", prefix = "fft_ops", @@ -4292,6 +4299,8 @@ filegroup( "fake_quant_ops.cc", "fifo_queue.cc", "fused_batch_norm_op.cc", + "population_count_op.cc", + "population_count_op.h", "winograd_transform.h", ":android_extended_ops_headers", ] + select({ diff --git a/tensorflow/core/kernels/population_count_op.cc b/tensorflow/core/kernels/population_count_op.cc new file mode 100644 
index 00000000000..12ff6b69f87 --- /dev/null +++ b/tensorflow/core/kernels/population_count_op.cc @@ -0,0 +1,163 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/math_ops.cc + +#define EIGEN_USE_THREADS + +#include + +#include "tensorflow/core/kernels/population_count_op.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +class PopulationCountOp : public OpKernel { + public: + explicit PopulationCountOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* c) override { + const Tensor& input_t = c->input(0); + Tensor* output_t; + OP_REQUIRES_OK(c, c->allocate_output(0, input_t.shape(), &output_t)); + + auto input = input_t.flat(); + auto output = output_t->flat(); + + functor::PopulationCount popcnt; + popcnt(c, input, output); + } +}; + +#define REGISTER_POPULATION_COUNT(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("PopulationCount").Device(DEVICE_CPU).TypeConstraint("T"), \ + PopulationCountOp); + +TF_CALL_uint8(REGISTER_POPULATION_COUNT); +TF_CALL_int8(REGISTER_POPULATION_COUNT); +TF_CALL_uint16(REGISTER_POPULATION_COUNT); +TF_CALL_int16(REGISTER_POPULATION_COUNT); +TF_CALL_int32(REGISTER_POPULATION_COUNT); +TF_CALL_int64(REGISTER_POPULATION_COUNT); + +#undef REGISTER_POPULATION_COUNT + +namespace functor { + +namespace { + +template +inline uint8 PopCnt(const T v); + +#define POPCNT(T, N) \ + template <> \ + uint8 PopCnt(const T v) { \ + return std::bitset(v).count(); \ + } + +POPCNT(int8, 8); +POPCNT(uint8, 8); +POPCNT(int16, 16); +POPCNT(uint16, 16); +POPCNT(int32, 32); +POPCNT(int64, 64); + +#undef POPCNT + +} // namespace + +template +struct PopulationCount { + void operator()(OpKernelContext* c, typename TTypes::ConstFlat input, + TTypes::Flat output) { + const T* input_ptr = input.data(); + uint8* output_ptr = output.data(); + auto shard = [input_ptr, output_ptr](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + output_ptr[i] = PopCnt(input_ptr[i]); + } + }; + int64 total_shards = input.size(); + // Approximating cost of popcnt: convert T to int64 + // (std::bitset constructor) and convert int64 to uint8 + // (bitset.count() -> output). The .count() itself is relatively cheap. + const double total_cost = (Eigen::TensorOpCost::CastCost() + + Eigen::TensorOpCost::CastCost()); + const int64 shard_cost = (total_cost >= static_cast(kint64max)) + ? 
kint64max + : static_cast(total_cost); + + auto worker_threads = *(c->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, total_shards, + shard_cost, shard); + } +}; + +} // namespace functor + +#if GOOGLE_CUDA + +#define REGISTER_POPULATION_COUNT(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("PopulationCount").Device(DEVICE_GPU).TypeConstraint("T"), \ + PopulationCountOp) + +TF_CALL_uint8(REGISTER_POPULATION_COUNT); +TF_CALL_int8(REGISTER_POPULATION_COUNT); +TF_CALL_uint16(REGISTER_POPULATION_COUNT); +TF_CALL_int16(REGISTER_POPULATION_COUNT); +TF_CALL_int32(REGISTER_POPULATION_COUNT); +TF_CALL_int64(REGISTER_POPULATION_COUNT); + +#undef REGISTER_POPULATION_COUNT + +namespace functor { + +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void PopulationCount::operator()( \ + OpKernelContext* c, typename TTypes::ConstFlat input, \ + TTypes::Flat output); \ + extern template struct PopulationCount + +TF_CALL_uint8(DECLARE_GPU_SPEC); +TF_CALL_int8(DECLARE_GPU_SPEC); +TF_CALL_uint16(DECLARE_GPU_SPEC); +TF_CALL_int16(DECLARE_GPU_SPEC); +TF_CALL_int32(DECLARE_GPU_SPEC); +TF_CALL_int64(DECLARE_GPU_SPEC); + +#undef DECLARE_GPU_SPEC + +} // namespace functor + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/population_count_op.h b/tensorflow/core/kernels/population_count_op.h new file mode 100644 index 00000000000..de89582e139 --- /dev/null +++ b/tensorflow/core/kernels/population_count_op.h @@ -0,0 +1,38 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +template +struct PopulationCount { + void operator()(OpKernelContext* c, typename TTypes::ConstFlat input, + TTypes::Flat output); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_ diff --git a/tensorflow/core/kernels/population_count_op_gpu.cu.cc b/tensorflow/core/kernels/population_count_op_gpu.cu.cc new file mode 100644 index 00000000000..27a687ba409 --- /dev/null +++ b/tensorflow/core/kernels/population_count_op_gpu.cu.cc @@ -0,0 +1,92 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/population_count_op.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +__global__ void PopulationCountKernel(const int size, const T* input, + uint8* output) { + CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); } +} + +template <> +__global__ void PopulationCountKernel(const int size, const int8* input, + uint8* output) { + // For some reason, __popc on a negative int8 gets confused. + CUDA_1D_KERNEL_LOOP(i, size) { + output[i] = __popc(ldg(reinterpret_cast(input + i))); + } +} + +template <> +__global__ void PopulationCountKernel(const int size, const int16* input, + uint8* output) { + // For some reason, __popc on a negative int16 gets confused. + CUDA_1D_KERNEL_LOOP(i, size) { + output[i] = __popc(ldg(reinterpret_cast(input + i))); + } +} + +template <> +__global__ void PopulationCountKernel(const int size, const int64* input, + uint8* output) { + CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); } +} + +#define DEFINE_GPU_SPECS(T) \ + template <> \ + void PopulationCount::operator()( \ + OpKernelContext* c, typename TTypes::ConstFlat input, \ + TTypes::Flat output) { \ + const GPUDevice& d = c->eigen_device(); \ + int64 total_count = input.size(); \ + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); \ + PopulationCountKernel \ + <<>>( \ + total_count, input.data(), output.data()); \ + } + +TF_CALL_uint8(DEFINE_GPU_SPECS); +TF_CALL_int8(DEFINE_GPU_SPECS); +TF_CALL_uint16(DEFINE_GPU_SPECS); +TF_CALL_int16(DEFINE_GPU_SPECS); +TF_CALL_int32(DEFINE_GPU_SPECS); +TF_CALL_int64(DEFINE_GPU_SPECS); + +#undef DEFINE_GPU_SPECS + +} // namespace functor + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/ops/bitwise_ops.cc b/tensorflow/core/ops/bitwise_ops.cc index 2005d5e1028..3ffc4ab74af 100644 --- a/tensorflow/core/ops/bitwise_ops.cc +++ b/tensorflow/core/ops/bitwise_ops.cc @@ -40,6 +40,22 @@ computation is performed on the underlying representation of x. .Attr("T: {int8, int16, int32, int64, uint8, uint16}") \ .SetShapeFn(shape_inference::UnchangedShape) +REGISTER_OP("PopulationCount") + .Input("x: T") + .Output("y: uint8") + .Attr("T: {int8, int16, int32, int64, uint8, uint16}") + .SetShapeFn(shape_inference::UnchangedShape) + .Doc(R"doc( +Computes element-wise population count (a.k.a. popcount, bitsum, bitcount). + +For each entry in `x`, calculates the number of `1` (on) bits in the binary +representation of that entry. 
+ +**NOTE**: It is more efficient to first `tf.bitcast` your tensors into +`int32` or `int64` and perform the bitcount on the result, than to feed in +8- or 16-bit inputs and then aggregate the resulting counts. +)doc"); + REGISTER_OP("BitwiseAnd").BINARY_BITWISE().Doc(R"doc( Elementwise computes the bitwise AND of `x` and `y`. diff --git a/tensorflow/python/ops/bitwise_ops.py b/tensorflow/python/ops/bitwise_ops.py index cbabc3ed9ba..44daf135370 100644 --- a/tensorflow/python/ops/bitwise_ops.py +++ b/tensorflow/python/ops/bitwise_ops.py @@ -36,5 +36,6 @@ ops.NotDifferentiable("BitwiseAnd") ops.NotDifferentiable("BitwiseOr") ops.NotDifferentiable("BitwiseXor") ops.NotDifferentiable("Invert") +ops.NotDifferentiable("PopulationCount") remove_undocumented(__name__) diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py index 904cf99a5ab..1d08c8f82dc 100644 --- a/tensorflow/python/ops/bitwise_ops_test.py +++ b/tensorflow/python/ops/bitwise_ops_test.py @@ -18,10 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np +import six + from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util from tensorflow.python.ops import bitwise_ops +from tensorflow.python.ops import gen_bitwise_ops from tensorflow.python.platform import googletest @@ -46,6 +50,25 @@ class BitwiseOpTest(test_util.TensorFlowTestCase): self.assertAllEqual(or_result, [5, 5, 7, 15]) self.assertAllEqual(xor_result, [5, 5, 4, 5]) + def testPopulationCountOp(self): + dtype_list = [dtypes.int8, dtypes.int16, + dtypes.int32, dtypes.int64, + dtypes.uint8, dtypes.uint16] + raw_inputs = [0, 1, -1, 3, -3, 5, -5, 14, -14, + 127, 128, 255, 256, 65535, 65536, + 2**31 - 1, 2**31, 2**32 - 1, 2**32, -2**32 + 1, -2**32, + -2**63 + 1, 2**63 - 1] + def count_bits(x): + return sum([bin(z).count("1") for z in six.iterbytes(x.tobytes())]) + for dtype in dtype_list: + with self.test_session(use_gpu=True) as sess: + print("PopulationCount test: ", dtype) + inputs = np.array(raw_inputs, dtype=dtype.as_numpy_dtype) + truth = [count_bits(x) for x in inputs] + input_tensor = constant_op.constant(inputs, dtype=dtype) + popcnt_result = sess.run(gen_bitwise_ops.population_count(input_tensor)) + self.assertAllEqual(truth, popcnt_result) + def testInvertOp(self): dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16] From d05bf9a30f8297fea4fa391702d17203767d0c3c Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 25 Jul 2017 10:47:21 -0700 Subject: [PATCH 19/56] Show fusion nodes inline in HLO graph dumper. To make this work sanely I had to change NodeFilter so that it says to dump all nodes inside subcomputations. Previously, we passed an explicit NodeFilter down to DumpSubcomputation, and used that to control whether or not we dumped nodes in there. But this becomes unwieldy with inline fusion nodes, as sometimes you want to look at 'filter', and other times you want to look at 'filter_', and there's no good way to tell why. I also had to remove the heuristic whereby we'd pull in operands of nodes with just some operands shown. With the much bigger nodes that are generated by this change, the graph was becoming illegible. I think most of the confusion that heuristic was attempting to avoid is addressed by the fact that we "gray out" incomplete nodes. 
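As a rough illustration of the new behavior (this paraphrases the MakeNodeFilter change in the diff below and is only a sketch, not a complete implementation; HloInstruction, the `nodes` map, and the kNormalNode/kHideNode values are the ones already defined in hlo_graph_dumper.cc):

    // Sketch: the filter returned by MakeNodeFilter now defaults to showing
    // any node that lives inside a subcomputation, so fusion bodies can be
    // drawn inline instead of as separate clusters.
    return NodeFilter([=](const HloInstruction* instr) {
      auto it = nodes.find(instr);
      if (it != nodes.end()) {
        return it->second;
      }
      // Show all nodes in subcomputations.
      if (instr->parent() != root->parent()) {
        return kNormalNode;
      }
      return kHideNode;
    });
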
PiperOrigin-RevId: 163091423 --- .../compiler/xla/service/hlo_graph_dumper.cc | 234 ++++++++++-------- 1 file changed, 124 insertions(+), 110 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index f588df001c6..c6202548f1c 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -127,6 +127,11 @@ class NodeFilter { result == kSomeUsersOmitted; } + bool ShowFusionSubcomputation(const HloInstruction* instr) const { + CHECK_EQ(instr->opcode(), HloOpcode::kFusion); + return Show(instr) && !SomeOrAllOperandsOmitted(instr); + } + private: std::function filter_; }; @@ -334,15 +339,14 @@ class HloDotDumper { string DumpSubcomputation(const HloComputation* subcomp, const HloInstruction* parent_instr); - string DumpComputation(const HloComputation* comp, const NodeFilter& filter); - string DumpInstruction(const HloInstruction* instr, const NodeFilter& filter); + string DumpComputation(const HloComputation* comp); + string DumpInstruction(const HloInstruction* instr); ColorScheme GetInstructionColor(const HloInstruction* instr); string GetInstructionNodeShape(const HloInstruction* instr); string GetInstructionNodeLabel(const HloInstruction* instr); string GetInstructionNodeExtraInfo(const HloInstruction* instr); - string GetInstructionIncomingEdges(const HloInstruction* instr, - const NodeFilter& filter); string GetInstructionNodeInlinedConstants(const HloInstruction* instr); + void AddInstructionIncomingEdges(const HloInstruction* instr); // If instr has just one computation and it's trivial (e.g. "return param0 + // param1"), returns a string you can put into the node's body that names the @@ -355,6 +359,12 @@ class HloDotDumper { const bool show_layouts_; const HloExecutionProfile* profile_; // may be null const NodeFilter filter_; + + // Edges to print from Footer(). Edges come at the end because graphviz is + // unhappy if an edge from a subcomputation to a node in the outer computation + // appears before both the inner computation and the destination node are + // defined. + std::vector edges_; }; string HloDotDumper::Dump() { @@ -364,7 +374,7 @@ string HloDotDumper::Dump() { const HloInstruction* parent = kv.second; StrAppend(&g, DumpSubcomputation(subcomp, parent)); } - StrAppend(&g, DumpComputation(computation_, filter_)); + StrAppend(&g, DumpComputation(computation_)); StrAppend(&g, Footer()); return g; } @@ -397,7 +407,7 @@ stylesheet=" return Printf(fmt, graph_label); } -string HloDotDumper::Footer() { return "}\n"; } +string HloDotDumper::Footer() { return StrCat(Join(edges_, "\n"), "\n}"); } std::unordered_map HloDotDumper::SubcomputationsToDump() { @@ -427,8 +437,7 @@ HloDotDumper::SubcomputationsToDump() { string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp, const HloInstruction* parent_instr) { const char* computation_fmt = R"(subgraph %s { -style = "%s"; -color = "%s"; +%s; label = <%s>; labelloc = t; %s @@ -436,57 +445,68 @@ labelloc = t; )"; - const char* edge_fmt = R"(%s -> %s [ltail="%s", %s];)"; - string id = SubcomputationId(subcomp); - string subcomp_label, style, edge_or_bg_color, edge_attrs; + string subcomp_label, style; if (parent_instr->opcode() == HloOpcode::kFusion) { subcomp_label = Printf("Fused expression for %s
%s", HtmlLikeStringSanitize(parent_instr->name()), HtmlLikeStringSanitize(parent_instr->ToCategory())); - style = "rounded,filled"; - edge_or_bg_color = "lightgray"; - edge_attrs = "style=dotted, arrowsize=0"; + + // Subcomputation's fill/stroke color is light/dark red/gray, depending on + // whether or not the subcomputation's fusion node is highlighted. + bool highlight = filter_.Highlight(parent_instr); + const char* fillcolor = highlight ? "#ffcdd2" : "#f5f5f5"; + const char* strokecolor = highlight ? "#b71c1c" : "#c2c2c2"; + style = Printf(R"(style="rounded,filled,bold"; fillcolor="%s"; color="%s")", + fillcolor, strokecolor); } else { subcomp_label = Printf("Subcomputation for %s
%s", HtmlLikeStringSanitize(parent_instr->name()), HtmlLikeStringSanitize(subcomp->name())); - style = "rounded"; - edge_or_bg_color = "black"; - edge_attrs = "style=dashed"; + style = "style=rounded; color=black;"; } - // Pass an empty filter to DumpComputation -- we always dump the entirety of a - // subcomputation. - string comp_body = DumpComputation(subcomp, NodeFilter()); - string computation = Printf(computation_fmt, id, style, edge_or_bg_color, - subcomp_label, comp_body, id); - string edge = Printf(edge_fmt, InstructionId(subcomp->root_instruction()), - InstructionId(parent_instr), SubcomputationId(subcomp), - edge_attrs); - return StrCat(computation, "\n", edge, "\n"); + string comp_body = DumpComputation(subcomp); + string computation = + Printf(computation_fmt, id, style, subcomp_label, comp_body, id); + + // Add an edge from the subcomputation to its parent node. If subcomp + // belongs to a fusion node, it's drawn in place of the fusion instruction, so + // there's no need to link those. + if (parent_instr->opcode() != HloOpcode::kFusion) { + const char* edge_fmt = R"(%s -> %s [ltail="%s", style="dashed"];)"; + edges_.push_back( + Printf(edge_fmt, InstructionId(subcomp->root_instruction()), + InstructionId(parent_instr), SubcomputationId(subcomp))); + } + + return computation; } -string HloDotDumper::DumpComputation(const HloComputation* comp, - const NodeFilter& filter) { +string HloDotDumper::DumpComputation(const HloComputation* comp) { string g; for (const auto& instr : comp->instructions()) { - if (!filter.Show(instr.get())) { + if (!filter_.Show(instr.get())) { continue; } - StrAppend(&g, DumpInstruction(instr.get(), filter)); + StrAppend(&g, DumpInstruction(instr.get())); } return g; } -string HloDotDumper::DumpInstruction(const HloInstruction* instr, - const NodeFilter& filter) { +string HloDotDumper::DumpInstruction(const HloInstruction* instr) { // We don't display constants as separate nodes; they're merged into their // users. if (instr->opcode() == HloOpcode::kConstant) { return ""; } + // Omit the fusion node if its subcomputation is drawn, since the + // subcomputation will be drawn inline. + if (instr->opcode() == HloOpcode::kFusion && + filter_.ShowFusionSubcomputation(instr)) { + return ""; + } ColorScheme color = GetInstructionColor(instr); string node_shape = GetInstructionNodeShape(instr); @@ -494,14 +514,13 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr, string extra_info = GetInstructionNodeExtraInfo(instr); string inlined_constants = GetInstructionNodeInlinedConstants(instr); string trivial_subcomputation = GetInstructionTrivialComputationStr(instr); - - string in_edges = GetInstructionIncomingEdges(instr, filter); + AddInstructionIncomingEdges(instr); // Override the node's styling if it should be (de-)emphasized. 
- if (filter.Deemphasized(instr)) { + if (filter_.Deemphasized(instr)) { color = kDashedBorder; } - if (filter.Highlight(instr)) { + if (filter_.Highlight(instr)) { node_shape = "diamond"; color = kDarkRed; } @@ -515,34 +534,44 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr, } } - string node = Printf("%s [label=<%s>, shape=%s, %s];", InstructionId(instr), - node_body, node_shape, NodeColorAttributes(color)); - return StrCat(node, "\n", in_edges); + return Printf("%s [label=<%s>, shape=%s, %s];\n", InstructionId(instr), + node_body, node_shape, NodeColorAttributes(color)); } string HloDotDumper::GetInstructionNodeInlinedConstants( const HloInstruction* instr) { + auto stringify_constant = [](const HloInstruction* constant) { + if (ShapeUtil::IsEffectiveScalar(constant->shape())) { + auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex( + constant->shape(), /*linear_index=*/0); + return Printf("%s{%s}", ShapeUtil::HumanString(constant->shape()), + constant->literal().GetAsString(elem_idx)); + } + if (tensorflow::StringPiece(constant->name()).starts_with("%constant")) { + return constant->name(); + } + return StrCat("constant ", constant->name()); + }; + + // Special case: If instr is a parameter to a fusion node, check whether the + // corresponding operand to the fusion node is a constant. + if (instr->opcode() == HloOpcode::kParameter && instr->IsFused()) { + const HloInstruction* fusion = instr->fusion_instruction(); + const HloInstruction* operand = fusion->operand(instr->parameter_number()); + if (operand->opcode() != HloOpcode::kConstant) { + return ""; + } + return stringify_constant(operand); + } + std::vector lines; for (int64 i = 0; i < instr->operand_count(); ++i) { const HloInstruction* operand = instr->operand(i); if (operand->opcode() != HloOpcode::kConstant) { continue; } - - string line = Printf("operand %lld = ", i); - if (ShapeUtil::IsEffectiveScalar(operand->shape())) { - auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex( - operand->shape(), /*linear_index=*/0); - Appendf(&line, "%s{%s}", ShapeUtil::HumanString(operand->shape()), - operand->literal().GetAsString(elem_idx)); - } else { - if (tensorflow::StringPiece(operand->name()).starts_with("%constant")) { - StrAppend(&line, operand->name()); - } else { - StrAppend(&line, "constant ", operand->name()); - } - } - lines.push_back(line); + lines.push_back( + Printf("operand %lld = %s", i, stringify_constant(operand))); } return Join(lines, "
"); } @@ -648,8 +677,7 @@ string HloDotDumper::GetInstructionNodeShape(const HloInstruction* instr) { string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) { // If we have a parameter, put the param number in the name. if (instr->opcode() == HloOpcode::kParameter) { - return Printf("Parameter %lld
%s", instr->parameter_number(), - HtmlLikeStringSanitize(instr->name())); + return Printf("Parameter %lld", instr->parameter_number()); } // The HLO instruction name contains usually the opcode, e.g. "%add.42" is @@ -735,23 +763,39 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { return Join(lines, "
"); } -string HloDotDumper::GetInstructionIncomingEdges(const HloInstruction* instr, - const NodeFilter& filter) { - std::vector edges; - for (int64 i = 0; i < instr->operand_count(); ++i) { - const HloInstruction* operand = instr->operand(i); - if (!filter.Show(operand) || operand->opcode() == HloOpcode::kConstant) { - continue; +void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) { + auto add_edge = [&](const HloInstruction* from, const HloInstruction* to, + int64 operand_num) { + // Fusion nodes' subcomputations are displayed inline, so if 'from' is a + // fusion node and the node's subcomputation is shown, we draw our edge + // starting at the fusion node's root instead of at the fusion node itself. + if (from->opcode() == HloOpcode::kFusion && + filter_.ShowFusionSubcomputation(from)) { + from = from->fused_expression_root(); } - string edge = - Printf("%s -> %s", InstructionId(operand), InstructionId(instr)); + if (!filter_.Show(from) || from->opcode() == HloOpcode::kConstant) { + return; + } + string edge = Printf("%s -> %s", InstructionId(from), InstructionId(to)); if (instr->operand_count() > 1) { - Appendf(&edge, R"( [headlabel="%lld",labeldistance=2])", i); + Appendf(&edge, R"( [headlabel="%lld",labeldistance=2])", operand_num); } StrAppend(&edge, ";"); - edges.push_back(edge); + edges_.push_back(edge); + }; + + // Add edges from instr's operands to instr. Parameters within fusion + // expressions are handled specially -- we draw an edge from the corresponding + // operand on the fusion node itself to the parameter. + if (instr->opcode() == HloOpcode::kParameter && instr->IsFused()) { + const HloInstruction* fusion = instr->fusion_instruction(); + add_edge(fusion->operand(instr->parameter_number()), instr, + /*operand_num=*/0); + } else { + for (int64 i = 0; i < instr->operand_count(); ++i) { + add_edge(instr->operand(i), instr, i); + } } - return Join(edges, "\n"); } string HloDotDumper::GetInstructionTrivialComputationStr( @@ -847,14 +891,6 @@ class FileGraphRenderer : public GraphRendererInterface { // Gets a NodeFilter that includes roughly all instructions whose distance from // root is <= radius. -// -// It's confusing to draw a node and include only some of its operands. So if -// some but not all of a node's operands are <= radius units away from the root, -// we include the other operands (unless there are a lot of them, as often in a -// tuple node). These additional operands may have as inputs other nodes -// already present in the graph, but we don't draw those edges unless *all* of -// the inputs are present. (Otherwise we'd have the same problem we were trying -// to solve in the first place!) NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { // First, find the neighborhood of nodes with distance from root <= radius. // These nodes are our initial set of "normal" nodes. @@ -912,48 +948,22 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { return nodes.count(instr) > 0; }; - // If a node has some but not all of its operands omitted, add the operands to - // the map with type kOmitNodeOperands. Unless the node has a lot of - // operands, in which case just mark the node as "some operands omitted". - std::vector extra_operands; + // Mark nodes which don't have all of their operands present as "some operands + // omitted". 
for (auto& kv : nodes) { const HloInstruction* instr = kv.first; NodeFilterResult& filter_result = kv.second; const auto& operands = instr->operands(); - // Mark nodes with many operands and some omitted as "some operands omitted" - // and carry on -- don't add their omitted operands to extra_operands. - if (operands.size() > 4) { - if (std::any_of(operands.begin(), operands.end(), is_displayed) && - !std::all_of(operands.begin(), operands.end(), is_displayed)) { - filter_result = kSomeOperandsOmitted; - } - continue; - } - - if (std::any_of(operands.begin(), operands.end(), is_displayed)) { - for (const HloInstruction* operand : operands) { - if (!is_displayed(operand)) { - extra_operands.push_back(operand); - } - } - } - } - for (const HloInstruction* instr : extra_operands) { - nodes[instr] = kOmitNodeOperands; - } - - // Some of the nodes in extra_operands may now have all of their inputs - // present in nodes. We can promote these to normal nodes. - for (const HloInstruction* instr : extra_operands) { - const auto& operands = instr->operands(); - if (std::all_of(operands.begin(), operands.end(), is_displayed)) { - nodes[instr] = kNormalNode; + // Mark nodes with some omitted as "some operands omitted". + if (std::any_of(operands.begin(), operands.end(), is_displayed) && + !std::all_of(operands.begin(), operands.end(), is_displayed)) { + filter_result = kSomeOperandsOmitted; } } - // Similarly, promote nodes with type kSomeUsersOmitted to kNormalNode if all - // of their users made it into the graph by other means. + // Promote nodes with type kSomeUsersOmitted to kNormalNode if all of their + // users made it into the graph by other means. for (auto& kv : nodes) { const auto& users = kv.first->users(); if (kv.second == kSomeUsersOmitted && @@ -980,6 +990,10 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) { if (it != nodes.end()) { return it->second; } + // Show all nodes in subcomputations. + if (instr->parent() != root->parent()) { + return kNormalNode; + } return kHideNode; }); } From 1ee8217f090a06deca7a451c8044530e7398398b Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Tue, 25 Jul 2017 10:49:30 -0700 Subject: [PATCH 20/56] errors: Avoid stripping error details when convering POSIX errors to Status This change is made out of a desire to have additional information be reported when there are filesystem errors (for e.g. see #11628) PiperOrigin-RevId: 163091773 --- tensorflow/core/platform/posix/error.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc index df5c8008792..e9baad54226 100644 --- a/tensorflow/core/platform/posix/error.cc +++ b/tensorflow/core/platform/posix/error.cc @@ -171,11 +171,7 @@ error::Code ErrnoToCode(int err_number) { Status IOError(const string& context, int err_number) { auto code = ErrnoToCode(err_number); - if (code == error::UNKNOWN) { - return Status(code, strings::StrCat(context, "; ", strerror(err_number))); - } else { - return Status(code, context); - } + return Status(code, strings::StrCat(context, "; ", strerror(err_number))); } } // namespace tensorflow From 4c9e344bf1b6582620b26c0a62a886d3c80e3c19 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Tue, 25 Jul 2017 10:49:37 -0700 Subject: [PATCH 21/56] C API: Fix a bug with TF_OperationGetAttrTensor when TF_STRING tensors are involved. 
The TensorBuffer owned by a TF_Tensor object has a different memory layout than the TensorBuffer owned by the corresponding tensorflow::Tensor object. This change consolidates conversions between the runtime's tensorflow::Tensor and the C API's TF_Tensor objects into a pair helper functions. The added test: CApiAttributesTest.StringTensor fails without corresponding changes to c_api.cc PiperOrigin-RevId: 163091789 --- tensorflow/c/c_api.cc | 134 +++++++++++++++---------------------- tensorflow/c/c_api_test.cc | 50 +++++++++++--- 2 files changed, 94 insertions(+), 90 deletions(-) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 3e69134c50a..371264ef6c2 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -56,21 +56,16 @@ limitations under the License. // The implementation below is at the top level instead of the // brain namespace because we are defining 'extern "C"' functions. -using tensorflow::error::Code; -using tensorflow::errors::InvalidArgument; -using tensorflow::gtl::ArraySlice; -using tensorflow::strings::StrCat; using tensorflow::AllocationDescription; using tensorflow::DataType; using tensorflow::Graph; using tensorflow::GraphDef; -using tensorflow::mutex_lock; using tensorflow::NameRangeMap; using tensorflow::NameRangesForNode; using tensorflow::NewSession; using tensorflow::Node; -using tensorflow::NodeDef; using tensorflow::NodeBuilder; +using tensorflow::NodeDef; using tensorflow::OpDef; using tensorflow::OpRegistry; using tensorflow::PartialTensorShape; @@ -83,6 +78,11 @@ using tensorflow::TensorBuffer; using tensorflow::TensorId; using tensorflow::TensorShape; using tensorflow::TensorShapeProto; +using tensorflow::error::Code; +using tensorflow::errors::InvalidArgument; +using tensorflow::gtl::ArraySlice; +using tensorflow::mutex_lock; +using tensorflow::strings::StrCat; extern "C" { @@ -258,24 +258,27 @@ size_t TF_StringEncode(const char* src, size_t src_len, char* dst, return sz; } -size_t TF_StringDecode(const char* src, size_t src_len, const char** dst, - size_t* dst_len, TF_Status* status) { +static Status TF_StringDecode_Impl(const char* src, size_t src_len, + const char** dst, size_t* dst_len) { tensorflow::uint64 len64 = 0; const char* p = tensorflow::core::GetVarint64Ptr(src, src + src_len, &len64); if (p == nullptr) { - status->status = - InvalidArgument("invalid string encoding or truncated src buffer"); - return 0; + return InvalidArgument("invalid string encoding or truncated src buffer"); } if (len64 > std::numeric_limits::max()) { - status->status = - InvalidArgument("encoded string is ", len64, - "-bytes, which is too large for this architecture"); - return 0; + return InvalidArgument("encoded string is ", len64, + "-bytes, which is too large for this architecture"); } *dst = p; *dst_len = static_cast(len64); - return static_cast(p - src) + *dst_len; + return Status::OK(); +} + +size_t TF_StringDecode(const char* src, size_t src_len, const char** dst, + size_t* dst_len, TF_Status* status) { + status->status = TF_StringDecode_Impl(src, src_len, dst, dst_len); + if (!status->status.ok()) return 0; + return static_cast(*dst - src) + *dst_len; } size_t TF_StringEncodedSize(size_t len) { @@ -391,16 +394,20 @@ void TF_Reset(const TF_SessionOptions* opt, const char** containers, namespace tensorflow { -// Non-static for testing. 
-bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status) { +Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst) { + if (src->dtype != TF_STRING) { + *dst = TensorCApi::MakeTensor(src->dtype, src->shape, src->buffer); + return Status::OK(); + } + // TF_STRING tensors require copying since Tensor class expects a sequence of + // string objects. const tensorflow::int64 num_elements = src->shape.num_elements(); const char* input = reinterpret_cast(TF_TensorData(src)); const size_t src_size = TF_TensorByteSize(src); if (static_cast(src_size / sizeof(tensorflow::uint64)) < num_elements) { - status->status = InvalidArgument( + return InvalidArgument( "Malformed TF_STRING tensor; too short to hold number of elements"); - return false; } const char* data_start = input + sizeof(tensorflow::uint64) * num_elements; const char* limit = input + src_size; @@ -411,24 +418,30 @@ bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status) { tensorflow::uint64 offset = reinterpret_cast(input)[i]; if (static_cast(offset) >= (limit - data_start)) { - status->status = InvalidArgument("Malformed TF_STRING tensor; element ", - i, " out of range"); - return false; + return InvalidArgument("Malformed TF_STRING tensor; element ", i, + " out of range"); } size_t len; const char* p; const char* srcp = data_start + offset; - TF_StringDecode(srcp, limit - srcp, &p, &len, status); - if (!status->status.ok()) { - return false; - } + Status status = TF_StringDecode_Impl(srcp, limit - srcp, &p, &len); + if (!status.ok()) return status; dstarray(i).assign(p, len); } - return true; + return Status::OK(); } // Non-static for testing. -TF_Tensor* TF_Tensor_EncodeStrings(const Tensor& src) { +TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src) { + if (src.dtype() != DT_STRING) { + TensorBuffer* buf = TensorCApi::Buffer(src); + buf->Ref(); + return new TF_Tensor{static_cast(src.dtype()), src.shape(), + buf}; + } + // DT_STRING tensors require a copying since TF_Tensor.buffer expects a flatly + // encoded sequence of strings. + // Compute bytes needed for encoding. size_t size = 0; const auto& srcarray = src.flat(); @@ -507,16 +520,8 @@ static bool TF_Run_Inputs( TF_Status* status) { const int ninputs = input_pairs->size(); for (int i = 0; i < ninputs; ++i) { - TF_Tensor* src = c_inputs[i]; - if (c_inputs[i]->dtype != TF_STRING) { - (*input_pairs)[i].second = tensorflow::TensorCApi::MakeTensor( - src->dtype, src->shape, src->buffer); - } else if (!tensorflow::TF_Tensor_DecodeStrings( - src, &(*input_pairs)[i].second, status)) { - // TF_STRING tensors require copying since Tensor class expects - // a sequence of string objects. - return false; - } + status->status = TF_TensorToTensor(c_inputs[i], &(*input_pairs)[i].second); + if (!status->status.ok()) return false; } return true; } @@ -574,15 +579,7 @@ static void TF_Run_Helper( static_cast(src.dtype()), src.shape()); continue; } - if (src.dtype() != tensorflow::DT_STRING) { - // Share the underlying buffer. 
- TensorBuffer* buf = tensorflow::TensorCApi::Buffer(src); - buf->Ref(); - c_outputs[i] = new TF_Tensor{static_cast(src.dtype()), - src.shape(), buf}; - } else { - c_outputs[i] = tensorflow::TF_Tensor_EncodeStrings(src); - } + c_outputs[i] = TF_TensorFromTensor(src); } } @@ -1062,20 +1059,9 @@ void TF_SetAttrTensorShapeProtoList(TF_OperationDescription* desc, void TF_SetAttrTensor(TF_OperationDescription* desc, const char* attr_name, TF_Tensor* value, TF_Status* status) { - status->status = Status::OK(); Tensor t; - bool ok = true; - - if (value->dtype != TF_STRING) { - t = tensorflow::TensorCApi::MakeTensor(value->dtype, value->shape, - value->buffer); - } else { - // TF_STRING tensors require copying since Tensor class expects - // a sequence of string objects. - ok = tensorflow::TF_Tensor_DecodeStrings(value, &t, status); - } - - if (ok) desc->node_builder.Attr(attr_name, t); + status->status = TF_TensorToTensor(value, &t); + if (status->status.ok()) desc->node_builder.Attr(attr_name, t); } void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name, @@ -1084,21 +1070,14 @@ void TF_SetAttrTensorList(TF_OperationDescription* desc, const char* attr_name, status->status = Status::OK(); std::vector t; t.reserve(num_values); - bool ok = true; - for (int i = 0; i < num_values && ok; ++i) { - if (values[i]->dtype != TF_STRING) { - t.emplace_back(tensorflow::TensorCApi::MakeTensor( - values[i]->dtype, values[i]->shape, values[i]->buffer)); - } else { - t.emplace_back(::tensorflow::DT_STRING); - // TF_STRING tensors require copying since Tensor class expects - // a sequence of string objects. - ok = tensorflow::TF_Tensor_DecodeStrings(values[i], &t.back(), status); - } + for (int i = 0; i < num_values && status->status.ok(); ++i) { + Tensor v; + status->status = TF_TensorToTensor(values[i], &v); + t.emplace_back(v); } - if (ok) desc->node_builder.Attr(attr_name, t); + if (status->status.ok()) desc->node_builder.Attr(attr_name, t); } void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name, @@ -1555,9 +1534,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name, Tensor t; status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t); if (!status->status.ok()) return; - *value = new TF_Tensor{static_cast(t.dtype()), t.shape(), - tensorflow::TensorCApi::Buffer(t)}; - (*value)->buffer->Ref(); + *value = TF_TensorFromTensor(t); } void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name, @@ -1568,10 +1545,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name, if (!status->status.ok()) return; const auto len = std::min(max_values, static_cast(ts.size())); for (int i = 0; i < len; ++i) { - const Tensor& t = ts[i]; - values[i] = new TF_Tensor{static_cast(t.dtype()), t.shape(), - tensorflow::TensorCApi::Buffer(t)}; - values[i]->buffer->Ref(); + values[i] = TF_TensorFromTensor(ts[i]); } } diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index d6debe3b994..25b6cbd8e7a 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -45,9 +45,8 @@ limitations under the License. 
#include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { - -bool TF_Tensor_DecodeStrings(TF_Tensor* src, Tensor* dst, TF_Status* status); -TF_Tensor* TF_Tensor_EncodeStrings(const Tensor& src); +TF_Tensor* TF_TensorFromTensor(const Tensor& src); +Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst); namespace { @@ -146,19 +145,16 @@ void TestEncodeDecode(int line, const std::vector& data) { for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) { src.flat()(i) = data[i]; } - TF_Tensor* dst = TF_Tensor_EncodeStrings(src); + TF_Tensor* dst = TF_TensorFromTensor(src); // Convert back to a C++ Tensor and ensure we get expected output. - TF_Status* status = TF_NewStatus(); Tensor output; - ASSERT_TRUE(TF_Tensor_DecodeStrings(dst, &output, status)) << line; - ASSERT_EQ(TF_OK, TF_GetCode(status)) << line; + ASSERT_EQ(Status::OK(), TF_TensorToTensor(dst, &output)) << line; ASSERT_EQ(src.NumElements(), output.NumElements()) << line; for (tensorflow::int64 i = 0; i < src.NumElements(); ++i) { ASSERT_EQ(data[i], output.flat()(i)) << line; } - TF_DeleteStatus(status); TF_DeleteTensor(dst); } } @@ -918,7 +914,7 @@ TEST(CAPI, SavedModel) { TF_Operation* input_op = TF_GraphOperationByName(graph, input_op_name.c_str()); ASSERT_TRUE(input_op != nullptr); - csession.SetInputs({{input_op, TF_Tensor_EncodeStrings(input)}}); + csession.SetInputs({{input_op, TF_TensorFromTensor(input)}}); const tensorflow::string output_op_name = tensorflow::ParseTensorName(output_name).first.ToString(); @@ -1636,6 +1632,39 @@ TEST_F(CApiAttributesTest, Tensor) { TF_DeleteTensor(value); } +TEST_F(CApiAttributesTest, StringTensor) { + // Create the string-Tensor "atttribute" value. + char encoded[] = { + 0, 0, 0, 0, 0, 0, 0, 0, // array[uint64] offsets + 1, // varint encoded string length + 'A', + }; + auto deallocator = [](void* data, size_t len, void* arg) {}; + unique_tensor_ptr t_in(TF_NewTensor(TF_STRING, nullptr, 0, &encoded[0], + sizeof(encoded), deallocator, nullptr), + TF_DeleteTensor); + + // Create a TF_Operation with the attribute t_in + auto desc = init("tensor"); + TF_SetAttrTensor(desc, "v", t_in.get(), s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + auto oper = TF_FinishOperation(desc, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + // Fetch the attribute back. + EXPECT_TF_META("v", -1, TF_ATTR_TENSOR, -1); + TF_Tensor* t_out = nullptr; + TF_OperationGetAttrTensor(oper, "v", &t_out, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + EXPECT_EQ(TF_STRING, TF_TensorType(t_out)); + EXPECT_EQ(0, TF_NumDims(t_out)); + ASSERT_EQ(TF_TensorByteSize(t_in.get()), TF_TensorByteSize(t_out)); + EXPECT_EQ(0, memcmp(TF_TensorData(t_in.get()), TF_TensorData(t_out), + TF_TensorByteSize(t_out))); + TF_DeleteTensor(t_out); +} + TEST_F(CApiAttributesTest, TensorList) { const char tensor1[] = {5, 7}; const int64_t dims1[] = {1, 2}; @@ -1647,7 +1676,8 @@ TEST_F(CApiAttributesTest, TensorList) { auto desc = init("list(tensor)"); TF_Tensor* tmp[] = { - Int8Tensor(dims1, ndims1, tensor1), Int8Tensor(dims2, ndims2, tensor2), + Int8Tensor(dims1, ndims1, tensor1), + Int8Tensor(dims2, ndims2, tensor2), }; TF_SetAttrTensorList(desc, "v", tmp, TF_ARRAYSIZE(tmp), s_); for (int i = 0; i < TF_ARRAYSIZE(tmp); ++i) { From 879e207e815eacd0612da068c14f9aaf359a87e3 Mon Sep 17 00:00:00 2001 From: RJ Ryan Date: Tue, 25 Jul 2017 10:53:42 -0700 Subject: [PATCH 22/56] Speed up tf.contrib.signal spectral_ops_test.py by reducing the size of the gradient test. 
PiperOrigin-RevId: 163092423 --- .../python/kernel_tests/spectral_ops_test.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py index be28184ae64..61b7107a172 100644 --- a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py +++ b/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py @@ -220,15 +220,14 @@ class SpectralOpsTest(test.TestCase): # stft_bound, inverse_stft_bound). # TODO(rjryan): Investigate why STFT gradient error is so high. test_configs = [ - (512, 64, 32, 64, 2e-3, 3e-5), - (512, 64, 64, 64, 2e-3, 3e-5), - (512, 64, 25, 64, 2e-3, 3e-5), - (512, 25, 15, 36, 2e-3, 3e-5), - (123, 23, 5, 42, 2e-3, 4e-5), + (64, 16, 8, 16), + (64, 16, 16, 16), + (64, 16, 7, 16), + (64, 7, 4, 9), + (29, 5, 1, 10), ] - for (signal_length, frame_length, frame_step, fft_length, - stft_bound, inverse_stft_bound) in test_configs: + for (signal_length, frame_length, frame_step, fft_length) in test_configs: signal_shape = [signal_length] signal = random_ops.random_uniform(signal_shape) stft_shape = [max(0, 1 + (signal_length - frame_length) // frame_step), @@ -242,8 +241,8 @@ class SpectralOpsTest(test.TestCase): stft, stft_shape) inverse_stft_error = test.compute_gradient_error( stft, stft_shape, inverse_stft, inverse_stft_shape) - self.assertLess(stft_error, stft_bound) - self.assertLess(inverse_stft_error, inverse_stft_bound) + self.assertLess(stft_error, 2e-3) + self.assertLess(inverse_stft_error, 4e-5) if __name__ == "__main__": From 3b9c6a0d9fa4437b86e996016a01011fb72e1729 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Tue, 25 Jul 2017 10:57:48 -0700 Subject: [PATCH 23/56] Add new CompareAndBitpackOp. PiperOrigin-RevId: 163093146 --- tensorflow/core/kernels/BUILD | 34 ++++ .../core/kernels/compare_and_bitpack_op.cc | 185 ++++++++++++++++++ .../core/kernels/compare_and_bitpack_op.h | 42 ++++ .../kernels/compare_and_bitpack_op_gpu.cu.cc | 141 +++++++++++++ tensorflow/core/ops/math_ops.cc | 58 ++++++ tensorflow/python/kernel_tests/BUILD | 12 ++ .../compare_and_bitpack_op_test.py | 83 ++++++++ 7 files changed, 555 insertions(+) create mode 100644 tensorflow/core/kernels/compare_and_bitpack_op.cc create mode 100644 tensorflow/core/kernels/compare_and_bitpack_op.h create mode 100644 tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc create mode 100644 tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index d35a96a24a4..a493452777f 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -701,6 +701,39 @@ tf_kernel_library( deps = ARRAY_DEPS, ) +tf_kernel_library( + name = "compare_and_bitpack_op", + srcs = ["compare_and_bitpack_op.cc"], + hdrs = ["compare_and_bitpack_op.h"], + gpu_srcs = [ + "compare_and_bitpack_op.h", + "compare_and_bitpack_op_gpu.cu.cc", + ], + deps = ARRAY_DEPS, +) + +# TODO(ebrevdo): Add benchmarks once the op is in the autogen array namespace. 
+# tf_cuda_cc_test( +# name = "compare_and_bitpack_op_test", +# srcs = ["compare_and_bitpack_op_test.cc"], +# deps = [ +# ":array", +# ":ops_testutil", +# ":ops_util", +# "//third_party/eigen3", +# "//tensorflow/cc:cc_ops", +# "//tensorflow/cc:cc_ops_internal", +# "//tensorflow/core:core_cpu", +# "//tensorflow/core:core_cpu_internal", +# "//tensorflow/core:framework", +# "//tensorflow/core:lib", +# "//tensorflow/core:protos_all_cc", +# "//tensorflow/core:test", +# "//tensorflow/core:test_main", +# "//tensorflow/core:testlib", +# ], +# ) + tf_kernel_library( name = "reshape_op", prefix = "reshape_op", @@ -2344,6 +2377,7 @@ cc_library( ":bucketize_op", ":cast_op", ":check_numerics_op", + ":compare_and_bitpack_op", ":cross_op", ":cwise_op", ":fft_ops", diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc new file mode 100644 index 00000000000..9f626a274a4 --- /dev/null +++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc @@ -0,0 +1,185 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/math_ops.cc + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/compare_and_bitpack_op.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +class CompareAndBitpackOp : public OpKernel { + public: + explicit CompareAndBitpackOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* c) override { + const Tensor& input_t = c->input(0); + const Tensor& threshold_t = c->input(1); + OP_REQUIRES( + c, TensorShapeUtils::IsScalar(threshold_t.shape()), + errors::InvalidArgument("Compare must be a scalar, but saw shape: ", + threshold_t.shape().DebugString())); + const TensorShape& input_shape = input_t.shape(); + OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_shape), + errors::InvalidArgument( + "Input should be at least a vector, but saw a scalar.")); + OP_REQUIRES(c, input_shape.dim_size(input_shape.dims() - 1) % 8 == 0, + errors::InvalidArgument( + "Inner dimension of input should be " + "divisible by ", + 8, ", but saw shape: ", input_shape.DebugString())); + + TensorShape output_shape = input_shape; + int rank = input_shape.dims(); + output_shape.set_dim(rank - 1, input_shape.dim_size(rank - 1) / 8); + + Tensor* output_t; + OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output_t)); + + auto input = input_t.flat_inner_dims(); + auto threshold = threshold_t.scalar(); + auto 
output = output_t->flat_inner_dims(); + + functor::CompareAndBitpack func; + func(c, input, threshold, output); + } +}; + +#define REGISTER_COMPARE_AND_BITPACK(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("CompareAndBitpack").Device(DEVICE_CPU).TypeConstraint("T"), \ + CompareAndBitpackOp); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_COMPARE_AND_BITPACK); +TF_CALL_bool(REGISTER_COMPARE_AND_BITPACK); + +#undef REGISTER_COMPARE_AND_BITPACK + +namespace functor { + +template +struct ComputeShard { + static EIGEN_STRONG_INLINE void Compute(typename TTypes::ConstMatrix input, + typename TTypes::Matrix output, + const T& thresh, int64 start, + int64 limit) { + for (int64 i = start; i < limit; ++i) { + uint8* out = output.data() + i; + const T* block = input.data() + 8 * i; + *out = ((((block[0] > thresh) << 7)) | (((block[1] > thresh) << 6)) | + (((block[2] > thresh) << 5)) | (((block[3] > thresh) << 4)) | + (((block[4] > thresh) << 3)) | (((block[5] > thresh) << 2)) | + (((block[6] > thresh) << 1)) | (((block[7] > thresh)))); + } + } +}; + +// Specialization for bool on systems where sizeof(bool) == 1. +template +struct ComputeShard::value>::type, + typename std::enable_if::type> { + static EIGEN_STRONG_INLINE void Compute( + typename TTypes::ConstMatrix input, + typename TTypes::Matrix output, bool /*thresh*/, int64 start, + int64 limit) { + // NOTE(ebrevdo): This assumes memory is little-endian. + for (int64 i = start; i < limit; ++i) { + uint8* out = output.data() + i; + const int64 block = *reinterpret_cast(input.data() + 8 * i); + *out = + ((((block & (1LL << (7 * 8))) >> (7 * 8 - 0))) | + (((block & (1LL << (6 * 8))) >> (6 * 8 - 1))) | + (((block & (1LL << (5 * 8))) >> (5 * 8 - 2))) | + (((block & (1LL << (4 * 8))) >> (4 * 8 - 3))) | + (((block & (1LL << (3 * 8))) >> (3 * 8 - 4))) | + (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) | + (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7))); + } + } +}; + +template +struct CompareAndBitpack { + void operator()(OpKernelContext* c, typename TTypes::ConstMatrix input, + typename TTypes::ConstScalar threshold, + TTypes::Matrix output) { + const T thresh = threshold(); + auto shard = [&, thresh](int64 start, int64 limit) { + ComputeShard::Compute(input, output, thresh, start, limit); + }; + int64 total_shards = output.size(); // Approximate cmp as an add and + // bitwise-or + shift as an add. + const double total_cost = 8 * (Eigen::TensorOpCost::AddCost() + + Eigen::TensorOpCost::AddCost()); + const int64 shard_cost = (total_cost >= static_cast(kint64max)) + ? 
kint64max + : static_cast(total_cost); + + auto worker_threads = *(c->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, total_shards, + shard_cost, shard); + } +}; + +} // namespace functor + +#if GOOGLE_CUDA + +#define REGISTER_COMPARE_AND_BITPACK(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("CompareAndBitpack").Device(DEVICE_GPU).TypeConstraint("T"), \ + CompareAndBitpackOp); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_COMPARE_AND_BITPACK); +TF_CALL_bool(REGISTER_COMPARE_AND_BITPACK); + +#undef REGISTER_COMPARE_AND_BITPACK + +namespace functor { + +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void CompareAndBitpack::operator()( \ + OpKernelContext* c, typename TTypes::ConstMatrix input, \ + typename TTypes::ConstScalar threshold, \ + TTypes::Matrix output); \ + extern template struct CompareAndBitpack; + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC) +TF_CALL_bool(DECLARE_GPU_SPEC) + +#undef DECLARE_GPU_SPEC + +} // namespace functor + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.h b/tensorflow/core/kernels/compare_and_bitpack_op.h new file mode 100644 index 00000000000..8e020249c10 --- /dev/null +++ b/tensorflow/core/kernels/compare_and_bitpack_op.h @@ -0,0 +1,42 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +namespace functor { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +struct CompareAndBitpack { + void operator()(OpKernelContext* c, typename TTypes::ConstMatrix input, + typename TTypes::ConstScalar threshold, + TTypes::Matrix output); +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_ diff --git a/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc new file mode 100644 index 00000000000..345405e3fe6 --- /dev/null +++ b/tensorflow/core/kernels/compare_and_bitpack_op_gpu.cu.cc @@ -0,0 +1,141 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/compare_and_bitpack_op.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template +__global__ void CompareAndBitpackKernel(const int size, const T* threshold, + const T* input, uint8* output) { + // TODO(ebrevdo): Erich said: to get a better memory access pattern + // you could have 8 threads load this data and do a comparison, then + // use the ballot instruction to combine the values from each thread + // in the warp in one instruction (so each thread will have the + // result for 4 blocks) followed by an appropriate shift and mask to + // get the 8-bits of interest. + const T thresh = ldg(threshold); + CUDA_1D_KERNEL_LOOP(i, size) { + const T* block = input + 8 * i; + output[i] = + ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) | + (((ldg(block + 2) > thresh) << 5)) | + (((ldg(block + 3) > thresh) << 4)) | + (((ldg(block + 4) > thresh) << 3)) | + (((ldg(block + 5) > thresh) << 2)) | + (((ldg(block + 6) > thresh) << 1)) | (((ldg(block + 7) > thresh)))); + } +} + +template <> +__global__ void CompareAndBitpackKernel(const int size, + const bool* threshold, + const bool* input, + uint8* output) { + // TODO(ebrevdo): Erich said: I think you could again have multiple + // threads work on one block and use the ballot instruction to the + // bit packing in one instruction. + CUDA_1D_KERNEL_LOOP(i, size) { + const int64 block = ldg(reinterpret_cast(input + 8 * i)); + // NOTE(ebrevdo): This assumes memory is little-endian. 
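+    // For example, eight input bools {1, 0, 1, 1, 0, 0, 1, 0} (element 0 at
+    // the lowest address) pack to 0b10110010: element 0 lands in bit 7 and
+    // element 7 in bit 0, matching the CPU kernel and np.packbits.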
+ output[i] = + ((((block & (1LL << (7 * 8))) >> (7 * 8 - 0))) | + (((block & (1LL << (6 * 8))) >> (6 * 8 - 1))) | + (((block & (1LL << (5 * 8))) >> (5 * 8 - 2))) | + (((block & (1LL << (4 * 8))) >> (4 * 8 - 3))) | + (((block & (1LL << (3 * 8))) >> (3 * 8 - 4))) | + (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) | + (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7))); + } +} + +template <> +__global__ void CompareAndBitpackKernel(const int size, + const float* threshold, + const float* input, + uint8* output) { + const float thresh = ldg(threshold); + CUDA_1D_KERNEL_LOOP(i, size) { + const float4 block0 = ldg(reinterpret_cast(input + 8 * i)); + const float4 block1 = + ldg(reinterpret_cast(input + 8 * i + 4)); + output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) | + (((block0.z > thresh) << 5)) | (((block0.w > thresh) << 4)) | + (((block1.x > thresh) << 3)) | (((block1.y > thresh) << 2)) | + (((block1.z > thresh) << 1)) | (((block1.w > thresh)))); + } +} + +template <> +__global__ void CompareAndBitpackKernel(const int size, + const double* threshold, + const double* input, + uint8* output) { + const double thresh = ldg(threshold); + CUDA_1D_KERNEL_LOOP(i, size) { + const double2 block0 = ldg(reinterpret_cast(input + 8 * i)); + const double2 block1 = + ldg(reinterpret_cast(input + 8 * i + 2)); + const double2 block2 = + ldg(reinterpret_cast(input + 8 * i + 4)); + const double2 block3 = + ldg(reinterpret_cast(input + 8 * i + 6)); + output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) | + (((block1.x > thresh) << 5)) | (((block1.y > thresh) << 4)) | + (((block2.x > thresh) << 3)) | (((block2.y > thresh) << 2)) | + (((block3.x > thresh) << 1)) | (((block3.y > thresh)))); + } +} + +#define DEFINE_GPU_SPECS(T) \ + template <> \ + void CompareAndBitpack::operator()( \ + OpKernelContext* c, typename TTypes::ConstMatrix input, \ + typename TTypes::ConstScalar threshold, \ + TTypes::Matrix output) { \ + const GPUDevice& d = c->eigen_device(); \ + int64 total_count = output.size(); \ + CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d); \ + \ + CompareAndBitpackKernel \ + <<>>( \ + total_count, threshold.data(), input.data(), output.data()); \ + } + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS) +TF_CALL_bool(DEFINE_GPU_SPECS) + +#undef DECLARE_GPU_SPECS + +} // namespace functor + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index f4e0625c66d..36f999ff607 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -2458,6 +2458,64 @@ out_type: The type of the output. Should be a lower bit depth than Tinput. 
)doc"); +REGISTER_OP("CompareAndBitpack") + .Input("input: T") + .Input("threshold: T") + .Output("output: uint8") + .Attr("T: {bool, float16, float32, float64, int8, int16, int32, int64}") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &input)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + ShapeHandle output = input; + if (c->RankKnown(input)) { + int rank = c->Rank(input); + auto inner_dim = c->Dim(input, rank - 1); + DimensionHandle inferred_dim; + TF_RETURN_IF_ERROR(c->Divide(inner_dim, 8, + /* evenly_divisible */ true, + &inferred_dim)); + TF_RETURN_IF_ERROR( + c->ReplaceDim(output, rank - 1, inferred_dim, &output)); + } + c->set_output(0, output); + + return Status::OK(); + }) + .Doc(R"doc( +Compare values of `input` to `threshold` and pack resulting bits into a `uint8`. + +Each comparison returns a boolean `true` (if `input_value > threshold`) +or and `false` otherwise. + +This operation is useful for Locality-Sensitive-Hashing (LSH) and other +algorithms that use hashing approximations of cosine and `L2` distances; +codes can be generated from an input via: + +```python +codebook_size = 50 +codebook_bits = codebook_size * 32 +codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits], + dtype=x.dtype, + initializer=tf.orthogonal_initializer()) +codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.) +codes = tf.bitcast(codes, tf.int32) # go from uint8 to int32 +# now codes has shape x.shape[:-1] + [codebook_size] +``` + +**NOTE**: Currently, the innermost dimension of the tensor must be divisible +by 8. + +Given an `input` shaped `[s0, s1, ..., s_n]`, the output is +a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`. + +input: Values to compare against `threshold` and bitpack. +threshold: Threshold to compare against. +T: The type of the input and threshold. +output: The bitpacked comparisons. +)doc"); + REGISTER_OP("RequantizationRange") .Input("input: Tinput") .Input("input_min: float") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index cac05c372a3..896d466c259 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1671,6 +1671,18 @@ cuda_py_test( ], ) +cuda_py_test( + name = "compare_and_bitpack_op_test", + size = "small", + srcs = ["compare_and_bitpack_op_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:math_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + ], +) + cuda_py_test( name = "scalar_test", size = "small", diff --git a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py new file mode 100644 index 00000000000..56ddd6e4282 --- /dev/null +++ b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py @@ -0,0 +1,83 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.compare_and_bitpack_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class CompareAndBitpackTest(test.TestCase): + + def _testCompareAndBitpack(self, + x, threshold, + truth, + expected_err_re=None): + with self.test_session(use_gpu=True): + ans = math_ops.compare_and_bitpack(x, threshold) + if expected_err_re is None: + tf_ans = ans.eval() + self.assertShapeEqual(truth, ans) + self.assertAllEqual(tf_ans, truth) + else: + with self.assertRaisesOpError(expected_err_re): + ans.eval() + + def _testBasic(self, dtype): + rows = 371 + cols = 294 + x = np.random.randn(rows, cols * 8) + if dtype == np.bool: + x = x > 0 + else: + x = x.astype(dtype) + threshold = dtype(0) + # np.packbits flattens the tensor, so we reshape it back to the + # expected dimensions. + truth = np.packbits(x > threshold).reshape(rows, cols) + self._testCompareAndBitpack(x, threshold, truth) + + def testBasicFloat32(self): + self._testBasic(np.float32) + + def testBasicFloat64(self): + self._testBasic(np.float64) + + def testBasicFloat16(self): + self._testBasic(np.float16) + + def testBasicBool(self): + self._testBasic(np.bool) + + def testBasicInt8(self): + self._testBasic(np.int8) + + def testBasicInt16(self): + self._testBasic(np.int16) + + def testBasicInt32(self): + self._testBasic(np.int32) + + def testBasicInt64(self): + self._testBasic(np.int64) + + +if __name__ == "__main__": + test.main() From f8ecc882508fc4807d106cd247e558a2d33a128a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 11:05:24 -0700 Subject: [PATCH 24/56] Update ops-related pbtxt files. 
PiperOrigin-RevId: 163094455 --- .../core/ops/compat/ops_history.v1.pbtxt | 56 ++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 64 +++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 94224f22b9c..b82035bfc32 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -4597,6 +4597,37 @@ op { } } } +op { + name: "CompareAndBitpack" + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "threshold" + type_attr: "T" + } + output_arg { + name: "output" + type: DT_UINT8 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BOOL + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + } + } + } +} op { name: "Complex" input_arg { @@ -16267,6 +16298,31 @@ op { } } } +op { + name: "PopulationCount" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type: DT_UINT8 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + } + } + } +} op { name: "Pow" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index ab984d2547f..468434bd283 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -4406,6 +4406,43 @@ op { summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm." description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527." } +op { + name: "CompareAndBitpack" + input_arg { + name: "input" + description: "Values to compare against `threshold` and bitpack." + type_attr: "T" + } + input_arg { + name: "threshold" + description: "Threshold to compare against." + type_attr: "T" + } + output_arg { + name: "output" + description: "The bitpacked comparisons." + type: DT_UINT8 + } + attr { + name: "T" + type: "type" + description: "The type of the input and threshold." + allowed_values { + list { + type: DT_BOOL + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + } + } + } + summary: "Compare values of `input` to `threshold` and pack resulting bits into a `uint8`." + description: "Each comparison returns a boolean `true` (if `input_value > threshold`)\nor and `false` otherwise.\n\nThis operation is useful for Locality-Sensitive-Hashing (LSH) and other\nalgorithms that use hashing approximations of cosine and `L2` distances;\ncodes can be generated from an input via:\n\n```python\ncodebook_size = 50\ncodebook_bits = codebook_size * 32\ncodebook = tf.get_variable(\'codebook\', [x.shape[-1].value, codebook_bits],\n dtype=x.dtype,\n initializer=tf.orthogonal_initializer())\ncodes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)\ncodes = tf.bitcast(codes, tf.int32) # go from uint8 to int32\n# now codes has shape x.shape[:-1] + [codebook_size]\n```\n\n**NOTE**: Currently, the innermost dimension of the tensor must be divisible\nby 8.\n\nGiven an `input` shaped `[s0, s1, ..., s_n]`, the output is\na `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`." +} op { name: "Complex" input_arg { @@ -15444,6 +15481,33 @@ op { summary: "Compute the polygamma function \\\\(\\psi^{(n)}(x)\\\\)." 
description: "The polygamma function is defined as:\n\n\n\\\\(\\psi^{(n)}(x) = \\frac{d^n}{dx^n} \\psi(x)\\\\)\n\nwhere \\\\(\\psi(x)\\\\) is the digamma function." } +op { + name: "PopulationCount" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type: DT_UINT8 + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_UINT8 + type: DT_UINT16 + } + } + } + summary: "Computes element-wise population count (a.k.a. popcount, bitsum, bitcount)." + description: "For each entry in `x`, calculates the number of `1` (on) bits in the binary\nrepresentation of that entry.\n\n**NOTE**: It is more efficient to first `tf.bitcast` your tensors into\n`int32` or `int64` and perform the bitcount on the result, than to feed in\n8- or 16-bit inputs and then aggregate the resulting counts." +} op { name: "Pow" input_arg { From e8e406f64674a52619767e9715d218d4c00ff7ea Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 11:46:49 -0700 Subject: [PATCH 25/56] Minor tweaks to avoid unnecessary copies PiperOrigin-RevId: 163101160 --- tensorflow/cc/framework/gradients.cc | 4 ++-- tensorflow/core/framework/attr_value_util.cc | 6 ++++-- tensorflow/core/framework/attr_value_util.h | 6 ++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc index cec3ebc0ad6..66a943410e2 100644 --- a/tensorflow/cc/framework/gradients.cc +++ b/tensorflow/cc/framework/gradients.cc @@ -356,7 +356,7 @@ Status SymbolicGradientBuilder::AddGradients() { // Check if any input nodes still have pending gradients and have not been // processed yet. This happens if not all outputs of a node are in 'inputs_'. std::unordered_map requested_grads; - for (Output nout : inputs_) { + for (const Output& nout : inputs_) { if (pending_[nout.node()->id()] > 0) { DCHECK_GT(nout.node()->num_outputs(), 1); int idx = input_nodes_[nout]; @@ -365,7 +365,7 @@ Status SymbolicGradientBuilder::AddGradients() { ++requested_grads[nout.node()]; } } - for (auto& p : requested_grads) { + for (const auto& p : requested_grads) { int num_requested_inputs = p.first->num_outputs() - pending_[p.first->id()]; CHECK_EQ(num_requested_inputs, p.second); } diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc index 9fdb3da6a0d..95cafa24b19 100644 --- a/tensorflow/core/framework/attr_value_util.cc +++ b/tensorflow/core/framework/attr_value_util.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/core/framework/attr_value_util.h" +#include #include + #include "tensorflow/core/framework/attr_value.pb_text.h" #include "tensorflow/core/framework/tensor.pb_text.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -27,7 +29,6 @@ limitations under the License. 
#include "tensorflow/core/platform/protobuf.h" namespace tensorflow { - namespace { string SummarizeString(const string& str) { @@ -460,7 +461,8 @@ bool HasPlaceHolder(const AttrValue& val) { return false; } -bool SubstitutePlaceholders(SubstituteFunc substitute, AttrValue* value) { +bool SubstitutePlaceholders(const SubstituteFunc& substitute, + AttrValue* value) { switch (value->value_case()) { case AttrValue::kList: { for (NameAttrList& func : *value->mutable_list()->mutable_func()) { diff --git a/tensorflow/core/framework/attr_value_util.h b/tensorflow/core/framework/attr_value_util.h index 08cc3b7158e..08d813bb6f9 100644 --- a/tensorflow/core/framework/attr_value_util.h +++ b/tensorflow/core/framework/attr_value_util.h @@ -16,8 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_ #define TENSORFLOW_FRAMEWORK_ATTR_VALUE_UTIL_H_ +#include #include #include + #include "tensorflow/core/framework/attr_value.pb.h" // TODO(62899350): Remove #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" @@ -100,8 +102,8 @@ bool HasPlaceHolder(const AttrValue& val); // SubstituteFunc is given a placeholder string. If the placeholder is // unknown, SubstituteFunc returns false. Otherwise, overwrites the // attr value and returns true. -typedef std::function SubstituteFunc; -bool SubstitutePlaceholders(SubstituteFunc substitute, AttrValue* value); +using SubstituteFunc = std::function; +bool SubstitutePlaceholders(const SubstituteFunc& substitute, AttrValue* value); } // namespace tensorflow From ac209ebc8fc8780bb3121a33740e10a34352996f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 11:49:58 -0700 Subject: [PATCH 26/56] [BatchNormGrad] Add end-to-end test for BatchNormGrad RELNOTES: n/a PiperOrigin-RevId: 163101568 --- .../xla/tests/batch_normalization_test.cc | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc index 074e28cec77..d692a810325 100644 --- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc +++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc @@ -308,6 +308,137 @@ XLA_TEST_P(BatchNormTest, DISABLED_ON_GPU(RandomizedTests)) { ErrorSpec(0.01, 1)); } +// TODO(b/62764704): Implement on GPU. Disabled on 2017-06-20. 
+XLA_TEST_P(BatchNormTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU( + DISABLED_ON_GPU(RandomizedGradTests)))) { + float epsilon = 0.001; + ComputationBuilder builder(client_, TestName()); + const std::vector& bounds = GetParam().bounds; + Array4D input_array(bounds[0], bounds[1], bounds[2], bounds[3]); + input_array.FillRandom(GetParam().random_value_var, + GetParam().random_value_mean); + + Array4D grad_output_array(bounds[0], bounds[1], bounds[2], bounds[3]); + grad_output_array.FillRandom(GetParam().random_value_var, + GetParam().random_value_mean); + + const int64 feature_index = GetParam().feature_index; + const int64 num_elements_per_feature = + Product(bounds) / bounds[feature_index]; + const int64 feature_bound = bounds[feature_index]; + std::vector scale(feature_bound, 2); + + auto input_squared = + ReferenceUtil::MapArray4D(input_array, [](float a) { return a * a; }); + std::vector reduce_dims; + for (int64 i = 0; i < bounds.size(); ++i) { + if (i != feature_index) { + reduce_dims.push_back(i); + } + } + + auto sum = + ReferenceUtil::Reduce4DTo1D(input_array, /*init=*/0.0f, reduce_dims, + [](float a, float b) { return a + b; }); + + auto sum_squared = + ReferenceUtil::Reduce4DTo1D(*input_squared, /*init=*/0.0f, reduce_dims, + [](float a, float b) { return a + b; }); + + std::vector mean(feature_bound); + + for (int64 i = 0; i < feature_bound; ++i) { + mean[i] = sum[i] / num_elements_per_feature; + } + + std::vector mean_square(feature_bound); + for (int64 i = 0; i < feature_bound; ++i) { + mean_square[i] = mean[i] * mean[i]; + } + + std::vector square_mean(feature_bound); + for (int64 i = 0; i < feature_bound; ++i) { + square_mean[i] = sum_squared[i] / num_elements_per_feature; + } + + std::vector var(feature_bound); + for (int64 i = 0; i < feature_bound; ++i) { + var[i] = square_mean[i] - mean_square[i]; + } + + Array4D mean_4D = + *ReferenceUtil::Broadcast1DTo4D(mean, bounds, feature_index); + auto var_4D = *ReferenceUtil::Broadcast1DTo4D(var, bounds, feature_index); + auto scale_4D = *ReferenceUtil::Broadcast1DTo4D(scale, bounds, feature_index); + + auto var_add_epsilon = *ReferenceUtil::MapArray4D( + var_4D, [epsilon](float a) { return std::sqrt(a + epsilon); }); + + auto grad_output_times_var = + *ReferenceUtil::MapArray4D(grad_output_array, var_add_epsilon, + [](float a, float b) { return a * b; }); + + auto grad_activation = *ReferenceUtil::MapArray4D( + grad_output_times_var, scale_4D, [](float a, float b) { return a * b; }); + + auto activation_shifted = *ReferenceUtil::MapArray4D( + input_array, mean_4D, [](float a, float b) { return a - b; }); + + auto grad_scale_before_reduction = + *ReferenceUtil::MapArray4D(grad_output_times_var, activation_shifted, + [](float a, float b) { return a * b; }); + + auto grad_scale = ReferenceUtil::Reduce4DTo1D( + grad_scale_before_reduction, /*init=*/0.0f, reduce_dims, + [](float a, float b) { return a + b; }); + + auto grad_offset = + ReferenceUtil::Reduce4DTo1D(grad_output_array, /*init=*/0.0f, reduce_dims, + [](float a, float b) { return a + b; }); + + auto expected_grad_activation = + Literal::CreateR4FromArray4D(grad_activation); + + auto input_literal = Literal::CreateR4FromArray4D(input_array); + auto scale_literal = Literal::CreateR1(scale); + auto mean_literal = Literal::CreateR1(mean); + auto var_literal = Literal::CreateR1(var); + auto grad_output_literal = + Literal::CreateR4FromArray4D(grad_output_array); + + auto input_parameter = builder.Parameter(0, input_literal->shape(), "input"); + auto scale_parameter = 
builder.Parameter(1, scale_literal->shape(), "scale"); + auto mean_parameter = builder.Parameter(2, mean_literal->shape(), "mean"); + auto var_parameter = builder.Parameter(3, var_literal->shape(), "variance"); + auto grad_output_parameter = + builder.Parameter(4, grad_output_literal->shape(), "grad_output"); + + std::unique_ptr input_data = + client_->TransferToServer(*input_literal).ConsumeValueOrDie(); + std::unique_ptr scale_data = + client_->TransferToServer(*scale_literal).ConsumeValueOrDie(); + std::unique_ptr mean_data = + client_->TransferToServer(*mean_literal).ConsumeValueOrDie(); + std::unique_ptr var_data = + client_->TransferToServer(*var_literal).ConsumeValueOrDie(); + std::unique_ptr grad_output_data = + client_->TransferToServer(*grad_output_literal).ConsumeValueOrDie(); + + auto t = builder.BatchNormGrad(input_parameter, scale_parameter, + mean_parameter, var_parameter, + grad_output_parameter, epsilon, feature_index); + + auto expected = + *Literal::MakeTuple({expected_grad_activation.get(), + Literal::CreateR1(grad_scale).get(), + Literal::CreateR1(grad_offset).get()}); + + ComputeAndCompareTuple(&builder, expected, + {input_data.get(), scale_data.get(), mean_data.get(), + var_data.get(), grad_output_data.get()}, + ErrorSpec(0.01, 1)); +} + INSTANTIATE_TEST_CASE_P( BatchNormTest_Instantiation, BatchNormTest, ::testing::Values(BatchNormTestParam{{2, 2, 2, 2}, 0, 100.2f, 200.0f}, @@ -319,6 +450,7 @@ INSTANTIATE_TEST_CASE_P( BatchNormTestParam{{10, 10, 10, 10}, 1, -666.6f, 777.7f}, BatchNormTestParam{{10, 10, 10, 10}, 2, 0.f, 777.7f}, BatchNormTestParam{{1, 1, 10, 130}, 2, 0.f, 777.7f}, + BatchNormTestParam{{1, 1, 130, 11}, 2, 0.f, 777.7f}, BatchNormTestParam{{1, 1, 10, 1}, 3, 888.8f, 9.9f}, BatchNormTestParam{{24, 129, 1, 2}, 2, 10000, 10000}, @@ -446,6 +578,37 @@ XLA_TEST_F(BatchNormTest, DISABLED_ON_GPU(LargeEpsilonTest)) { ErrorSpec(0.1)); } +// TODO(b/62764704): Implement on CPU and GPU. Disabled on 2017-07-11. +XLA_TEST_F(BatchNormTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU( + DISABLED_ON_GPU(BatchNormGradBasic)))) { + const int kFeatureIndex = 2; + ComputationBuilder builder(client_, TestName()); + + auto operand = + builder.ConstantR4FromArray4D(Array4D(2, 2, 2, 1, 0.0f)); + + auto scale = builder.ConstantR1({1.0f, 1.0f}); + + auto mean = builder.ConstantR1({0.0f, 0.0f}); + + auto var = builder.ConstantR1({1.0f, 1.0f}); + + auto grad_output = builder.ConstantR4FromArray4D( + {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}}); + + builder.BatchNormGrad(operand, scale, mean, var, grad_output, + /*epsilon=*/0.0, kFeatureIndex); + + auto expected = *Literal::MakeTuple( + {Literal::CreateR4( + {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}}) + .get(), + Literal::CreateR1({0, 0}).get(), + Literal::CreateR1({16, 20}).get()}); + + ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.1)); +} + } // namespace } // namespace xla From 53019b0050d441ef053c080e6bcd8f210d8250ab Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 11:53:45 -0700 Subject: [PATCH 27/56] Go: Update generated wrapper functions for TensorFlow ops. 
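A minimal sketch of calling one of the newly generated wrappers from Go; the
graph/session boilerplate is illustrative and not part of this change:

package main

import (
	tf "github.com/tensorflow/tensorflow/tensorflow/go"
	"github.com/tensorflow/tensorflow/tensorflow/go/op"
)

func main() {
	s := op.NewScope()
	x := op.Const(s, []int32{1, 2, 3, 255})
	// PopulationCount returns the per-element count of set bits as uint8.
	y := op.PopulationCount(s, x)
	graph, err := s.Finalize()
	if err != nil {
		panic(err)
	}
	sess, err := tf.NewSession(graph, nil)
	if err != nil {
		panic(err)
	}
	out, err := sess.Run(nil, []tf.Output{y}, nil)
	if err != nil {
		panic(err)
	}
	_ = out // out[0] holds a uint8 tensor with values [1 1 2 8]
}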
PiperOrigin-RevId: 163102070 --- tensorflow/go/op/wrappers.go | 177 ++++++++++++++++++++++++----------- 1 file changed, 122 insertions(+), 55 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 61c924ac22b..095cbbe637b 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -7070,6 +7070,61 @@ func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_ha return op.Output(0) } +// TextLineReaderV2Attr is an optional argument to TextLineReaderV2. +type TextLineReaderV2Attr func(optionalAttr) + +// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value. +// +// value: Number of lines to skip from the beginning of every file. +// If not specified, defaults to 0 +func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["skip_header_lines"] = value + } +} + +// TextLineReaderV2Container sets the optional container attribute to value. +// +// value: If non-empty, this reader is placed in the given container. +// Otherwise, a default container is used. +// If not specified, defaults to "" +func TextLineReaderV2Container(value string) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["container"] = value + } +} + +// TextLineReaderV2SharedName sets the optional shared_name attribute to value. +// +// value: If non-empty, this reader is named in the given bucket +// with this shared_name. Otherwise, the node name is used instead. +// If not specified, defaults to "" +func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr { + return func(m optionalAttr) { + m["shared_name"] = value + } +} + +// A Reader that outputs the lines of a file delimited by '\n'. +// +// Returns The handle to reference the Reader. +func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "TextLineReaderV2", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Computes rectified linear 6: `min(max(features, 0), 6)`. func Relu6(scope *Scope, features tf.Output) (activations tf.Output) { if scope.Err() != nil { @@ -14495,61 +14550,6 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) { return op.Output(0) } -// TextLineReaderV2Attr is an optional argument to TextLineReaderV2. -type TextLineReaderV2Attr func(optionalAttr) - -// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value. -// -// value: Number of lines to skip from the beginning of every file. -// If not specified, defaults to 0 -func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["skip_header_lines"] = value - } -} - -// TextLineReaderV2Container sets the optional container attribute to value. -// -// value: If non-empty, this reader is placed in the given container. -// Otherwise, a default container is used. -// If not specified, defaults to "" -func TextLineReaderV2Container(value string) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["container"] = value - } -} - -// TextLineReaderV2SharedName sets the optional shared_name attribute to value. -// -// value: If non-empty, this reader is named in the given bucket -// with this shared_name. Otherwise, the node name is used instead. 
-// If not specified, defaults to "" -func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr { - return func(m optionalAttr) { - m["shared_name"] = value - } -} - -// A Reader that outputs the lines of a file delimited by '\n'. -// -// Returns The handle to reference the Reader. -func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{} - for _, a := range optional { - a(attrs) - } - opspec := tf.OpSpec{ - Type: "TextLineReaderV2", - - Attrs: attrs, - } - op := scope.AddOperation(opspec) - return op.Output(0) -} - // Component-wise multiplies a SparseTensor by a dense Tensor. // // The output locations corresponding to the implicitly zero elements in the sparse @@ -17918,6 +17918,28 @@ func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf. return op.Output(0), op.Output(1), op.Output(2) } +// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount). +// +// For each entry in `x`, calculates the number of `1` (on) bits in the binary +// representation of that entry. +// +// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into +// `int32` or `int64` and perform the bitcount on the result, than to feed in +// 8- or 16-bit inputs and then aggregate the resulting counts. +func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "PopulationCount", + Input: []tf.Input{ + x, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // AssertAttr is an optional argument to Assert. type AssertAttr func(optionalAttr) @@ -23919,6 +23941,51 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp return op.Output(0), op.Output(1), op.Output(2) } +// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`. +// +// Each comparison returns a boolean `true` (if `input_value > threshold`) +// or and `false` otherwise. +// +// This operation is useful for Locality-Sensitive-Hashing (LSH) and other +// algorithms that use hashing approximations of cosine and `L2` distances; +// codes can be generated from an input via: +// +// ```python +// codebook_size = 50 +// codebook_bits = codebook_size * 32 +// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits], +// dtype=x.dtype, +// initializer=tf.orthogonal_initializer()) +// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.) +// codes = tf.bitcast(codes, tf.int32) # go from uint8 to int32 +// # now codes has shape x.shape[:-1] + [codebook_size] +// ``` +// +// **NOTE**: Currently, the innermost dimension of the tensor must be divisible +// by 8. +// +// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is +// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`. +// +// Arguments: +// input: Values to compare against `threshold` and bitpack. +// threshold: Threshold to compare against. +// +// Returns The bitpacked comparisons. +func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CompareAndBitpack", + Input: []tf.Input{ + input, threshold, + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // Outputs a `Summary` protocol buffer with a tensor and per-plugin data. 
// // Arguments: From 2628455c6a8fb29839048ece8a211055b73b90e1 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 25 Jul 2017 11:56:42 -0700 Subject: [PATCH 28/56] [XLA] Add more unit tests for DynamicSlice and DynamicUpdateSlice. PiperOrigin-RevId: 163102445 --- .../compiler/xla/tests/dynamic_ops_test.cc | 405 +++++++++++------- 1 file changed, 239 insertions(+), 166 deletions(-) diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 576c1c703df..9e85e357070 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -44,295 +44,310 @@ namespace { class DynamicSliceTest : public ClientLibraryTestBase { protected: - template + template void TestR1() { // Slice at dimension start. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {0}, {5}, - {0.0, 1.0, 2.0, 3.0, 4.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {0}, {5}, {0, 1, 2, 3, 4}); // Slice in the middle. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {3}, - {2.0, 3.0, 4.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {2}, {3}, {2, 3, 4}); // Slice at dimension boundaries. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {5}, {3}, - {5.0, 6.0, 7.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {5}, {3}, {5, 6, 7}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {6}, {4}, - {6.0, 7.0, 0.0, 1.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {6, 7, 0, 1}); // Zero element slice. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {0}, {}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {2}, {0}, {}); } - template + template void TestR2() { // Slice at dimension start. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {0, 0}, {2, 2}, {{1.0f, 2.0f}, {4.0f, 5.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {2, 2}, + {{1, 2}, {4, 5}}); // Slice in the middle. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {1, 1}, {2, 1}, {{5.0f}, {8.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {2, 1}, + {{5}, {8}}); // Slice at dimension boundaries. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {1, 1}, {2, 1}, {{5.0f}, {8.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {2, 1}, + {{5}, {8}}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {1, 1}, {3, 3}, - {{5.0f, 6.0f, 4.0f}, {8.0f, 9.0f, 7.0f}, {2.0f, 3.0f, 1.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {3, 3}, + {{5, 6, 4}, {8, 9, 7}, {2, 3, 1}}); // Zero element slice: 2x0. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {0, 0}, {2, 0}, {{}, {}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {2, 0}, + {{}, {}}); // Zero element slice: 0x2. - RunR2({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {0, 0}, {0, 2}, Array2D(0, 2)); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {0, 2}, + Array2D(0, 2)); } - template + template void TestR3() { // R3 Shape: [2, 3, 2] // clang-format off // Slice at dimension start. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {0, 0, 0}, {2, 1, 2}, - {{{1.0f, 2.0f}}, {{7.0f, 8.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, + {{7, 8}, {9, 10}, {11, 12}}}, + {0, 0, 0}, {2, 1, 2}, + {{{1, 2}}, {{7, 8}}}); // Slice in the middle. 
- RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {0, 1, 1}, {2, 2, 1}, - {{{4.0f}, {6.0f}}, {{10.0f}, {12.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, + {{7, 8}, {9, 10}, {11, 12}}}, + {0, 1, 1}, {2, 2, 1}, + {{{4}, {6}}, {{10}, {12}}}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {0, 2, 1}, {2, 1, 2}, - {{{6.0f, 5.0f}}, {{12.0f, 11.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, + {{7, 8}, {9, 10}, {11, 12}}}, + {0, 2, 1}, {2, 1, 2}, + {{{6, 5}}, {{12, 11}}}); // clang-format on } - template - void RunR1(const std::vector& input_values, + template + void RunR1(tensorflow::gtl::ArraySlice input_values, const std::vector slice_starts, const std::vector& slice_sizes, - const std::vector& expected_values) { + tensorflow::gtl::ArraySlice expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR1(input_values); + auto input = builder.ConstantR1(input_values); builder.DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. - ComputeAndCompareR1(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR1(&builder, expected_values, {start_data.get()}); } - template - void RunR2(const Array2D& input_values, + template + void RunR2(const Array2D& input_values, const std::vector slice_starts, const std::vector& slice_sizes, - const Array2D& expected_values) { + const Array2D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR2FromArray2D(input_values); + auto input = builder.ConstantR2FromArray2D(input_values); builder.DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. - ComputeAndCompareR2(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR2(&builder, expected_values, {start_data.get()}); } - template - void RunR3(const Array3D& input_values, + template + void RunR3(const Array3D& input_values, const std::vector slice_starts, const std::vector& slice_sizes, - const Array3D& expected_values) { + const Array3D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR3FromArray3D(input_values); + auto input = builder.ConstantR3FromArray3D(input_values); builder.DynamicSlice(input, starts, slice_sizes); // Run computation and compare against expected values. 
- ComputeAndCompareR3(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR3(&builder, expected_values, {start_data.get()}); } }; -XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1(); } +XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1(); } -XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1(); } +XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1(); } -XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1(); } +XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1(); } -XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2(); } +XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2(); } -XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2(); } +XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2(); } -XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2(); } +XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2(); } -XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3(); } +XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3(); } -XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3(); } +XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3(); } -XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3(); } +XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3(); } + +XLA_TEST_F(DynamicSliceTest, Int32R1Pred) { + // Slice at dimension start. + RunR1({true, false, false, true, false, true, true, false}, {0}, + {5}, {true, false, false, true, false}); + // Slice in the middle. + RunR1({true, false, false, true, false, true, true, false}, {2}, + {3}, {false, true, false}); + // Slice at dimension boundaries. + RunR1({true, false, false, true, false, true, true, false}, {5}, + {3}, {true, true, false}); + // Zero element slice. + RunR1({true, false, false, true, false, true, true, false}, {2}, + {0}, {}); +} + +XLA_TEST_F(DynamicSliceTest, Int32R2Pred) { + // Slice at dimension start. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0}, + {2, 2}, {{true, false}, {false, false}}); + // Slice in the middle. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {1, 1}, + {2, 1}, {{false}, {true}}); + // Slice at dimension boundaries. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {1, 1}, + {2, 1}, {{false}, {true}}); + // Zero element slice: 2x0. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0}, + {2, 0}, {{}, {}}); + // Zero element slice: 0x2. + RunR2( + {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0}, + {0, 2}, Array2D(0, 2)); +} + +XLA_TEST_F(DynamicSliceTest, Int32R3Pred) { + // R3 Shape: [2, 3, 2] + // clang-format off + + // Slice at dimension start. + RunR3( + {{{true, false}, {false, true}, {true, true}}, + {{false, true}, {true, false}, {false, false}}}, + {0, 0, 0}, {2, 1, 2}, + {{{true, false}}, {{false, true}}}); + + // Slice in the middle. + RunR3( + {{{true, false}, {false, true}, {true, true}}, + {{false, true}, {true, false}, {false, false}}}, + {0, 1, 1}, {2, 2, 1}, + {{{true}, {true}}, {{false}, {false}}}); + + // clang-format on +} class DynamicUpdateSliceTest : public ClientLibraryTestBase { protected: - template + template void TestR1() { - // clang-format off // Slice at dimension start. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {8.0, 9.0, 10.0}, {0}, - {8.0, 9.0, 10.0, 3.0, 4.0, 5.0, 6.0, 7.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {0}, + {8, 9, 10, 3, 4, 5, 6, 7}); // Slice in the middle. 
- RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {8.0, 9.0, 10.0}, {2}, - {0.0, 1.0, 8.0, 9.0, 10.0, 5.0, 6.0, 7.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {2}, + {0, 1, 8, 9, 10, 5, 6, 7}); // Slice at dimension boundaries. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {8.0, 9.0, 10.0}, {5}, - {0.0, 1.0, 2.0, 3.0, 4.0, 8.0, 9.0, 10.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {5}, + {0, 1, 2, 3, 4, 8, 9, 10}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {8.0, 9.0, 10.0}, {6}, - {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0, 9.0}); + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {6}, + {0, 1, 2, 3, 4, 5, 8, 9}); // Zero-sized update. - RunR1({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, - {}, {2}, - {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}); - // clang-format on + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {}, {2}, + {0, 1, 2, 3, 4, 5, 6, 7}); } - template + template void TestR2() { - // clang-format off // Slice at dimension start. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{10.0f, 11.0f}}, {0, 0}, - {{10.0f, 11.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {0, 0}, + {{10, 11, 3}, {4, 5, 6}, {7, 8, 9}}); // Slice in the middle. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{10.0f, 11.0f}}, {1, 1}, - {{1.0f, 2.0f, 3.0f}, {4.0f, 10.0f, 11.0f}, {7.0f, 8.0f, 9.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {1, 1}, + {{1, 2, 3}, {4, 10, 11}, {7, 8, 9}}); // Slice at dimension boundaries. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{10.0f, 11.0f}}, {2, 1}, - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 10.0f, 11.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 1}, + {{1, 2, 3}, {4, 5, 6}, {7, 10, 11}}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{10.0f, 11.0f}}, {2, 2}, - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 10.0f}}); + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 2}, + {{1, 2, 3}, {4, 5, 6}, {7, 8, 10}}); // Zero-sized update. - RunR2( - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}, - {{}}, {2, 1}, - {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}}); - // clang-format on + RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{}}, {2, 1}, + {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); } - template + template void TestR3() { // R3 Shape: [2, 3, 2] - // clang-format off // Slice at dimension start. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {{{13.0f, 14.0f}, {15.0f, 16.0f}}, - {{17.0f, 18.0f}, {19.0f, 20.0f}}}, - {0, 0, 0}, - {{{13.0f, 14.0f}, {15.0f, 16.0f}, {5.0f, 6.0f}}, - {{17.0f, 18.0f}, {19.0f, 20.0f}, {11.0f, 12.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, + {{{13, 14}, {15, 16}}, {{17, 18}, {19, 20}}}, {0, 0, 0}, + {{{13, 14}, {15, 16}, {5, 6}}, {{17, 18}, {19, 20}, {11, 12}}}); // Slice in the middle. 
- RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {{{13.0f}, {15.0f}}}, - {1, 1, 1}, - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 13.0f}, {11.0f, 15.0f}}}); + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}}, + {1, 1, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 13}, {11, 15}}}); // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR3( - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 12.0f}}}, - {{{13.0f}, {15.0f}}}, - {1, 2, 1}, - {{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}}, - {{7.0f, 8.0f}, {9.0f, 10.0f}, {11.0f, 13.0f}}}); - // clang-format on + RunR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}}, + {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 13}}}); } - template - void RunR1(const std::vector& input_values, - const std::vector& update_values, + template + void RunR1(tensorflow::gtl::ArraySlice input_values, + tensorflow::gtl::ArraySlice update_values, const std::vector slice_starts, - const std::vector& expected_values) { + tensorflow::gtl::ArraySlice expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR1(input_values); - auto update = builder.ConstantR1(update_values); + auto input = builder.ConstantR1(input_values); + auto update = builder.ConstantR1(update_values); builder.DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. - ComputeAndCompareR1(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR1(&builder, expected_values, {start_data.get()}); } - template - void RunR2(const Array2D& input_values, - const Array2D& update_values, + template + void RunR2(const Array2D& input_values, + const Array2D& update_values, const std::vector slice_starts, - const Array2D& expected_values) { + const Array2D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR2FromArray2D(input_values); - auto update = builder.ConstantR2FromArray2D(update_values); + auto input = builder.ConstantR2FromArray2D(input_values); + auto update = builder.ConstantR2FromArray2D(update_values); builder.DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. - ComputeAndCompareR2(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR2(&builder, expected_values, {start_data.get()}); } - template - void RunR3(const Array3D& input_values, - const Array3D& update_values, + template + void RunR3(const Array3D& input_values, + const Array3D& update_values, const std::vector slice_starts, - const Array3D& expected_values) { + const Array3D& expected_values) { ComputationBuilder builder(client_, TestName()); // Initialize and transfer dynamic slice start indices parameter. 
ComputationDataHandle starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. - auto input = builder.ConstantR3FromArray3D(input_values); - auto update = builder.ConstantR3FromArray3D(update_values); + auto input = builder.ConstantR3FromArray3D(input_values); + auto update = builder.ConstantR3FromArray3D(update_values); builder.DynamicUpdateSlice(input, update, starts); // Run computation and compare against expected values. - ComputeAndCompareR3(&builder, expected_values, {start_data.get()}, - ErrorSpec(0.000001)); + ComputeAndCompareR3(&builder, expected_values, {start_data.get()}); } void RunR3Contiguous(std::vector operand_shape, int32 index, @@ -393,23 +408,81 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { } }; -XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1(); } -XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2(); } -XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3(); } -XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3(); } + +XLA_TEST_F(DynamicUpdateSliceTest, Int32R1Pred) { + // Slice at dimension start. + RunR1({false, false, true, true, false, true, true, false}, + {true, true, false}, {0}, + {true, true, false, true, false, true, true, false}); + // Slice in the middle. + RunR1({false, false, true, true, false, true, true, false}, + {false, true, true}, {2}, + {false, false, false, true, true, true, true, false}); + // Slice at dimension boundaries. + RunR1({false, false, true, true, false, true, true, false}, + {false, true, true}, {5}, + {false, false, true, true, false, false, true, true}); + // Zero-sized update. + RunR1({false, false, true, true, false, true, true, false}, {}, + {2}, {false, false, true, true, false, true, true, false}); +} + +XLA_TEST_F(DynamicUpdateSliceTest, Int32R2Pred) { + // Slice at dimension start. + RunR2( + {{false, true, false}, {true, false, true}, {false, true, true}}, + {{true, false}}, {0, 0}, + {{true, false, false}, {true, false, true}, {false, true, true}}); + // Slice in the middle. + RunR2( + {{false, true, false}, {true, false, true}, {false, true, true}}, + {{true, false}}, {1, 1}, + {{false, true, false}, {true, true, false}, {false, true, true}}); + // Slice at dimension boundaries. + RunR2( + {{false, true, false}, {true, false, true}, {false, true, true}}, + {{true, false}}, {2, 1}, + {{false, true, false}, {true, false, true}, {false, true, false}}); + // Zero-sized update. 
+ RunR2( + {{false, true, false}, {true, false, true}, {false, true, true}}, {{}}, + {2, 1}, {{false, true, false}, {true, false, true}, {false, true, true}}); +} + +XLA_TEST_F(DynamicUpdateSliceTest, Int32R3Pred) { + // R3 Shape: [2, 3, 2] + // Slice at dimension start. + RunR3( + {{{true, false}, {false, true}, {true, true}}, + {{false, false}, {false, true}, {true, false}}}, + {{{false, true}, {true, false}}, {{true, true}, {false, true}}}, + {0, 0, 0}, + {{{false, true}, {true, false}, {true, true}}, + {{true, true}, {false, true}, {true, false}}}); + // Slice in the middle. + RunR3({{{true, false}, {false, true}, {true, true}}, + {{false, false}, {false, true}, {true, false}}}, + {{{false}, {true}}}, {1, 1, 1}, + {{{true, false}, {false, true}, {true, true}}, + {{false, false}, {false, false}, {true, true}}}); +} // Tests for simple R3 case where the update is contiguous (i.e. the minor // two dimensions are not sliced). From 94a6e51cb1367e2a77db00f26694bb4c296af6f9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 12:07:06 -0700 Subject: [PATCH 29/56] Adding missing deps to targets in llvm.BUILD. This was only working in non-sandboxed builds. PiperOrigin-RevId: 163103908 --- third_party/llvm/llvm.BUILD | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD index 12bacf3c27d..2d96406d270 100644 --- a/third_party/llvm/llvm.BUILD +++ b/third_party/llvm/llvm.BUILD @@ -899,6 +899,7 @@ cc_library( "include/llvm/Target/ARM/InstPrinter/*.h", "include/llvm/Target/ARM/InstPrinter/*.def", "include/llvm/Target/ARM/InstPrinter/*.inc", + "lib/Target/ARM/*.h", "lib/Target/ARM/InstPrinter/*.h", ]), copts = ["-Iexternal/llvm/lib/Target/ARM"], @@ -1206,6 +1207,7 @@ cc_library( "lib/IR/*.h", ]), hdrs = glob([ + "include/llvm/Analysis/*.def", "include/llvm/IR/*.h", "include/llvm/IR/*.def", "include/llvm/IR/*.inc", @@ -2022,6 +2024,8 @@ cc_library( "lib/Target/*.h", ]), hdrs = glob([ + "include/llvm/CodeGen/*.h", + "include/llvm/CodeGen/*.def", "include/llvm/Target/*.h", "include/llvm/Target/*.def", "include/llvm/Target/*.inc", From 036ea0c39071f41357846100f0bdfe51442a486a Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Tue, 25 Jul 2017 12:20:06 -0700 Subject: [PATCH 30/56] Pass batch_size in params when use_tpu=False. 
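With this change, `params['batch_size']` is populated in TRAIN mode whether or not the TPU path is taken: _per_shard_batch_size() returns train_batch_size // num_shards when use_tpu=True and the unsharded train_batch_size when use_tpu=False, and the value is passed to both input_fn and model_fn, so a model_fn must accept a `params` argument. A minimal sketch of user code that reads it (the toy model below is illustrative only and not part of this change):

    import tensorflow as tf

    def model_fn(features, labels, mode, params):
      # Supplied on TPU and CPU/GPU alike: per-shard size on TPU, global size otherwise.
      batch_size = params['batch_size']
      # A static batch dimension is handy on TPU, where shapes must be fully known.
      x = tf.reshape(features, [batch_size, -1])
      logits = tf.layers.dense(x, 1)  # toy model
      loss = tf.losses.mean_squared_error(
          labels=tf.reshape(labels, [batch_size, 1]), predictions=logits)
      train_op = tf.train.GradientDescentOptimizer(0.01).minimize(
          loss, global_step=tf.train.get_global_step())
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)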
PiperOrigin-RevId: 163105673 --- .../contrib/tpu/python/tpu/tpu_estimator.py | 66 +++++++++---------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 712871cc04a..b6fa185709d 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -54,9 +54,12 @@ def _tpu_job(run_config): return None if run_config.master in ['', 'local'] else 'tpu_worker' -def _per_shard_batch_size(global_batch_size, run_config): +def _per_shard_batch_size(global_batch_size, run_config, use_tpu): """Returns the batch size for each shard.""" - return global_batch_size // run_config.tpu_config.num_shards + if use_tpu: + return global_batch_size // run_config.tpu_config.num_shards + else: + return global_batch_size class _SIGNAL(object): @@ -470,7 +473,7 @@ class _ModelFnWrapper(object): self._train_batch_size = train_batch_size def call_without_tpu(self, features, labels): - return self._call_model_fn(features, labels) + return self._call_model_fn(features, labels, False) def convert_to_single_tpu_train_step(self, dequeue_fn): """Converts the `model_fn` as a single train step on TPU.""" @@ -481,8 +484,8 @@ class _ModelFnWrapper(object): features, labels = dequeue_fn() # Makes deep copy with `config` and params` in case user mutates them. - estimator_spec = self._verify_estimator_spec(self._call_model_fn( - features, labels, add_batch_size_in_params=True)) + estimator_spec = self._verify_estimator_spec( + self._call_model_fn(features, labels, True)) loss, train_op = estimator_spec.loss, estimator_spec.train_op with ops.control_dependencies([train_op]): return array_ops.identity(loss) @@ -492,7 +495,7 @@ class _ModelFnWrapper(object): def config(self): return self._config - def _call_model_fn(self, features, labels, add_batch_size_in_params=False): + def _call_model_fn(self, features, labels, use_tpu): """Calls the model_fn with required parameters.""" model_fn_args = util.fn_args(self._model_fn) kwargs = {} @@ -513,16 +516,15 @@ class _ModelFnWrapper(object): if 'params' in model_fn_args: kwargs['params'] = params - if add_batch_size_in_params: - if 'params' not in model_fn_args: - raise ValueError( - 'model_fn ({}) does not include params argument, ' - 'required by TPUEstimator to pass batch size as ' - 'params[\'batch_size\']'.format(self._model_fn)) - if self._mode == model_fn_lib.ModeKeys.TRAIN: - # For TPU training. `params` is never `None`. - params[_BATCH_SIZE_KEY] = _per_shard_batch_size(self._train_batch_size, - config) + if 'params' not in model_fn_args: + raise ValueError( + 'model_fn ({}) does not include params argument, ' + 'required by TPUEstimator to pass batch size as ' + 'params[\'batch_size\']'.format(self._model_fn)) + if self._mode == model_fn_lib.ModeKeys.TRAIN: + # For TPU training. `params` is never `None`. + params[_BATCH_SIZE_KEY] = _per_shard_batch_size( + self._train_batch_size, config, use_tpu) return self._model_fn(features=features, **kwargs) @@ -609,16 +611,12 @@ class TPUEstimator(estimator_lib.Estimator): 'batch size {} must be divisible by number of shards {}' .format(train_batch_size, config.tpu_config.num_shards)) - if use_tpu: - # Verifies the model_fn signature according to Estimator framework. 
- estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access - # We cannot store config and params in this constructor as parent - # constructor might change them, such as assigning a temp dir for - # config.model_dir. - model_function = augment_model_fn_with_tpu_support( - model_fn, train_batch_size) - else: - model_function = model_fn + # Verifies the model_fn signature according to Estimator framework. + estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access + # We cannot store config and params in this constructor as parent + # constructor might change them, such as assigning a temp dir for + # config.model_dir. + model_function = _augment_model_fn(model_fn, train_batch_size, use_tpu) super(TPUEstimator, self).__init__( model_fn=model_function, @@ -670,9 +668,6 @@ class TPUEstimator(estimator_lib.Estimator): Raises: ValueError: if input_fn takes invalid arguments or does not have `params`. """ - if not self._use_tpu or mode != model_fn_lib.ModeKeys.TRAIN: - return super(TPUEstimator, self)._call_input_fn(input_fn, mode) - input_fn_args = util.fn_args(input_fn) config = self.config # a deep copy. kwargs = {} @@ -686,8 +681,13 @@ class TPUEstimator(estimator_lib.Estimator): kwargs['config'] = config # Now for TPU training. - per_shard_batch_size = _per_shard_batch_size(self._train_batch_size, config) - kwargs['params'][_BATCH_SIZE_KEY] = per_shard_batch_size + if mode == model_fn_lib.ModeKeys.TRAIN: + kwargs['params'][_BATCH_SIZE_KEY] = ( + _per_shard_batch_size(self._train_batch_size, config, self._use_tpu)) + + if not self._use_tpu or mode != model_fn_lib.ModeKeys.TRAIN: + with ops.device('/cpu:0'): + return input_fn(**kwargs) job = _tpu_job(config) def placement_function(index): @@ -746,7 +746,7 @@ def _create_infeed_enqueue_ops_and_dequeue_fn(inputs_holder): return (dequeue_fn, enqueue_fn) -def augment_model_fn_with_tpu_support(model_fn, train_batch_size): +def _augment_model_fn(model_fn, train_batch_size, use_tpu): """Returns a new model_fn, which wraps the TPU support.""" def _model_fn(features, labels, mode, config, params): @@ -755,7 +755,7 @@ def augment_model_fn_with_tpu_support(model_fn, train_batch_size): train_batch_size) # TODO(jhseu): Move to EVAL and PREDICT to TPU. - if mode != model_fn_lib.ModeKeys.TRAIN: + if not use_tpu or mode != model_fn_lib.ModeKeys.TRAIN: return model_fn_wrapper.call_without_tpu(features, labels) inputs = _InputsHolder(sharded_features=features, sharded_labels=labels) From 50d48d606c3b2c08eef249b6fe4f543a51ca8455 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 12:41:23 -0700 Subject: [PATCH 31/56] Remove duplicate import. PiperOrigin-RevId: 163108237 --- tensorflow/python/ops/init_ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py index 1e2f9999957..42b4f952bbc 100644 --- a/tensorflow/python/ops/init_ops.py +++ b/tensorflow/python/ops/init_ops.py @@ -41,7 +41,6 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops -from tensorflow.python.ops import math_ops class Initializer(object): From b4c97bf13b618fbdc22981ce04f9faf358da034c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 12:53:50 -0700 Subject: [PATCH 32/56] Implementation of UnsortedSegmentSum in tf2xla bridge. 
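The bridge kernel computes output[i] = sum over {j : indices[j] == i} of data[j], with zeros for segment ids that never occur, by flattening `indices` and accumulating into the output with a DynamicSlice / DynamicUpdateSlice loop. A NumPy sketch of the intended semantics, consistent with the added Python tests (the function name here is illustrative only):

    import numpy as np

    def unsorted_segment_sum_reference(data, indices, num_segments):
      # output[i] = sum over {j : indices[j] == i} of data[j]; zeros elsewhere.
      # indices.shape must be a prefix of data.shape (as the kernel checks).
      data = np.asarray(data)
      indices = np.asarray(indices)
      out_shape = (num_segments,) + data.shape[indices.ndim:]
      output = np.zeros(out_shape, dtype=data.dtype)
      for j, segment in np.ndenumerate(indices):
        output[segment] += data[j]
      return output

    # unsorted_segment_sum_reference([0, 1, 2, 3, 4, 5], [3, 0, 2, 1, 3, 3], 4)
    # -> [1, 3, 2, 9], as in testUnsortedSegmentSum1DIndices1DData below.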
PiperOrigin-RevId: 163109769 --- tensorflow/compiler/tests/BUILD | 14 ++ .../tests/segment_reduction_ops_test.py | 139 ++++++++++++++++ tensorflow/compiler/tf2xla/kernels/BUILD | 1 + .../tf2xla/kernels/segment_reduction_ops.cc | 155 ++++++++++++++++++ 4 files changed, 309 insertions(+) create mode 100644 tensorflow/compiler/tests/segment_reduction_ops_test.py create mode 100644 tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 4f0137e8d96..c693f58f8bd 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -353,6 +353,20 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "segment_reduction_ops_test", + size = "small", + srcs = ["segment_reduction_ops_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:math_ops_gen", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "spacetobatch_op_test", size = "medium", diff --git a/tensorflow/compiler/tests/segment_reduction_ops_test.py b/tensorflow/compiler/tests/segment_reduction_ops_test.py new file mode 100644 index 00000000000..260a04421b6 --- /dev/null +++ b/tensorflow/compiler/tests/segment_reduction_ops_test.py @@ -0,0 +1,139 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Test cases for segment reduction ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import googletest + + +class SegmentReductionOpsTest(XLATestCase): + """Test cases for segment reduction ops.""" + + def UnsortedSegmentSum(self, data, indices, num_segments): + with self.test_session() as sess, self.test_scope(): + d = array_ops.placeholder(data.dtype, shape=data.shape) + if isinstance(indices, int): + i = array_ops.placeholder(np.int32, shape=[]) + else: + i = array_ops.placeholder(indices.dtype, shape=indices.shape) + return sess.run( + math_ops.unsorted_segment_sum(d, i, num_segments), + {d: data, + i: indices}) + + def testUnsortedSegmentSum0DIndices1DData(self): + for dtype in self.numeric_types: + self.assertAllClose( + np.array( + [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5], + [0, 0, 0, 0, 0, 0]], + dtype=dtype), + self.UnsortedSegmentSum( + np.array([0, 1, 2, 3, 4, 5], dtype=dtype), 2, 4)) + + def testUnsortedSegmentSum1DIndices1DData(self): + for dtype in self.numeric_types: + self.assertAllClose( + np.array([1, 3, 2, 9], dtype=dtype), + self.UnsortedSegmentSum( + np.array([0, 1, 2, 3, 4, 5], dtype=dtype), + np.array([3, 0, 2, 1, 3, 3], dtype=np.int32), 4)) + + def testUnsortedSegmentSum1DIndices2DDataDisjoint(self): + for dtype in self.numeric_types: + data = np.array( + [[0, 1, 2, 3], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43], + [50, 51, 52, 53]], + dtype=dtype) + indices = np.array([8, 1, 0, 3, 7], dtype=np.int32) + num_segments = 10 + y = self.UnsortedSegmentSum(data, indices, num_segments) + self.assertAllClose( + np.array( + [[30, 31, 32, 33], [20, 21, 22, 23], [0, 0, 0, 0], + [40, 41, 42, 43], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], + [50, 51, 52, 53], [0, 1, 2, 3], [0, 0, 0, 0]], + dtype=dtype), y) + + def testUnsortedSegmentSum1DIndices2DDataNonDisjoint(self): + for dtype in self.numeric_types: + data = np.array( + [[0, 1, 2, 3], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43], + [50, 51, 52, 53]], + dtype=dtype) + indices = np.array([0, 1, 2, 0, 1], dtype=np.int32) + num_segments = 4 + y = self.UnsortedSegmentSum(data, indices, num_segments) + self.assertAllClose( + np.array( + [[40, 42, 44, 46], [70, 72, 74, 76], [30, 31, 32, 33], + [0, 0, 0, 0]], + dtype=dtype), y) + + def testUnsortedSegmentSum2DIndices3DData(self): + for dtype in self.numeric_types: + data = np.array( + [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]], + [[200, 201, 202], [210, 211, 212]], [[300, 301, 302], + [310, 311, 312]]], + dtype=dtype) + indices = np.array([[3, 5], [3, 1], [5, 0], [6, 2]], dtype=np.int32) + num_segments = 8 + y = self.UnsortedSegmentSum(data, indices, num_segments) + self.assertAllClose( + np.array( + [[210, 211, 212], [110, 111, 112], [310, 311, 312], + [100, 102, 104], [0, 0, 0.], [210, 212, 214], [300, 301, + 302], [0, 0, 0]], + dtype=dtype), y) + + def testUnsortedSegmentSum1DIndices3DData(self): + for dtype in self.numeric_types: + data = np.array( + [[[0, 1, 2], [10, 11, 12]], [[100, 101, 102], [110, 111, 112]], + [[200, 201, 202], [210, 211, 212]], [[300, 301, 302], + [310, 311, 312]]], + dtype=dtype) + indices = np.array([3, 0, 2, 5], 
dtype=np.int32) + num_segments = 6 + y = self.UnsortedSegmentSum(data, indices, num_segments) + self.assertAllClose( + np.array( + [[[100, 101, 102.], [110, 111, 112]], [[0, 0, 0], [0, 0, 0]], + [[200, 201, 202], [210, 211, 212]], [[0, 1, 2.], [10, 11, 12]], + [[0, 0, 0], [0, 0, 0]], [[300, 301, 302], [310, 311, 312]]], + dtype=dtype), y) + + def testUnsortedSegmentSumShapeError(self): + for dtype in self.numeric_types: + data = np.ones((4, 8, 7), dtype=dtype) + indices = np.ones((3, 2), dtype=np.int32) + num_segments = 4 + self.assertRaises(ValueError, + functools.partial(self.UnsortedSegmentSum, data, + indices, num_segments)) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 35bc6b5a24e..546e9be8647 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -47,6 +47,7 @@ tf_kernel_library( "reshape_op.cc", "retval_op.cc", "reverse_op.cc", + "segment_reduction_ops.cc", "select_op.cc", "sequence_ops.cc", "shape_op.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc new file mode 100644 index 00000000000..6a0ce775dc6 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc @@ -0,0 +1,155 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { +namespace { + +class UnsortedSegmentSum : public XlaOpKernel { + public: + explicit UnsortedSegmentSum(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + // output = unsorted_segment_sum(data, indices, num_segments) + // Compute a tensor such that: + // output[i] = sum over {j where indices[j] == i} of data[j] + // output[i] == 0 if i does not appear in indices + // + // Contrast with segment_sum(), which assumes indices are sorted and that + // max(indices)+1 is the desired size of the output. + // + // The returned output tensor has the same type as data, and the same shape + // as data with the first indices.rank dimensions are replaced + // by a single dimension with size num_segments. 
+ + xla::ComputationBuilder* builder = ctx->builder(); + + auto data = ctx->Input(0); + auto data_shape = ctx->InputShape(0); + + auto indices = ctx->Input(1); + auto indices_shape = ctx->InputShape(1); + + OP_REQUIRES(ctx, data_shape.dims() >= indices_shape.dims(), + errors::InvalidArgument( + "UnsortedSegmentSum requires that indices' rank be" + " less than or equal to data's rank.")); + // Validate that indices.shape is a prefix of data.shape. + for (int d = 0; d < indices_shape.dims(); ++d) { + OP_REQUIRES(ctx, (data_shape.dim_size(d) == indices_shape.dim_size(d)), + errors::InvalidArgument( + "UnsortedSegmentSum requires indices shape to be prefix" + " of data_shape, but dimension ", + d, " differs ", data_shape.dim_size(d), " vs. ", + indices_shape.dim_size(d))); + } + + int64 num_segments; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(2, &num_segments)); + + // Flatten the indices into 1-D. + auto indices_1d = builder->Reshape(indices, {indices_shape.num_elements()}); + + // flatten data for dynamic indexing. + int64 out_tensor_dims = data_shape.dims() - indices_shape.dims(); + std::vector flat_shape(1 + out_tensor_dims); + flat_shape[0] = indices_shape.num_elements(); + for (int64 k = 0; k < out_tensor_dims; ++k) { + flat_shape[1 + k] = data_shape.dim_size(indices_shape.dims() + k); + } + auto data_flat = builder->Reshape(data, flat_shape); + + // output shape; same as data_shape, but dimension 0 is num_segments. + std::vector out_shape(flat_shape); + out_shape[0] = num_segments; + + // Pad the output array dims to rank >= 3 to work around lowering issues. + // TODO(b/37575001) This is awkward, and could be improved. + int64 extra_dims = 0; + if (out_shape.size() < 3) { + extra_dims = 3u - out_shape.size(); + } + std::vector rshape(extra_dims + out_shape.size(), 1); + for (unsigned k = 0; k < out_shape.size(); ++k) { + rshape[extra_dims + k] = out_shape[k]; + } + auto output = builder->Broadcast(XlaHelpers::Zero(builder, dtype_), rshape); + + auto zero = builder->ConstantR1({0}); + + for (int64 i = 0; i < indices_shape.num_elements(); ++i) { + // output[indices[i]] += data[i] + + std::vector data_start_indices(flat_shape.size()); + data_start_indices[0] = i; + for (unsigned d = 1; d < flat_shape.size(); ++d) { + data_start_indices[d] = 0; + } + std::vector data_limit_indices(flat_shape); + data_limit_indices[0] = i + 1; + std::vector stride(flat_shape.size(), 1); + + auto data_slice = builder->Slice(data_flat, data_start_indices, + data_limit_indices, stride); + + // Reshape the sliced data into the R3+ shape to match output array. + std::vector rdata_shape(extra_dims + flat_shape.size()); + for (int64 k = 0; k <= extra_dims; ++k) { + rdata_shape[k] = 1; + } + for (unsigned k = 1; k < data_limit_indices.size(); ++k) { + rdata_shape[extra_dims + k] = data_limit_indices[k]; + } + auto rdata_slice = builder->Reshape(data_slice, rdata_shape); + + auto index = builder->Slice(indices_1d, {i}, {i + 1}, {1}); + + // Construct the index into the R3+ output array 0, ..., , 0, ... 
+ std::vector out_start_index_parts( + extra_dims + flat_shape.size(), zero); + out_start_index_parts[extra_dims] = builder->Reshape(index, {1}); + auto out_start_indices = builder->ConcatInDim(out_start_index_parts, 0); + + std::vector slice_size(rshape); + slice_size[extra_dims] = 1; + + auto out_slice = + builder->DynamicSlice(output, out_start_indices, slice_size); + auto sumval = builder->Add(out_slice, rdata_slice); + output = builder->DynamicUpdateSlice(output, sumval, out_start_indices); + } + auto reshaped_output = builder->Reshape(output, out_shape); + ctx->SetOutput(0, reshaped_output); + } + + private: + DataType dtype_; +}; + +REGISTER_XLA_OP(Name("UnsortedSegmentSum"), UnsortedSegmentSum); + +} // namespace +} // namespace tensorflow From fbe6ae13a36c80454f032a37577ab199a67ef01d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 13:03:58 -0700 Subject: [PATCH 33/56] Add gradient checking tests for nn.moments(). PiperOrigin-RevId: 163110994 --- tensorflow/python/ops/nn_test.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 87f6f92a8a8..cc8c6239470 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -830,7 +830,8 @@ class ReluTest(test_lib.TestCase): class MomentsTest(test_lib.TestCase): - def doOutputTest(self, input_shape, moments_axes, tol=1e-4): + def doOutputTest(self, input_shape, moments_axes, tol=1e-4, + check_gradients=False): for mu in [0.0, 1.0, 1e3]: for sigma in [1.0, 0.1]: for keep_dims in [True, False]: @@ -846,6 +847,15 @@ class MomentsTest(test_lib.TestCase): mean, variance = nn_impl.moments( inputs, moments_axes, keep_dims=keep_dims) + if check_gradients: + err = gradient_checker.compute_gradient_error( + inputs, input_shape, mean, mean.shape.as_list()) + self.assertLess(err, 1e-3) + err = gradient_checker.compute_gradient_error( + inputs, input_shape, variance, variance.shape.as_list()) + self.assertLess(err, 1e-3) + + # Evaluate. 
[mean, variance] = sess.run([mean, variance]) # Make sure that there are no NaNs self.assertFalse(np.isnan(mean).any()) @@ -853,6 +863,12 @@ class MomentsTest(test_lib.TestCase): self.assertAllClose(mean, expected_mean, rtol=tol, atol=tol) self.assertAllClose(variance, expected_var, rtol=tol, atol=tol) + def testOutputAndGradient2DInput0(self): + self.doOutputTest((10, 10), (0,), check_gradients=True) + + def testOutputAndGradient2DInput01(self): + self.doOutputTest((10, 10), (0, 1), check_gradients=True) + def testOutput2DInput0(self): self.doOutputTest((10, 300), (0,)) From 6f70e57802a518bddca3d3991825d8b13ee3c61a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 25 Jul 2017 13:19:20 -0700 Subject: [PATCH 34/56] Improved the speed of constant folding PiperOrigin-RevId: 163113085 --- .../grappler/optimizers/constant_folding.cc | 83 +++++++++---------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index db279ae67f3..7f845bb9e2c 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -104,7 +104,8 @@ ConstantFolding::ConstantFolding() { ops_to_preserve_ = std::regex( "Placeholder.*|Const|.*Save.*|.*Restore.*|.*Reader|" "Enter|RefEnter|Exit|RefExit|NextIteration|RefNextIteration|" - ".*Quantized.*"); + ".*Quantized.*", + std::regex_constants::optimize); } string ConstantFolding::AddControlDependency(const string& input_name) { @@ -240,13 +241,18 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item, } bool ConstantFolding::IsFoldable(const NodeDef& node) const { + // Folding not applicable to ops with no inputs. + if (node.input().empty()) { + return false; + } + // Skips nodes that must be preserved, and op_types that don't benefit from // folding if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) { return false; } - std::cmatch match; - if (std::regex_match(node.op().c_str(), match, ops_to_preserve_)) { + if (std::regex_match(node.op().c_str(), ops_to_preserve_, + std::regex_constants::match_any)) { return false; } @@ -264,23 +270,6 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const { return false; } - DeviceTypeVector device_types; - status = SupportedDeviceTypesForNode({DeviceType(DEVICE_CPU)}, node, - &device_types); - if (!status.ok()) { - return false; - } - // Only fold ops with a CPU implementation available. - if (device_types.empty()) { - return false; - } - DCHECK_EQ(DeviceType(DEVICE_CPU), device_types[0]); - - // Folding not applicable to ops with no inputs. - if (node.input().empty()) { - return false; - } - // No need to (and don't) fold nodes that have no outgoing edges. Such nodes // could be introduced by an earlier constant folding pass and are preserved // in case users want to fetch their values; re-processing them would @@ -391,12 +380,15 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node, // Control dependency break; } - // There should be a single output since the input node should be a constant - // node. 
- TensorVector output; - TF_RETURN_IF_ERROR( - EvaluateNode(*node_map_->GetNode(input), TensorVector(), &output)); - inputs.push_back(output[position]); + const NodeDef* input_node = node_map_->GetNode(input); + if (!IsConstant(*input_node)) { + return Status(error::INVALID_ARGUMENT, + strings::StrCat("Can't fold ", node.name(), ", its ", input, + " isn't constant")); + } + Tensor* value = new Tensor(input_node->attr().at("dtype").type()); + CHECK(value->FromProto(input_node->attr().at("value").tensor())); + inputs.emplace_back(value); } TensorVector output_tensors; @@ -583,24 +575,31 @@ Status ConstantFolding::FoldNode(const NodeDef& node, GraphDef* output) { Status ConstantFolding::FoldGraph(GraphDef* output) { std::unordered_set processed_nodes; - int previously_processed = 0; - do { - previously_processed = processed_nodes.size(); - for (const auto& node : graph_.node()) { - if (IsFoldable(node) && - processed_nodes.find(node.name()) == processed_nodes.end()) { - Status s = FoldNode(node, output); - if (!s.ok()) { - VLOG(1) << "Failed to fold node " << node.name() << ": " << s; + std::deque queue; + for (const auto& node : graph_.node()) { + if (IsFoldable(node)) { + queue.push_back(&node); + } + } + while (!queue.empty()) { + const NodeDef* node = queue.front(); + queue.pop_front(); + if (processed_nodes.count(node->name())) { + continue; + } + Status s = FoldNode(*node, output); + processed_nodes.insert(node->name()); + if (!s.ok()) { + VLOG(1) << "Failed to fold node " << node->name() << ": " << s; + } else { + auto outputs = node_map_->GetOutputs(node->name()); + for (auto& output : outputs) { + if (IsFoldable(*output)) { + queue.push_back(output); } - processed_nodes.insert(node.name()); } } - // Try again as long as we find new constants. In most cases, this loop will - // only run once since the graph is already in topological order. - VLOG(1) << "Folded " << processed_nodes.size() - previously_processed - << " nodes in this pass"; - } while (previously_processed != processed_nodes.size()); + } // Build the graph after constant folding. Note that we keep all processed // nodes in the graph in case users need to fetch their values. @@ -740,7 +739,6 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output, Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* output) { graph_ = item.graph; - LOG(INFO) << "Initial graph size: " << item.graph.node_size(); node_map_.reset(new NodeMap(&graph_)); for (const auto& node : item.fetch) { nodes_to_preserve_.insert(NodeName(node)); @@ -761,7 +759,6 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item, TF_RETURN_IF_ERROR(FoldGraph(output)); TF_RETURN_IF_ERROR(SimplifyGraph(output, properties)); - LOG(INFO) << "Optimized graph size: " << output->node_size(); *output->mutable_library() = item.graph.library(); *output->mutable_versions() = item.graph.versions(); From 73ea287120184a694e611ad6cbc356fab9ad0f25 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 13:30:03 -0700 Subject: [PATCH 35/56] Convert configure to python. 
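The interactive logic that lived in the `configure` bash script (prompts, environment-variable overrides, and writing build options into .tf_configure.bazelrc) is ported to a new configure.py; the shell script becomes a thin wrapper that locates a Python interpreter and runs `$PYTHON_BIN_PATH configure.py`. The new configure.py (950 lines per the diffstat) carries the ported logic; as a sketch of one piece of what gets ported, the deleted bash helpers write_to_bazelrc / write_action_env_to_bazelrc map to something like the following (a sketch only, not the actual configure.py contents):

    # Sketch of the bazelrc-writing helpers ported from bash; names are assumed.
    _TF_BAZELRC = '.tf_configure.bazelrc'

    def write_to_bazelrc(line):
      # Bash: echo "$1" >> .tf_configure.bazelrc
      with open(_TF_BAZELRC, 'a') as f:
        f.write(line + '\n')

    def write_action_env_to_bazelrc(var_name, var):
      # Bash: write_to_bazelrc "build --action_env $1=\"$2\""
      write_to_bazelrc('build --action_env %s="%s"' % (var_name, var))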
PiperOrigin-RevId: 163114551 --- configure | 814 +-------------- configure.py | 950 ++++++++++++++++++ tensorflow/tools/ci_build/builds/configured | 2 +- .../tools/ci_build/builds/run_pip_tests.sh | 2 +- .../tools/ci_build/linux/cpu/run_cc_core.sh | 2 +- .../tools/ci_build/linux/cpu/run_py2_core.sh | 2 +- .../ci_build/linux/cpu/run_py3_contrib.sh | 2 +- .../tools/ci_build/linux/cpu/run_py3_core.sh | 2 +- .../tools/ci_build/linux/gpu/run_cc_core.sh | 2 +- .../tools/ci_build/linux/gpu/run_py3_core.sh | 2 +- .../tools/ci_build/osx/cpu/run_py2_cc_core.sh | 2 +- .../tools/ci_build/xla/linux/gpu/run_py3.sh | 2 +- 12 files changed, 965 insertions(+), 819 deletions(-) create mode 100644 configure.py diff --git a/configure b/configure index 1eeaffaf74c..c6df6992d9e 100755 --- a/configure +++ b/configure @@ -3,816 +3,12 @@ set -e set -o pipefail -MIN_BAZEL_VERSION=0.4.5 - -# Find out the absolute path to where ./configure resides -pushd `dirname $0` > /dev/null -SOURCE_BASE_DIR=`pwd -P` -popd > /dev/null - -PLATFORM="$(uname -s | tr 'A-Z' 'a-z')" - -function is_linux() { - [[ "${PLATFORM}" == "linux" ]] -} - -function is_macos() { - [[ "${PLATFORM}" == "darwin" ]] -} - -function is_windows() { - # On windows, the shell script is actually running in msys - [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]] -} - -function is_ppc64le() { - [[ "$(uname -m)" == "ppc64le" ]] -} - -function sed_in_place() { - sed -e $1 $2 > "$2.bak" - mv "$2.bak" $2 -} - -function write_to_bazelrc() { - echo "$1" >> .tf_configure.bazelrc -} - -function write_action_env_to_bazelrc() { - write_to_bazelrc "build --action_env $1=\"$2\"" -} - -function python_path { - "$PYTHON_BIN_PATH" - <&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - PYTHON_BIN_PATH="" - # Retry - done - - if [ -z "$PYTHON_LIB_PATH" ]; then - # Split python_path into an array of paths, this allows path containing spaces - IFS=',' read -r -a python_lib_path <<< "$(python_path)" - - if [ 1 = "$USE_DEFAULT_PYTHON_LIB_PATH" ]; then - PYTHON_LIB_PATH=${python_lib_path[0]} - echo "Using python library path: $PYTHON_LIB_PATH" - - else - echo "Found possible Python library paths:" - for x in "${python_lib_path[@]}"; do - echo " $x" - done - set -- "${python_lib_path[@]}" - echo "Please input the desired Python library path to use. Default is [$1]" - read b || true - if [ "$b" == "" ]; then - PYTHON_LIB_PATH=${python_lib_path[0]} - echo "Using python library path: $PYTHON_LIB_PATH" - else - PYTHON_LIB_PATH="$b" - fi - fi - fi - - if [ ! -x "$PYTHON_BIN_PATH" ] || [ -d "$PYTHON_BIN_PATH" ]; then - echo "PYTHON_BIN_PATH is not executable. Is it the python binary?" - exit 1 - fi - - local python_major_version - python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);' | head -c1) - if [ -z "$python_major_version" ]; then - echo -e "\n\nERROR: Problem getting python version. Is $PYTHON_BIN_PATH the correct python binary?" 
- exit 1 - fi - - # Convert python path to Windows style before writing into bazel.rc - if is_windows; then - PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")" - PYTHON_LIB_PATH="$(cygpath -m "$PYTHON_LIB_PATH")" - fi - - # Set-up env variables used by python_configure.bzl - write_action_env_to_bazelrc "PYTHON_BIN_PATH" "$PYTHON_BIN_PATH" - write_action_env_to_bazelrc "PYTHON_LIB_PATH" "$PYTHON_LIB_PATH" - write_to_bazelrc "build --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" - write_to_bazelrc "build --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" - write_to_bazelrc "build --force_python=py$python_major_version" - write_to_bazelrc "build --host_force_python=py$python_major_version" - write_to_bazelrc "build --python${python_major_version}_path=\"$PYTHON_BIN_PATH\"" - write_to_bazelrc "test --force_python=py$python_major_version" - write_to_bazelrc "test --host_force_python=py$python_major_version" - write_to_bazelrc "test --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" - write_to_bazelrc "test --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" - write_to_bazelrc "run --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" - write_to_bazelrc "run --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\"" - - # Write tools/python_bin_path.sh - echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh -} - -function version { - echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'; -} - - -bazel version > bazel.version -set +e -curr_bazel_version=$(grep -m 1 'Build label:' bazel.version | cut -d ' ' -f3) -set -e -rm -f bazel.version - - -echo "You have bazel $curr_bazel_version installed." -if [ -z "$curr_bazel_version" ]; then - echo "WARNING: current bazel installation is not a release version." - echo "Make sure you are running at least bazel $MIN_BAZEL_VERSION." -elif [ "$(version "$MIN_BAZEL_VERSION")" -gt "$(version "$curr_bazel_version")" ]; then - echo "Please upgrade your bazel installation to version $MIN_BAZEL_VERSION or higher to build TensorFlow!" - echo "Exiting..." - exit 1 +if [ -z "$PYTHON_BIN_PATH" ]; then + PYTHON_BIN_PATH=$(which python || which python3 || true) fi -# This file contains customized config settings. -rm -f .tf_configure.bazelrc -touch .tf_configure.bazelrc -if [[ ! -e .bazelrc ]]; then - if [[ -e "${HOME}/.bazelrc" ]]; then - echo "import ${HOME}/.bazelrc" >.bazelrc - else - touch .bazelrc - fi -fi -sed_in_place "/tf_configure/d" .bazelrc -echo "import %workspace%/.tf_configure.bazelrc" >> .bazelrc +# Set all env variables +$PYTHON_BIN_PATH configure.py -# Delete any leftover BUILD files from the Makefile build, which would interfere -# with Bazel parsing. -MAKEFILE_DOWNLOAD_DIR=tensorflow/contrib/makefile/downloads -if [ -d "${MAKEFILE_DOWNLOAD_DIR}" ]; then - find ${MAKEFILE_DOWNLOAD_DIR} -type f -name '*BUILD' -delete -fi -setup_python - -## Set up MKL related environment settings -write_to_bazelrc 'build:mkl --define with_mkl_support=true' -write_to_bazelrc 'build:mkl --define using_mkl=true' -write_to_bazelrc 'build:mkl -c opt' -write_to_bazelrc 'build:mkl --copt="-DEIGEN_USE_VML"' -echo "" -echo "Add \"--config=mkl\" to your bazel command to build with MKL support." -echo "Please note that MKL on MacOS or windows is still not supported." -echo "If you would like to use a local MKL instead of downloading, please " -echo " set the environment variable \"TF_MKL_ROOT\" every time before build." -echo "" -## End MKL setup - -## Set up architecture-dependent optimization flags. 
-if [ -z "$CC_OPT_FLAGS" ]; then - if is_ppc64le; then - # gcc on ppc64le does not support -march, use mcpu instead - default_cc_opt_flags="-mcpu=native" - else - default_cc_opt_flags="-march=native" - fi - read -p "Please specify optimization flags to use during compilation when bazel option "\ -"\"--config=opt\" is specified [Default is $default_cc_opt_flags]: " CC_OPT_FLAGS - if [ -z "$CC_OPT_FLAGS" ]; then - CC_OPT_FLAGS=$default_cc_opt_flags - fi -fi - -if is_windows; then - TF_NEED_GCP=0 - TF_NEED_HDFS=0 - TF_NEED_JEMALLOC=0 - TF_NEED_OPENCL=0 - TF_CUDA_CLANG=0 -fi - -if is_linux; then - while [ "$TF_NEED_JEMALLOC" == "" ]; do - read -p "Do you wish to use jemalloc as the malloc implementation? [Y/n] "\ - INPUT - case $INPUT in - [Yy]* ) echo "jemalloc enabled"; TF_NEED_JEMALLOC=1;; - [Nn]* ) echo "jemalloc disabled"; TF_NEED_JEMALLOC=0;; - "" ) echo "jemalloc enabled"; TF_NEED_JEMALLOC=1;; - * ) echo "Invalid selection: " $INPUT;; - esac - done -else - TF_NEED_JEMALLOC=0 -fi - -if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then - write_to_bazelrc 'build --define with_jemalloc=true' -fi - -while [[ "$TF_NEED_GCP" == "" ]]; do - read -p "Do you wish to build TensorFlow with "\ -"Google Cloud Platform support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "Google Cloud Platform support will be enabled for "\ -"TensorFlow"; TF_NEED_GCP=1;; - [Nn]* ) echo "No Google Cloud Platform support will be enabled for "\ -"TensorFlow"; TF_NEED_GCP=0;; - "" ) echo "No Google Cloud Platform support will be enabled for "\ -"TensorFlow"; TF_NEED_GCP=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -if [[ "$TF_NEED_GCP" == "1" ]]; then - write_to_bazelrc 'build --define with_gcp_support=true' -fi - -while [[ "$TF_NEED_HDFS" == "" ]]; do - read -p "Do you wish to build TensorFlow with "\ -"Hadoop File System support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "Hadoop File System support will be enabled for "\ -"TensorFlow"; TF_NEED_HDFS=1;; - [Nn]* ) echo "No Hadoop File System support will be enabled for "\ -"TensorFlow"; TF_NEED_HDFS=0;; - "" ) echo "No Hadoop File System support will be enabled for "\ -"TensorFlow"; TF_NEED_HDFS=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -if [[ "$TF_NEED_HDFS" == "1" ]]; then - write_to_bazelrc 'build --define with_hdfs_support=true' -fi - -## Enable XLA. -while [[ "$TF_ENABLE_XLA" == "" ]]; do - read -p "Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=1;; - [Nn]* ) echo "No XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=0;; - "" ) echo "No XLA support will be enabled for TensorFlow"; TF_ENABLE_XLA=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -if [[ "$TF_ENABLE_XLA" == "1" ]]; then - write_to_bazelrc 'build --define with_xla_support=true' -fi - -# Verbs configuration -while [ "$TF_NEED_VERBS" == "" ]; do - read -p "Do you wish to build TensorFlow with "\ -"VERBS support? 
[y/N] " INPUT - case $INPUT in - [Yy]* ) echo "VERBS support will be enabled for "\ -"TensorFlow"; TF_NEED_VERBS=1;; - [Nn]* ) echo "No VERBS support will be enabled for "\ -"TensorFlow"; TF_NEED_VERBS=0;; - "" ) echo "No VERBS support will be enabled for "\ -"TensorFlow"; TF_NEED_VERBS=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -if [[ "$TF_NEED_VERBS" == "1" ]]; then - write_to_bazelrc 'build --define with_verbs_support=true' -fi - -# Append CC optimization flags to bazel.rc -for opt in $CC_OPT_FLAGS; do - write_to_bazelrc "build:opt --cxxopt=$opt --copt=$opt" -done - -# Run the gen_git_source to create links where bazel can track dependencies for -# git hash propagation -GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py -chmod a+x ${GEN_GIT_SOURCE} -"${PYTHON_BIN_PATH}" ${GEN_GIT_SOURCE} --configure "${SOURCE_BASE_DIR}" - -## Set up SYCL-related environment settings -while [ "$TF_NEED_OPENCL" == "" ]; do - read -p "Do you wish to build TensorFlow with OpenCL support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=1;; - [Nn]* ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;; - "" ) echo "No OpenCL support will be enabled for TensorFlow"; TF_NEED_OPENCL=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -## Set up Cuda-related environment settings - -while [ "$TF_NEED_CUDA" == "" ]; do - read -p "Do you wish to build TensorFlow with CUDA support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=1;; - [Nn]* ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;; - "" ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -export TF_NEED_CUDA -write_action_env_to_bazelrc "TF_NEED_CUDA" "$TF_NEED_CUDA" - -export TF_NEED_OPENCL -write_action_env_to_bazelrc "TF_NEED_OPENCL" "$TF_NEED_OPENCL" - -if [ "$TF_NEED_CUDA" == "1" ]; then -while [[ "$TF_CUDA_CLANG" == "" ]]; do - read -p "Do you want to use clang as CUDA compiler? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "Clang will be used as CUDA compiler"; TF_CUDA_CLANG=1;; - [Nn]* ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;; - "" ) echo "nvcc will be used as CUDA compiler"; TF_CUDA_CLANG=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -export TF_CUDA_CLANG -write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG" - -# Set up which clang we should use as the cuda / host compiler. -while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do - fromuser="" - if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then - default_clang_host_compiler_path=$(which clang || true) - read -p "Please specify which clang should be used as device and host compiler. [Default is $default_clang_host_compiler_path]: " CLANG_CUDA_COMPILER_PATH - fromuser="1" - if [ -z "$CLANG_CUDA_COMPILER_PATH" ]; then - CLANG_CUDA_COMPILER_PATH="$default_clang_host_compiler_path" - fi - fi - if [ -e "$CLANG_CUDA_COMPILER_PATH" ]; then - export CLANG_CUDA_COMPILER_PATH - write_action_env_to_bazelrc "CLANG_CUDA_COMPILER_PATH" "$CLANG_CUDA_COMPILER_PATH" - break - fi - echo "Invalid clang path. ${CLANG_CUDA_COMPILER_PATH} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - CLANG_CUDA_COMPILER_PATH="" - # Retry -done - -# Find out where the CUDA toolkit is installed -while true; do - # Configure the Cuda SDK version to use. 
- if [ -z "$TF_CUDA_VERSION" ]; then - read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: " TF_CUDA_VERSION - fi - # Set default CUDA version if not set - TF_CUDA_VERSION=${TF_CUDA_VERSION:-8.0} - - fromuser="" - if [ -z "$CUDA_TOOLKIT_PATH" ]; then - default_cuda_path=/usr/local/cuda - if is_windows; then - if [ -z "$CUDA_PATH" ]; then - default_cuda_path="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0" - else - default_cuda_path="$(cygpath -m "$CUDA_PATH")" - fi - elif is_linux; then - # If the default doesn't exist, try an alternative default. - if [ ! -d $default_cuda_path ] && [ -d /opt/cuda ]; then - default_cuda_path=/opt/cuda - fi - fi - read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH - fromuser="1" - if [ -z "$CUDA_TOOLKIT_PATH" ]; then - CUDA_TOOLKIT_PATH="$default_cuda_path" - fi - fi - - if [[ -z "$TF_CUDA_VERSION" ]]; then - TF_CUDA_EXT="" - else - TF_CUDA_EXT=".$TF_CUDA_VERSION" - fi - - if is_windows; then - CUDA_RT_LIB_PATH="lib/x64/cudart.lib" - elif is_linux; then - CUDA_RT_LIB_PATH="lib64/libcudart.so${TF_CUDA_EXT}" - elif is_macos; then - CUDA_RT_LIB_PATH="lib/libcudart${TF_CUDA_EXT}.dylib" - fi - - if [ -e "${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH}" ]; then - export CUDA_TOOLKIT_PATH - write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH" - export TF_CUDA_VERSION - break - fi - echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found" - - if [ -z "$fromuser" ]; then - exit 1 - fi - # Retry - TF_CUDA_VERSION="" - CUDA_TOOLKIT_PATH="" -done - -export TF_CUDA_VERSION -write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" - -# Set up which gcc nvcc should use as the host compiler -# No need to set this on Windows -while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do - fromuser="" - if [ -z "$GCC_HOST_COMPILER_PATH" ]; then - default_gcc_host_compiler_path=$(which gcc || true) - cuda_bin_symlink="$CUDA_TOOLKIT_PATH/bin/gcc" - if [ -L "$cuda_bin_symlink" ]; then - default_gcc_host_compiler_path=$(readlink $cuda_bin_symlink) - fi - read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH - fromuser="1" - if [ -z "$GCC_HOST_COMPILER_PATH" ]; then - GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path" - fi - fi - if [ -e "$GCC_HOST_COMPILER_PATH" ]; then - export GCC_HOST_COMPILER_PATH - write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH" - break - fi - echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - GCC_HOST_COMPILER_PATH="" - # Retry -done - -# Find out where the cuDNN library is installed -while true; do - # Configure the cuDNN version to use. - if [ -z "$TF_CUDNN_VERSION" ]; then - read -p "Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: " TF_CUDNN_VERSION - fi - # Set default CUDNN version if not set - TF_CUDNN_VERSION=${TF_CUDNN_VERSION:-6} - - fromuser="" - if [ -z "$CUDNN_INSTALL_PATH" ]; then - default_cudnn_path=${CUDA_TOOLKIT_PATH} - read -p "Please specify the location where cuDNN $TF_CUDNN_VERSION library is installed. Refer to README.md for more details. 
[Default is $default_cudnn_path]: " CUDNN_INSTALL_PATH - fromuser="1" - if [ -z "$CUDNN_INSTALL_PATH" ]; then - CUDNN_INSTALL_PATH=$default_cudnn_path - fi - # Result returned from "read" will be used unexpanded. That make "~" unusable. - # Going through one more level of expansion to handle that. - CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"` - if is_windows; then - CUDNN_INSTALL_PATH="$(cygpath -m "$CUDNN_INSTALL_PATH")" - fi - fi - - if [[ -z "$TF_CUDNN_VERSION" ]]; then - TF_CUDNN_EXT="" - else - TF_CUDNN_EXT=".$TF_CUDNN_VERSION" - fi - - if is_windows; then - CUDA_DNN_LIB_PATH="lib/x64/cudnn.lib" - CUDA_DNN_LIB_ALT_PATH="lib/x64/cudnn.lib" - elif is_linux; then - CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}" - CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}" - elif is_macos; then - CUDA_DNN_LIB_PATH="lib/libcudnn${TF_CUDNN_EXT}.dylib" - CUDA_DNN_LIB_ALT_PATH="libcudnn${TF_CUDNN_EXT}.dylib" - fi - - if [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_ALT_PATH}" ] || [ -e "$CUDNN_INSTALL_PATH/${CUDA_DNN_LIB_PATH}" ]; then - export TF_CUDNN_VERSION - write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" - export CUDNN_INSTALL_PATH - write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH" - break - fi - - if is_linux; then - if ! type ldconfig > /dev/null 2>&1; then - LDCONFIG_BIN=/sbin/ldconfig - else - LDCONFIG_BIN=ldconfig - fi - CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')" - if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then - export TF_CUDNN_VERSION - export CUDNN_INSTALL_PATH - CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})" - write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH" - break - fi - fi - echo "Invalid path to cuDNN ${CUDNN_VERSION} toolkit. Neither of the following two files can be found:" - echo "${CUDNN_INSTALL_PATH}/${CUDA_DNN_LIB_PATH}" - echo "${CUDNN_INSTALL_PATH}/${CUDA_DNN_LIB_ALT_PATH}" - if is_linux; then - echo "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" - fi - - if [ -z "$fromuser" ]; then - exit 1 - fi - # Retry - TF_CUDNN_VERSION="" - CUDNN_INSTALL_PATH="" -done - -export TF_CUDNN_VERSION -write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION" - -# Configure the compute capabilities that TensorFlow builds for. -# Since Cuda toolkit is not backward-compatible, this is not guaranteed to work. -function get_native_cuda_compute_capabilities { - device_query_bin="$CUDA_TOOLKIT_PATH/extras/demo_suite/deviceQuery" # Also works on Windows without .exe - "$device_query_bin" | grep 'Capability' | grep -o '[0-9]*\.[0-9]*' | sed ':a;{N;s/\n/,/};ba' - exit 0 # ensure that this function always exit success even if device detection fails, to prevent the whole configure from aborting -} -while true; do - fromuser="" - native_cuda_compute_capabilities=$(get_native_cuda_compute_capabilities) - default_cuda_compute_capabilities=${native_cuda_compute_capabilities:-"3.5,5.2"} - if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then -cat << EOF -Please specify a list of comma-separated Cuda compute capabilities you want to build with. -You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus. -Please note that each additional compute capability significantly increases your build time and binary size. 
-EOF - read -p "[Default is: \"$default_cuda_compute_capabilities\"]: " TF_CUDA_COMPUTE_CAPABILITIES - fromuser=1 - fi - if [ -z "$TF_CUDA_COMPUTE_CAPABILITIES" ]; then - TF_CUDA_COMPUTE_CAPABILITIES=$default_cuda_compute_capabilities - fi - # Check whether all capabilities from the input is valid - COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES//,/ } - ALL_VALID=1 - for CAPABILITY in $COMPUTE_CAPABILITIES; do - if [[ ! "$CAPABILITY" =~ [0-9]+.[0-9]+ ]]; then - echo "Invalid compute capability: " $CAPABILITY - ALL_VALID=0 - break - fi - done - if [ "$ALL_VALID" == "0" ]; then - if [ -z "$fromuser" ]; then - exit 1 - fi - else - export TF_CUDA_COMPUTE_CAPABILITIES - write_action_env_to_bazelrc "TF_CUDA_COMPUTE_CAPABILITIES" "$TF_CUDA_COMPUTE_CAPABILITIES" - break - fi - TF_CUDA_COMPUTE_CAPABILITIES="" -done - -if is_windows; then - # The following three variables are needed for MSVC toolchain configuration in Bazel - export CUDA_PATH="$CUDA_TOOLKIT_PATH" - export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES" - export NO_WHOLE_ARCHIVE_OPTION=1 - write_action_env_to_bazelrc "CUDA_PATH" "$CUDA_PATH" - write_action_env_to_bazelrc "CUDA_COMPUTE_CAPABILITIES" "$CUDA_COMPUTE_CAPABILITIES" - write_action_env_to_bazelrc "NO_WHOLE_ARCHIVE_OPTION" "1" - write_to_bazelrc "build --config=win-cuda" - write_to_bazelrc "test --config=win-cuda" -else - # If CUDA is enabled, always use GPU during build and test. - if [ "$TF_CUDA_CLANG" == "1" ]; then - write_to_bazelrc "build --config=cuda_clang" - write_to_bazelrc "test --config=cuda_clang" - else - write_to_bazelrc "build --config=cuda" - write_to_bazelrc "test --config=cuda" - fi -fi - -# end of if "$TF_NEED_CUDA" == "1" -fi - -# OpenCL configuration - -if [ "$TF_NEED_OPENCL" == "1" ]; then - -# Determine which C++ compiler should be used as the host compiler -while true; do - fromuser="" - if [ -z "$HOST_CXX_COMPILER" ]; then - default_cxx_host_compiler=$(which g++ || true) - read -p "Please specify which C++ compiler should be used as the host C++ compiler. [Default is $default_cxx_host_compiler]: " HOST_CXX_COMPILER - fromuser="1" - if [ -z "$HOST_CXX_COMPILER" ]; then - HOST_CXX_COMPILER=$default_cxx_host_compiler - fi - fi - if [ -e "$HOST_CXX_COMPILER" ]; then - export HOST_CXX_COMPILER - write_action_env_to_bazelrc "HOST_CXX_COMPILER" "$HOST_CXX_COMPILER" - break - fi - echo "Invalid C++ compiler path. ${HOST_CXX_COMPILER} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - HOST_CXX_COMPILER="" - # Retry -done - -# Determine which C compiler should be used as the host compiler -while true; do - fromuser="" - if [ -z "$HOST_C_COMPILER" ]; then - default_c_host_compiler=$(which gcc || true) - read -p "Please specify which C compiler should be used as the host C compiler. [Default is $default_c_host_compiler]: " HOST_C_COMPILER - fromuser="1" - if [ -z "$HOST_C_COMPILER" ]; then - HOST_C_COMPILER=$default_c_host_compiler - fi - fi - if [ -e "$HOST_C_COMPILER" ]; then - export HOST_C_COMPILER - write_action_env_to_bazelrc "HOST_C_COMPILER" "$HOST_C_COMPILER" - break - fi - echo "Invalid C compiler path. ${HOST_C_COMPILER} cannot be found" 1>&2 - if [ -z "$fromuser" ]; then - exit 1 - fi - HOST_C_COMPILER="" - # Retry -done - -while true; do - # Configure the OPENCL version to use. 
- TF_OPENCL_VERSION="1.2" - - # Point to ComputeCpp root - if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then - default_computecpp_toolkit_path=/usr/local/computecpp - read -p "Please specify the location where ComputeCpp for SYCL $TF_OPENCL_VERSION is installed. [Default is $default_computecpp_toolkit_path]: " COMPUTECPP_TOOLKIT_PATH - fromuser="1" - if [ -z "$COMPUTECPP_TOOLKIT_PATH" ]; then - COMPUTECPP_TOOLKIT_PATH=$default_computecpp_toolkit_path - fi - fi - - if is_linux; then - SYCL_RT_LIB_PATH="lib/libComputeCpp.so" - fi - - if [ -e "${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH}" ]; then - export COMPUTECPP_TOOLKIT_PATH - write_action_env_to_bazelrc "COMPUTECPP_TOOLKIT_PATH" "$COMPUTECPP_TOOLKIT_PATH" - break - fi - echo "Invalid SYCL $TF_OPENCL_VERSION library path. ${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH} cannot be found" - - if [ -z "$fromuser" ]; then - exit 1 - fi - # Retry - TF_OPENCL_VERSION="" - COMPUTECPP_TOOLKIT_PATH="" -done - -# end of if "$TF_NEED_OPENCL" == "1" -fi - - -while [ "$TF_NEED_MPI" == "" ]; do - read -p "Do you wish to build TensorFlow with "\ -"MPI support? [y/N] " INPUT - case $INPUT in - [Yy]* ) echo "MPI support will be enabled for "\ -"TensorFlow"; TF_NEED_MPI=1;; - [Nn]* ) echo "MPI support will not be enabled for "\ -"TensorFlow"; TF_NEED_MPI=0;; - "" ) echo "MPI support will not be enabled for "\ -"TensorFlow"; TF_NEED_MPI=0;; - * ) echo "Invalid selection: " $INPUT;; - esac -done - -# Find out where the MPI toolkit is installed -while true; do - if [ "$TF_NEED_MPI" == "0" ]; then - break; - fi - - fromuser="" - if [ -z "$MPI_HOME" ]; then - #Get the base folder by removing the bin path - default_mpi_path=$(dirname $(dirname $(which mpirun)) || dirname $(dirname $(which mpiexec)) || true) - read -p "Please specify the MPI toolkit folder. [Default is $default_mpi_path]: " MPI_HOME - fromuser="1" - if [ -z "$MPI_HOME" ]; then - MPI_HOME=$default_mpi_path - fi - fi - - #Check that the include and library folders are where we expect them to be - if [ -e "$MPI_HOME/include" ] && [ -e "$MPI_HOME/lib" ]; then - break - fi - - echo "Invalid path to the MPI Toolkit. ${MPI_HOME}/include or ${MPI_HOME}/lib cannot be found." - if [ -z "$fromuser" ]; then - exit 1 - fi - - # Retry - MPI_HOME="" -done - - -if [ "$TF_NEED_MPI" == "1" ]; then - write_to_bazelrc 'build --define with_mpi_support=true' - - #Link the MPI header files - ln -sf "${MPI_HOME}/include/mpi.h" third_party/mpi/mpi.h - - - #Determine if we use OpenMPI or MVAPICH, these require different header files - #to be included here to make bazel dependency checker happy - - if [ -e "${MPI_HOME}/include/mpi_portable_platform.h" ]; then - #OpenMPI - ln -sf "${MPI_HOME}/include/mpi_portable_platform.h" third_party/mpi/ - sed -i -e "s/MPI_LIB_IS_OPENMPI=False/MPI_LIB_IS_OPENMPI=True/" third_party/mpi/mpi.bzl - else - #MVAPICH / MPICH - ln -sf "${MPI_HOME}/include/mpio.h" third_party/mpi/ - ln -sf "${MPI_HOME}/include/mpicxx.h" third_party/mpi/ - sed -i -e "s/MPI_LIB_IS_OPENMPI=True/MPI_LIB_IS_OPENMPI=False/" third_party/mpi/mpi.bzl - fi - - - if [ -e "${MPI_HOME}/lib/libmpi.so" ]; then - ln -sf "${MPI_HOME}/lib/libmpi.so" third_party/mpi/ - else - echo "Cannot find the MPI library file in ${MPI_HOME}/lib " - exit 1 - fi -fi - - -echo "Configuration finished" +echo "Configuration finished" \ No newline at end of file diff --git a/configure.py b/configure.py new file mode 100644 index 00000000000..fac00d1b74b --- /dev/null +++ b/configure.py @@ -0,0 +1,950 @@ +# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""configure script to get build parameters from user.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import platform +import re +import site +import subprocess +import sys + +_TF_BAZELRC = '.tf_configure.bazelrc' +_DEFAULT_CUDA_VERSION = '8.0' +_DEFAULT_CUDNN_VERSION = '6' +_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2' +_DEFAULT_CUDA_PATH = '/usr/local/cuda' +_DEFAULT_CUDA_PATH_LINUX = '/opt/cuda' +_DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing ' + 'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION) +_TF_OPENCL_VERSION = '1.2' +_DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp' + + +def is_windows(): + return platform.system() == 'Windows' + + +def is_linux(): + return platform.system() == 'Linux' + + +def is_macos(): + return platform.system() == 'Darwin' + + +def is_ppc64le(): + return platform.machine() == 'ppc64le' + + +def get_input(question): + try: + try: + answer = raw_input(question) + except NameError: + answer = input(question) # pylint: disable=bad-builtin + except EOFError: + answer = '' + return answer + + +def symlink_force(target, link_name): + """Force symlink, equivalent of 'ln -sf'. + + Args: + target: items to link to. + link_name: name of the link. + """ + try: + os.symlink(target, link_name) + except OSError as e: + if e.errno == errno.EEXIST: + os.remove(link_name) + os.symlink(target, link_name) + else: + raise e + + +def sed_in_place(filename, old, new): + """Replace old string with new string in file. + + Args: + filename: string for filename. + old: string to replace. + new: new string to replace to. + """ + with open(filename, 'r') as f: + filedata = f.read() + newdata = filedata.replace(old, new) + with open(filename, 'w') as f: + f.write(newdata) + + +def remove_line_with(filename, token): + """Remove lines that contain token from file. + + Args: + filename: string for filename. + token: string token to check if to remove a line from file or not. 
+ """ + with open(filename, 'r') as f: + filedata = f.read() + + with open(filename, 'w') as f: + for line in filedata.strip().split('\n'): + if token not in line: + f.write(line + '\n') + + +def write_to_bazelrc(line): + with open(_TF_BAZELRC, 'a') as f: + f.write(line + '\n') + + +def write_action_env_to_bazelrc(var_name, var): + write_to_bazelrc('build --action_env %s="%s"' % (var_name, str(var))) + + +def run_shell(cmd): + return subprocess.check_output(cmd, shell=True).decode('UTF-8').strip() + + +def cygpath(path): + """Convert path from posix to windows.""" + return run_shell('cygpath -m "%s"' % path) + + +def get_python_path(environ_cp): + """Get the python site package paths.""" + python_paths = [] + if environ_cp.get('PYTHONPATH'): + python_paths = environ_cp.get('PYTHONPATH').split(':') + try: + library_paths = site.getsitepackages() + except AttributeError: + from distutils.sysconfig import get_python_lib # pylint: disable=g-import-not-at-top + library_paths = [get_python_lib()] + all_paths = set(python_paths + library_paths) + + paths = [] + for path in all_paths: + if os.path.isdir(path): + paths.append(path) + return paths + + +def setup_python(environ_cp): + """Setup python related env variables.""" + # Get PYTHON_BIN_PATH, default is the current running python. + default_python_bin_path = sys.executable + ask_python_bin_path = ('Please specify the location of python. [Default is ' + '%s]: ') % default_python_bin_path + while True: + python_bin_path = get_from_env_or_user_or_default( + environ_cp, 'PYTHON_BIN_PATH', ask_python_bin_path, + default_python_bin_path) + # Check if the path is valid + if (os.path.isfile(python_bin_path) and os.access( + python_bin_path, os.X_OK)) or (os.path.isdir(python_bin_path)): + break + elif not os.path.exists(python_bin_path): + print('Invalid python path: %s cannot be found.' % python_bin_path) + else: + print('%s is not executable. Is it the python binary?' % python_bin_path) + environ_cp['PYTHON_BIN_PATH'] = '' + + # Get PYTHON_LIB_PATH + python_lib_path = environ_cp.get('PYTHON_LIB_PATH') + if not python_lib_path: + python_lib_paths = get_python_path(environ_cp) + if environ_cp.get('USE_DEFAULT_PYTHON_LIB_PATH') == '1': + environ_cp['PYTHON_LIB_PATH'] = python_lib_paths[0] + else: + print('Found possible Python library paths:\n%s' % + '\n'.join(python_lib_paths)) + default_python_lib_path = python_lib_paths[0] + python_lib_path = get_input( + 'Please input the desired Python library path to use. 
Default is %s' + % python_lib_paths[0]) + if not python_lib_path: + python_lib_path = default_python_lib_path + environ_cp['PYTHON_LIB_PATH'] = python_lib_path + + python_major_version = sys.version_info[0] + # Convert python path to Windows style before writing into bazel.rc + if is_windows(): + python_bin_path = cygpath(python_bin_path) + python_lib_path = cygpath(python_lib_path) + + # Set-up env variables used by python_configure.bzl + write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path) + write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path) + write_to_bazelrc('build --define PYTHON_BIN_PATH="%s"' % python_bin_path) + write_to_bazelrc('build --define PYTHON_LIB_PATH="%s"' % python_lib_path) + write_to_bazelrc('build --force_python=py%s' % python_major_version) + write_to_bazelrc('build --host_force_python=py%s' % python_major_version) + write_to_bazelrc('build --python%s_path=\"%s"' % (python_major_version, + python_bin_path)) + write_to_bazelrc('test --force_python=py%s' % python_major_version) + write_to_bazelrc('test --host_force_python=py%s' % python_major_version) + write_to_bazelrc('test --define PYTHON_BIN_PATH="%s"' % python_bin_path) + write_to_bazelrc('test --define PYTHON_LIB_PATH="%s"' % python_lib_path) + write_to_bazelrc('run --define PYTHON_BIN_PATH="%s"' % python_bin_path) + write_to_bazelrc('run --define PYTHON_LIB_PATH="%s"' % python_lib_path) + environ_cp['PYTHON_BIN_PATH'] = python_bin_path + + # Write tools/python_bin_path.sh + with open('tools/python_bin_path.sh', 'w') as f: + f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path) + + +def reset_tf_configure_bazelrc(): + """Reset file that contains customized config settings.""" + open(_TF_BAZELRC, 'w').close() + + home = os.path.expanduser('~') + if not os.path.exists('.bazelrc'): + if os.path.exists(os.path.join(home, '.bazelrc')): + with open('.bazelrc', 'a') as f: + f.write('import %s/.bazelrc\n' % home) + else: + open('.bazelrc', 'w').close() + + remove_line_with('.bazelrc', 'tf_configure') + with open('.bazelrc', 'a') as f: + f.write('import %workspace%/.tf_configure.bazelrc\n') + + +def run_gen_git_source(environ_cp): + """Run the gen_git_source to create links. + + The links are for bazel to track dependencies for git hash propagation. + + Args: + environ_cp: copy of the os.environ. + """ + cmd = '%s tensorflow/tools/git/gen_git_source.py --configure %s' % ( + environ_cp.get('PYTHON_BIN_PATH'), os.getcwd()) + os.system(cmd) + + +def cleanup_makefile(): + """Delete any leftover BUILD files from the Makefile build. + + These files could interfere with Bazel parsing. + """ + makefile_download_dir = 'tensorflow/contrib/makefile/downloads' + if os.path.isdir(makefile_download_dir): + for root, _, filenames in os.walk(makefile_download_dir): + for f in filenames: + if f.endswith('BUILD'): + os.remove(os.path.join(root, f)) + + +def get_var(environ_cp, + var_name, + query_item, + enabled_by_default, + question=None, + yes_reply=None, + no_reply=None): + """Get boolean input from user. + + If var_name is not set in env, ask user to enable query_item or not. If the + response is empty, use the default. + + Args: + environ_cp: copy of the os.environ. + var_name: string for name of environment variable, e.g. "TF_NEED_HDFS". + query_item: string for feature related to the variable, e.g. "Hadoop File + System". + enabled_by_default: boolean for default behavior. + question: optional string for how to ask for user input. + yes_reply: optionanl string for reply when feature is enabled. 
+ no_reply: optional string for reply when feature is disabled. + + Returns: + boolean value of the variable. + """ + if not question: + question = 'Do you wish to build TensorFlow with %s support?' % query_item + if not yes_reply: + yes_reply = '%s support will be enabled for TensorFlow.' % query_item + if not no_reply: + no_reply = 'No %s' % yes_reply + + yes_reply += '\n' + no_reply += '\n' + + if enabled_by_default: + question += ' [Y/n]: ' + else: + question += ' [y/N]: ' + + var = environ_cp.get(var_name) + while var is None: + user_input_origin = get_input(question) + user_input = user_input_origin.strip().lower() + if user_input == 'y': + print(yes_reply) + var = True + elif user_input == 'n': + print(no_reply) + var = False + elif not user_input: + if enabled_by_default: + print(yes_reply) + var = True + else: + print(no_reply) + var = False + else: + print('Invalid selection: %s' % user_input_origin) + return var + + +def set_build_var(environ_cp, var_name, query_item, option_name, + enabled_by_default): + """Set if query_item will be enabled for the build. + + Ask user if query_item will be enabled. Default is used if no input is given. + Set subprocess environment variable and write to .bazelrc if enabled. + + Args: + environ_cp: copy of the os.environ. + var_name: string for name of environment variable, e.g. "TF_NEED_HDFS". + query_item: string for feature related to the variable, e.g. "Hadoop File + System". + option_name: string for option to define in .bazelrc. + enabled_by_default: boolean for default behavior. + """ + + var = str(int(get_var(environ_cp, var_name, query_item, enabled_by_default))) + environ_cp[var_name] = var + if var == '1': + write_to_bazelrc('build --define %s=true' % option_name) + + +def set_action_env_var(environ_cp, + var_name, + query_item, + enabled_by_default, + question=None, + yes_reply=None, + no_reply=None): + """Set boolean action_env variable. + + Ask user if query_item will be enabled. Default is used if no input is given. + Set environment variable and write to .bazelrc. + + Args: + environ_cp: copy of the os.environ. + var_name: string for name of environment variable, e.g. "TF_NEED_HDFS". + query_item: string for feature related to the variable, e.g. "Hadoop File + System". + enabled_by_default: boolean for default behavior. + question: optional string for how to ask for user input. + yes_reply: optionanl string for reply when feature is enabled. + no_reply: optional string for reply when feature is disabled. + """ + var = int( + get_var(environ_cp, var_name, query_item, enabled_by_default, question, + yes_reply, no_reply)) + + write_action_env_to_bazelrc(var_name, var) + environ_cp[var_name] = str(var) + + +def check_bazel_version(min_version): + """Check installed bezel version is at least min_version. + + Args: + min_version: string for minimum bazel version. + """ + try: + curr_version = run_shell('bazel version') + except subprocess.CalledProcessError: + print('Cannot find bazel. Please install bazel.') + sys.exit(0) + + for line in curr_version.split('\n'): + if 'Build label: ' in line: + curr_version = line.split('Build label: ')[1] + break + + min_version_segments = min_version.split('.') + curr_version_segments = curr_version.split('.') + + # Check if current bazel version can be detected properly. 
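+  # If any segment is not purely numeric (e.g. a locally built, non-release
+  # bazel), warn and skip the numeric comparison below.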
+ for seg in curr_version_segments: + if not seg.isdigit(): + print('WARNING: current bazel installation is not a release version.') + print('Make sure you are running at least bazel %s' % min_version) + return + + min_version_str = ''.join(['%03d' % int(seg) for seg in min_version_segments]) + curr_version_str = ''.join( + ['%03d' % int(seg) for seg in curr_version_segments]) + if int(curr_version_str) < int(min_version_str): + print('Please upgrade your bazel installation to version %s or higher to ' + 'build TensorFlow!' % min_version) + sys.exit(0) + + +def set_cc_opt_flags(environ_cp): + """Set up architecture-dependent optimization flags. + + Also append CC optimization flags to bazel.rc.. + + Args: + environ_cp: copy of the os.environ. + """ + if is_ppc64le(): + # gcc on ppc64le does not support -march, use mcpu instead + default_cc_opt_flags = '-mcpu=native' + else: + default_cc_opt_flags = '-march=native' + question = ('Please specify optimization flags to use during compilation when' + ' bazel option "--config=opt" is specified [Default is %s]: ' + ) % default_cc_opt_flags + cc_opt_flags = get_from_env_or_user_or_default(environ_cp, 'CC_OPT_FLAGS', + question, default_cc_opt_flags) + for opt in cc_opt_flags.split(): + write_to_bazelrc('build:opt --cxxopt=%s --copt=%s' % (opt, opt)) + + +def set_tf_cuda_clang(environ_cp): + """set TF_CUDA_CLANG action_env. + + Args: + environ_cp: copy of the os.environ. + """ + question = 'Do you want to use clang as CUDA compiler?' + yes_reply = 'Clang will be used as CUDA compiler.' + no_reply = 'nvcc will be used as CUDA compiler.' + set_action_env_var( + environ_cp, + 'TF_CUDA_CLANG', + None, + False, + question=question, + yes_reply=yes_reply, + no_reply=no_reply) + + +def get_from_env_or_user_or_default(environ_cp, var_name, ask_for_var, + var_default): + """Get var_name either from env, or user or default. + + If var_name has been set as environment variable, use the preset value, else + ask for user input. If no input is provided, the default is used. + + Args: + environ_cp: copy of the os.environ. + var_name: string for name of environment variable, e.g. "TF_NEED_HDFS". + ask_for_var: string for how to ask for user input. + var_default: default value string. + + Returns: + string value for var_name + """ + var = environ_cp.get(var_name) + if not var: + var = get_input(ask_for_var) + if not var: + var = var_default + return var + + +def set_clang_cuda_compiler_path(environ_cp): + """Set CLANG_CUDA_COMPILER_PATH.""" + default_clang_path = run_shell('which clang || true') + ask_clang_path = ('Please specify which clang should be used as device and ' + 'host compiler. [Default is %s]: ') % default_clang_path + + while True: + clang_cuda_compiler_path = get_from_env_or_user_or_default( + environ_cp, 'CLANG_CUDA_COMPILER_PATH', ask_clang_path, + default_clang_path) + if os.path.exists(clang_cuda_compiler_path): + break + + # Reset and retry + print('Invalid clang path: %s cannot be found.' 
% clang_cuda_compiler_path) + environ_cp['CLANG_CUDA_COMPILER_PATH'] = '' + + # Set CLANG_CUDA_COMPILER_PATH + environ_cp['CLANG_CUDA_COMPILER_PATH'] = clang_cuda_compiler_path + write_action_env_to_bazelrc('CLANG_CUDA_COMPILER_PATH', + clang_cuda_compiler_path) + + +def set_gcc_host_compiler_path(environ_cp): + """Set GCC_HOST_COMPILER_PATH.""" + default_gcc_host_compiler_path = run_shell('which gcc || true') + cuda_bin_symlink = '%s/bin/gcc' % environ_cp.get('CUDA_TOOLKIT_PATH') + + if os.path.islink(cuda_bin_symlink): + # os.readlink is only available in linux + default_gcc_host_compiler_path = run_shell('readlink %s' % cuda_bin_symlink) + + ask_gcc_path = ( + 'Please specify which gcc should be used by nvcc as the ' + 'host compiler. [Default is %s]: ') % default_gcc_host_compiler_path + while True: + gcc_host_compiler_path = get_from_env_or_user_or_default( + environ_cp, 'GCC_HOST_COMPILER_PATH', ask_gcc_path, + default_gcc_host_compiler_path) + + if os.path.exists(gcc_host_compiler_path): + break + + # Reset and retry + print('Invalid gcc path. %s cannot be found' % gcc_host_compiler_path) + environ_cp['GCC_HOST_COMPILER_PATH'] = '' + + # Set GCC_HOST_COMPILER_PATH + environ_cp['GCC_HOST_COMPILER_PATH'] = gcc_host_compiler_path + write_action_env_to_bazelrc('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path) + + +def set_tf_cuda_version(environ_cp): + """Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION.""" + ask_cuda_version = ( + 'Please specify the CUDA SDK version you want to use, ' + 'e.g. 7.0. [Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION + + while True: + # Configure the Cuda SDK version to use. + tf_cuda_version = get_from_env_or_user_or_default( + environ_cp, 'TF_CUDA_VERSION', ask_cuda_version, _DEFAULT_CUDA_VERSION) + + # Find out where the CUDA toolkit is installed + default_cuda_path = _DEFAULT_CUDA_PATH + if is_windows(): + default_cuda_path = cygpath( + environ_cp.get('CUDA_PATH', _DEFAULT_CUDA_PATH_WIN)) + elif is_linux(): + # If the default doesn't exist, try an alternative default. + if (not os.path.exists(default_cuda_path) + ) and os.path.exists(_DEFAULT_CUDA_PATH_LINUX): + default_cuda_path = _DEFAULT_CUDA_PATH_LINUX + ask_cuda_path = ('Please specify the location where CUDA %s toolkit is' + ' installed. Refer to README.md for more details. ' + '[Default is %s]: ') % (tf_cuda_version, default_cuda_path) + cuda_toolkit_path = get_from_env_or_user_or_default( + environ_cp, 'CUDA_TOOLKIT_PATH', ask_cuda_path, default_cuda_path) + + if is_windows(): + cuda_rt_lib_path = 'lib/x64/cudart.lib' + elif is_linux(): + cuda_rt_lib_path = 'lib64/libcudart.so.%s' % tf_cuda_version + elif is_macos(): + cuda_rt_lib_path = 'lib/libcudart.%s.dylib' % tf_cuda_version + + cuda_toolkit_path_full = os.path.join(cuda_toolkit_path, cuda_rt_lib_path) + if os.path.exists(cuda_toolkit_path_full): + break + + # Reset and retry + print('Invalid path to CUDA %s toolkit. %s cannot be found' % + (tf_cuda_version, cuda_toolkit_path_full)) + environ_cp['TF_CUDA_VERSION'] = '' + environ_cp['CUDA_TOOLKIT_PATH'] = '' + + # Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION + environ_cp['CUDA_TOOLKIT_PATH'] = cuda_toolkit_path + write_action_env_to_bazelrc('CUDA_TOOLKIT_PATH', cuda_toolkit_path) + environ_cp['TF_CUDA_VERSION'] = tf_cuda_version + write_action_env_to_bazelrc('TF_CUDA_VERSION', tf_cuda_version) + + +def set_tf_cunn_version(environ_cp): + """Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION.""" + ask_cudnn_version = ( + '"Please specify the cuDNN version you want to use. 
' + '[Leave empty to default to cuDNN %s.0]: ') % _DEFAULT_CUDNN_VERSION + + while True: + tf_cudnn_version = get_from_env_or_user_or_default( + environ_cp, 'TF_CUDNN_VERSION', ask_cudnn_version, + _DEFAULT_CUDNN_VERSION) + + default_cudnn_path = environ_cp.get('CUDA_TOOLKIT_PATH') + ask_cudnn_path = (r'Please specify the location where cuDNN %s library is ' + 'installed. Refer to README.md for more details. [Default' + ' is %s]:') % (tf_cudnn_version, default_cudnn_path) + cudnn_install_path = get_from_env_or_user_or_default( + environ_cp, 'CUDNN_INSTALL_PATH', ask_cudnn_path, default_cudnn_path) + + # Result returned from "read" will be used unexpanded. That make "~" + # unusable. Going through one more level of expansion to handle that. + cudnn_install_path = os.path.realpath( + os.path.expanduser(cudnn_install_path)) + if is_windows(): + cudnn_install_path = cygpath(cudnn_install_path) + + if is_windows(): + cuda_dnn_lib_path = 'lib/x64/cudnn.lib' + cuda_dnn_lib_alt_path = 'lib/x64/cudnn.lib' + elif is_linux(): + cuda_dnn_lib_path = 'lib64/libcudnn.so.%s' % tf_cudnn_version + cuda_dnn_lib_alt_path = 'libcudnn.so.%s' % tf_cudnn_version + elif is_macos(): + cuda_dnn_lib_path = 'lib/libcudnn.%s.dylib' % tf_cudnn_version + cuda_dnn_lib_alt_path = 'libcudnn.%s.dylib' % tf_cudnn_version + + cuda_dnn_lib_path_full = os.path.join(cudnn_install_path, cuda_dnn_lib_path) + cuda_dnn_lib_alt_path_full = os.path.join(cudnn_install_path, + cuda_dnn_lib_alt_path) + if os.path.exists(cuda_dnn_lib_path_full) or os.path.exists( + cuda_dnn_lib_alt_path_full): + break + + # Try another alternative for Linux + if is_linux(): + if subprocess.call(['which', 'ldconfig']): + ldconfig_bin = '/sbin/ldconfig' + else: + ldconfig_bin = 'ldconfig' + cudnn_path_from_ldconfig = run_shell( + r'%s -p | sed -n "s/.*libcudnn.so .* => \(.*\)/\\1/p"' % ldconfig_bin) + if os.path.exists('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)): + cudnn_install_path = os.path.dirname(cudnn_path_from_ldconfig) + break + + # Reset and Retry + print( + 'Invalid path to cuDNN %s toolkit. None of the following files can be ' + 'found:' % tf_cudnn_version) + print(cuda_dnn_lib_path_full) + print(cuda_dnn_lib_alt_path_full) + if is_linux(): + print('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version)) + + environ_cp['TF_CUDNN_VERSION'] = '' + + # Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION + environ_cp['CUDNN_INSTALL_PATH'] = cudnn_install_path + write_action_env_to_bazelrc('CUDNN_INSTALL_PATH', cudnn_install_path) + environ_cp['TF_CUDNN_VERSION'] = tf_cudnn_version + write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version) + + +def get_native_cuda_compute_capabilities(environ_cp): + """Get native cuda compute capabilities. + + Args: + environ_cp: copy of the os.environ. + Returns: + string of native cuda compute capabilities, separated by comma. 
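+    An empty string is returned if deviceQuery is unavailable or fails, in
+    which case the caller falls back to the default compute capabilities.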
+ """ + device_query_bin = os.path.join( + environ_cp.get('CUDA_TOOLKIT_PATH'), 'extras/demo_suite/deviceQuery') + cmd = (r'"%s" | grep "Capability" | grep -o "[0-9]*\.[0-9]*" | sed ' + '":a;{N;s/\\n/,/};ba"') % device_query_bin + try: + output = run_shell(cmd) + except subprocess.CalledProcessError: + output = '' + return output + + +def set_tf_cuda_compute_capabilities(environ_cp): + """Set TF_CUDA_COMPUTE_CAPABILITIES.""" + while True: + native_cuda_compute_capabilities = get_native_cuda_compute_capabilities( + environ_cp) + if not native_cuda_compute_capabilities: + default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES + else: + default_cuda_compute_capabilities = native_cuda_compute_capabilities + + ask_cuda_compute_capabilities = ( + 'Please specify a list of comma-separated ' + 'Cuda compute capabilities you want to ' + 'build with.\nYou can find the compute ' + 'capability of your device at: ' + 'https://developer.nvidia.com/cuda-gpus.\nPlease' + ' note that each additional compute ' + 'capability significantly increases your ' + 'build time and binary size. [Default is: %s]' % + default_cuda_compute_capabilities) + tf_cuda_compute_capabilities = get_from_env_or_user_or_default( + environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES', + ask_cuda_compute_capabilities, default_cuda_compute_capabilities) + # Check whether all capabilities from the input is valid + all_valid = True + for compute_capability in tf_cuda_compute_capabilities.split(','): + if not re.match('[0-9]+.[0-9]+', compute_capability): + print('Invalid compute capability: ' % compute_capability) + all_valid = False + + if all_valid: + break + + # Reset and Retry + environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = '' + + # Set TF_CUDA_COMPUTE_CAPABILITIES + environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = tf_cuda_compute_capabilities + write_action_env_to_bazelrc('TF_CUDA_COMPUTE_CAPABILITIES', + tf_cuda_compute_capabilities) + + +def set_other_cuda_vars(environ_cp): + """Set other CUDA related variables.""" + if is_windows(): + # The following three variables are needed for MSVC toolchain configuration + # in Bazel + environ_cp['CUDA_PATH'] = environ_cp.get('CUDA_TOOLKIT_PATH') + environ_cp['CUDA_COMPUTE_CAPABILITIES'] = environ_cp.get( + 'TF_CUDA_COMPUTE_CAPABILITIES') + environ_cp['NO_WHOLE_ARCHIVE_OPTION'] = 1 + write_action_env_to_bazelrc('CUDA_PATH', environ_cp.get('CUDA_PATH')) + write_action_env_to_bazelrc('CUDA_COMPUTE_CAPABILITIE', + environ_cp.get('CUDA_COMPUTE_CAPABILITIE')) + write_action_env_to_bazelrc('NO_WHOLE_ARCHIVE_OPTION', + environ_cp.get('NO_WHOLE_ARCHIVE_OPTION')) + write_to_bazelrc('build --config=win-cuda') + write_to_bazelrc('test --config=win-cuda') + else: + # If CUDA is enabled, always use GPU during build and test. + if environ_cp.get('TF_CUDA_CLANG') == '1': + write_to_bazelrc('build --config=cuda_clang') + write_to_bazelrc('test --config=cuda_clang') + else: + write_to_bazelrc('build --config=cuda') + write_to_bazelrc('test --config=cuda') + + +def set_host_cxx_compiler(environ_cp): + """Set HOST_CXX_COMPILER.""" + default_cxx_host_compiler = run_shell('which g++ || true') + ask_cxx_host_compiler = ( + 'Please specify which C++ compiler should be used as' + ' the host C++ compiler. [Default is %s]: ') % default_cxx_host_compiler + + while True: + host_cxx_compiler = get_from_env_or_user_or_default( + environ_cp, 'HOST_CXX_COMPILER', ask_cxx_host_compiler, + default_cxx_host_compiler) + if os.path.exists(host_cxx_compiler): + break + + # Reset and retry + print('Invalid C++ compiler path. 
%s cannot be found' % host_cxx_compiler) + environ_cp['HOST_CXX_COMPILER'] = '' + + # Set HOST_CXX_COMPILER + environ_cp['HOST_CXX_COMPILER'] = host_cxx_compiler + write_action_env_to_bazelrc('HOST_CXX_COMPILER', host_cxx_compiler) + + +def set_host_c_compiler(environ_cp): + """Set HOST_C_COMPILER.""" + default_c_host_compiler = run_shell('which gcc || true') + ask_c_host_compiler = ( + 'Please specify which C compiler should be used as the' + ' host C compiler. [Default is %s]: ') % default_c_host_compiler + + while True: + host_c_compiler = get_from_env_or_user_or_default( + environ_cp, 'HOST_C_COMPILER', ask_c_host_compiler, + default_c_host_compiler) + if os.path.exists(host_c_compiler): + break + + # Reset and retry + print('Invalid C compiler path. %s cannot be found' % host_c_compiler) + environ_cp['HOST_C_COMPILER'] = '' + + # Set HOST_C_COMPILER + environ_cp['HOST_C_COMPILER'] = host_c_compiler + write_action_env_to_bazelrc('HOST_C_COMPILER', host_c_compiler) + + +def set_computecpp_toolkit_path(environ_cp): + """Set COMPUTECPP_TOOLKIT_PATH.""" + ask_computecpp_toolkit_path = ('Please specify the location where ComputeCpp ' + 'for SYCL %s is installed. [Default is %s]: ' + ) % (_TF_OPENCL_VERSION, + _DEFAULT_COMPUTECPP_TOOLKIT_PATH) + + while True: + computecpp_toolkit_path = get_from_env_or_user_or_default( + environ_cp, 'COMPUTECPP_TOOLKIT_PATH', ask_computecpp_toolkit_path, + _DEFAULT_COMPUTECPP_TOOLKIT_PATH) + if is_linux(): + sycl_rt_lib_path = 'lib/libComputeCpp.so' + else: + sycl_rt_lib_path = '' + + sycl_rt_lib_path_full = os.path.join(computecpp_toolkit_path, + sycl_rt_lib_path) + if os.path.exists(sycl_rt_lib_path_full): + break + + print('Invalid SYCL %s library path. %s cannot be found' % + (_TF_OPENCL_VERSION, sycl_rt_lib_path_full)) + environ_cp['COMPUTECPP_TOOLKIT_PATH'] = '' + + # Set COMPUTECPP_TOOLKIT_PATH + environ_cp['COMPUTECPP_TOOLKIT_PATH'] = computecpp_toolkit_path + write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH', + computecpp_toolkit_path) + + +def set_mpi_home(environ_cp): + """Set MPI_HOME.""" + cmd = ('dirname $(dirname $(which mpirun)) || dirname $(dirname $(which ' + 'mpiexec)) || true') + default_mpi_home = run_shell(cmd) + ask_mpi_home = ('Please specify the MPI toolkit folder. [Default is %s]: ' + ) % default_mpi_home + while True: + mpi_home = get_from_env_or_user_or_default(environ_cp, 'MPI_HOME', + ask_mpi_home, default_mpi_home) + + if os.path.exists(os.path.join(mpi_home, 'include')) and os.path.exists( + os.path.join(mpi_home, 'lib')): + break + + print('Invalid path to the MPI Toolkit. 
%s or %s cannot be found' % + (os.path.join(mpi_home, 'include'), + os.path.exists(os.path.join(mpi_home, 'lib')))) + environ_cp['MPI_HOME'] = '' + + # Set MPI_HOME + environ_cp['MPI_HOME'] = str(mpi_home) + + +def set_other_mpi_vars(environ_cp): + """Set other MPI related variables.""" + # Link the MPI header files + mpi_home = environ_cp.get('MPI_HOME') + symlink_force('%s/include/mpi.h' % mpi_home, 'third_party/mpi/mpi.h') + + # Determine if we use OpenMPI or MVAPICH, these require different header files + # to be included here to make bazel dependency checker happy + if os.path.exists(os.path.join(mpi_home, 'include/mpi_portable_platform.h')): + symlink_force( + os.path.join(mpi_home, 'include/mpi_portable_platform.h'), + 'third_party/mpi/mpi_portable_platform.h') + # TODO(gunan): avoid editing files in configure + sed_in_place('third_party/mpi/mpi.bzl', 'MPI_LIB_IS_OPENMPI=False', + 'MPI_LIB_IS_OPENMPI=True') + else: + # MVAPICH / MPICH + symlink_force( + os.path.join(mpi_home, 'include/mpio.h'), 'third_party/mpi/mpio.h') + symlink_force( + os.path.join(mpi_home, 'include/mpicxx.h'), 'third_party/mpi/mpicxx.h') + # TODO(gunan): avoid editing files in configure + sed_in_place('third_party/mpi/mpi.bzl', 'MPI_LIB_IS_OPENMPI=True', + 'MPI_LIB_IS_OPENMPI=False') + + if os.path.exists(os.path.join(mpi_home, 'lib/libmpi.so')): + symlink_force( + os.path.join(mpi_home, 'lib/libmpi.so'), 'third_party/mpi/libmpi.so') + else: + raise ValueError('Cannot find the MPI library file in %s/lib' % mpi_home) + + +def set_mkl(): + write_to_bazelrc('build:mkl --define with_mkl_support=true') + write_to_bazelrc('build:mkl --define using_mkl=true') + write_to_bazelrc('build:mkl -c opt') + write_to_bazelrc('build:mkl --copt="-DEIGEN_USE_VML"') + print( + 'Add "--config=mkl" to your bazel command to build with MKL ' + 'support.\nPlease note that MKL on MacOS or windows is still not ' + 'supported.\nIf you would like to use a local MKL instead of ' + 'downloading, please set the environment variable \"TF_MKL_ROOT\" every ' + 'time before build.') + + +def main(): + # Make a copy of os.environ to be clear when functions and getting and setting + # environment variables. 
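+  # The helper functions below read and update this copy rather than
+  # os.environ itself.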
+ environ_cp = dict(os.environ) + + check_bazel_version('0.4.5') + + reset_tf_configure_bazelrc() + cleanup_makefile() + setup_python(environ_cp) + run_gen_git_source(environ_cp) + + if is_windows(): + environ_cp['TF_NEED_GCP'] = '0' + environ_cp['TF_NEED_HDFS'] = '0' + environ_cp['TF_NEED_JEMALLOC'] = '0' + environ_cp['TF_NEED_OPENCL'] = '0' + environ_cp['TF_CUDA_CLANG'] = '0' + + if is_macos(): + environ_cp['TF_NEED_JEMALLOC'] = '0' + + set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc', + 'with_jemalloc', True) + set_build_var(environ_cp, 'TF_NEED_GCP', 'Google Cloud Platform', + 'with_gcp_support', False) + set_build_var(environ_cp, 'TF_NEED_HDFS', 'Hadoop File System', + 'with_hdfs_support', False) + set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support', + False) + set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support', + False) + + set_action_env_var(environ_cp, 'TF_NEED_OPENCL', 'OpenCL', False) + if environ_cp.get('TF_NEED_OPENCL') == '1': + set_host_cxx_compiler(environ_cp) + set_host_c_compiler(environ_cp) + set_computecpp_toolkit_path(environ_cp) + + set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False) + if environ_cp.get('TF_NEED_CUDA') == '1': + set_tf_cuda_version(environ_cp) + set_tf_cunn_version(environ_cp) + set_tf_cuda_compute_capabilities(environ_cp) + + set_tf_cuda_clang(environ_cp) + if environ_cp.get('TF_CUDA_CLANG') == '1': + # Set up which clang we should use as the cuda / host compiler. + set_clang_cuda_compiler_path(environ_cp) + else: + # Set up which gcc nvcc should use as the host compiler + # No need to set this on Windows + if not is_windows(): + set_gcc_host_compiler_path(environ_cp) + set_other_cuda_vars(environ_cp) + + set_build_var(environ_cp, 'TF_NEED_MPI', 'MPI', 'with_mpi_support', False) + if environ_cp.get('TF_NEED_MPI') == '1': + set_mpi_home(environ_cp) + set_other_mpi_vars(environ_cp) + + set_cc_opt_flags(environ_cp) + set_mkl() + + +if __name__ == '__main__': + main() diff --git a/tensorflow/tools/ci_build/builds/configured b/tensorflow/tools/ci_build/builds/configured index 25cb51ea7cc..563e07e3afb 100755 --- a/tensorflow/tools/ci_build/builds/configured +++ b/tensorflow/tools/ci_build/builds/configured @@ -56,7 +56,7 @@ else fi pushd "${CI_TENSORFLOW_SUBMODULE_PATH:-.}" -yes "" | ./configure +$PYTHON_BIN_PATH configure.py popd # Gather and print build information diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh index f66846654d1..9a6890401b7 100755 --- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh +++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh @@ -120,7 +120,7 @@ else fi export TF_NEED_CUDA=$IS_GPU -yes "" | ./configure +${PYTHON_BIN_PATH} configure.py # Figure out how many concurrent tests we can run and do run the tests. BAZEL_PARALLEL_TEST_FLAGS="" diff --git a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh index 118e85fee0b..ca840796543 100755 --- a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh +++ b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh @@ -30,7 +30,7 @@ export TF_NEED_HDFS=0 export TF_NEED_CUDA=0 # Only running cc tests, python version does not matter. export PYTHON_BIN_PATH=`which python` -yes "" | ./configure +$PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. 
bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test --test_lang_filters=cc -k \ diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh index fa3d27fa41e..5c82c9efafa 100755 --- a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh +++ b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh @@ -29,7 +29,7 @@ export TF_NEED_GCP=0 export TF_NEED_HDFS=0 export TF_NEED_CUDA=0 export PYTHON_BIN_PATH=`which python2` -yes "" | ./configure +$PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \ diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh index 258dec4fec8..7155636a53f 100755 --- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh +++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh @@ -29,7 +29,7 @@ export TF_NEED_GCP=0 export TF_NEED_HDFS=0 export TF_NEED_CUDA=0 export PYTHON_BIN_PATH=`which python3` -yes "" | ./configure +$PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \ diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh index 9c450ab4dab..218d2a89913 100755 --- a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh +++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh @@ -29,7 +29,7 @@ export TF_NEED_GCP=0 export TF_NEED_HDFS=0 export TF_NEED_CUDA=0 export PYTHON_BIN_PATH=`which python3` -yes "" | ./configure +$PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \ diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh index f2ea8d3c773..dff72c25bf7 100755 --- a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh +++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh @@ -32,7 +32,7 @@ export PYTHON_BIN_PATH=`which python3` export TF_NEED_CUDA=1 export TF_CUDA_COMPUTE_CAPABILITIES=3.7 -yes "" | ./configure +$PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \ diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh index 4e0c3d1d333..a36a8445afd 100755 --- a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh +++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh @@ -32,7 +32,7 @@ export PYTHON_BIN_PATH=`which python3` export TF_NEED_CUDA=1 export TF_CUDA_COMPUTE_CAPABILITIES=3.7 -yes "" | ./configure +$PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. 
bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \ diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh index 0b8c73993f8..0ee894e2c44 100755 --- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh +++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh @@ -30,7 +30,7 @@ export TF_NEED_GCP=0 export TF_NEED_HDFS=0 export TF_NEED_CUDA=0 export PYTHON_BIN_PATH=$(which python2) -yes "" | ./configure +$PYTHON_BIN_PATH configure.py which bazel bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \ --test_timeout 300,450,1200,3600 \ diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh index 11064130713..f548adc5ca8 100755 --- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh +++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh @@ -33,7 +33,7 @@ export TF_NEED_CUDA=1 export TF_ENABLE_XLA=1 export TF_CUDA_COMPUTE_CAPABILITIES=3.7 -yes "" | ./configure +$PYTHON_BIN_PATH configure.py # Run bazel test command. Double test timeouts to avoid flakes. bazel test --config=cuda --test_tag_filters=-no_gpu,-benchmark-test -k \ From 57d17092d0e2bd6f169724beab28ec29c5e6db85 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 25 Jul 2017 13:39:32 -0700 Subject: [PATCH 36/56] [TF:XLA] Ignore control edges from Enter nodes to the graph sink during loop functionalization. PiperOrigin-RevId: 163115904 --- .../tf2xla/functionalize_control_flow.cc | 24 +++++++++++++++---- .../tf2xla/functionalize_control_flow_test.cc | 8 +++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index faa88ecfe2e..1c7a2046aa5 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -323,12 +323,26 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame, for (Arg& arg : frame->args) { if (!arg.is_loop_invariant) { // Follow the edge from the Enter to Merge. - if (arg.enter->out_edges().size() != 1) { - return errors::Internal("Enter node for loop-varying argument ", - arg.enter->name(), - " does not have exactly one successor"); + const Edge* enter_merge = nullptr; + for (const Edge* e : arg.enter->out_edges()) { + // Ignore control-edges to the sink node. These are allowed by the + // graph invariants, although probably they should have been stripped + // off earlier. 
+ if (e->IsControlEdge() && e->dst()->IsSink()) { + continue; + } + if (enter_merge != nullptr) { + return errors::Internal( + "Enter node for loop-varying argument ", arg.enter->name(), + " has multiple successors: ", enter_merge->dst()->name(), " and ", + e->dst()->name()); + } + enter_merge = e; + } + if (enter_merge == nullptr) { + return errors::Internal("Enter node for loop-varying argument ", + arg.enter->name(), " has zero successors"); } - const Edge* enter_merge = *arg.enter->out_edges().begin(); arg.merge = enter_merge->dst(); if (!IsMerge(arg.merge)) { return errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index 2fb1cc04543..914c8999a6f 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -96,6 +96,14 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { TF_EXPECT_OK(scope.ToGraph(&graph)); } + // Regression test: control edges from an Enter node to the graph sink should + // be ignored. + for (Node* n : graph.nodes()) { + if (n->name() == "while/Enter") { + graph.AddControlEdge(n, graph.sink_node()); + } + } + FunctionLibraryDefinition library(OpRegistry::Global(), {}); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); From 27bbbd8d0149724caa7f1295f122f968acb8cd96 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 14:14:53 -0700 Subject: [PATCH 37/56] Support customized residual function in the residual wrapper. PiperOrigin-RevId: 163121296 --- .../python/kernel_tests/core_rnn_cell_test.py | 29 +++++++++++++++++-- tensorflow/python/ops/rnn_cell_impl.py | 18 ++++++++---- ...orflow.nn.rnn_cell.-residual-wrapper.pbtxt | 2 +- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py index 06954f51d8e..c14463bdad2 100644 --- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py +++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py @@ -210,7 +210,7 @@ class RNNCellTest(test.TestCase): sess.run([variables_lib.global_variables_initializer()]) sess.run([g, out_m], {x.name: 1 * np.ones([batch_size, input_size]), - m.name: 0.1 * np.ones([batch_size - 1, state_size])}) + m.name: 0.1 * np.ones([batch_size - 1, state_size])}) def testBasicLSTMCellStateSizeError(self): """Tests that state_size must be num_units * 2.""" @@ -218,7 +218,7 @@ class RNNCellTest(test.TestCase): with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): num_units = 2 - state_size = num_units * 3 # state_size must be num_units * 2 + state_size = num_units * 3 # state_size must be num_units * 2 batch_size = 3 input_size = 4 x = array_ops.zeros([batch_size, input_size]) @@ -406,6 +406,31 @@ class RNNCellTest(test.TestCase): # States are left untouched self.assertAllClose(res[2], res[3]) + def testResidualWrapperWithSlice(self): + with self.test_session() as sess: + with variable_scope.variable_scope( + "root", initializer=init_ops.constant_initializer(0.5)): + x = array_ops.zeros([1, 5]) + m = array_ops.zeros([1, 3]) + base_cell = rnn_cell_impl.GRUCell(3) + g, m_new = base_cell(x, m) + variable_scope.get_variable_scope().reuse_variables() + def residual_with_slice_fn(inp, out): + inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3]) + return inp_sliced + out + g_res, m_new_res = 
rnn_cell_impl.ResidualWrapper( + base_cell, residual_with_slice_fn)(x, m) + sess.run([variables_lib.global_variables_initializer()]) + res_g, res_g_res, res_m_new, res_m_new_res = sess.run( + [g, g_res, m_new, m_new_res], { + x: np.array([[1., 1., 1., 1., 1.]]), + m: np.array([[0.1, 0.1, 0.1]]) + }) + # Residual connections + self.assertAllClose(res_g_res, res_g + [1., 1., 1.]) + # States are left untouched + self.assertAllClose(res_m_new, res_m_new_res) + def testDeviceWrapper(self): with variable_scope.variable_scope( "root", initializer=init_ops.constant_initializer(0.5)): diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py index f7854e86c0c..304b6ae665f 100644 --- a/tensorflow/python/ops/rnn_cell_impl.py +++ b/tensorflow/python/ops/rnn_cell_impl.py @@ -786,13 +786,18 @@ class DropoutWrapper(RNNCell): class ResidualWrapper(RNNCell): """RNNCell wrapper that ensures cell inputs are added to the outputs.""" - def __init__(self, cell): + def __init__(self, cell, residual_fn=None): """Constructs a `ResidualWrapper` for `cell`. Args: cell: An instance of `RNNCell`. + residual_fn: (Optional) The function to map raw cell inputs and raw cell + outputs to the actual cell outputs of the residual network. + Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs + and outputs. """ self._cell = cell + self._residual_fn = residual_fn @property def state_size(self): @@ -807,7 +812,7 @@ class ResidualWrapper(RNNCell): return self._cell.zero_state(batch_size, dtype) def __call__(self, inputs, state, scope=None): - """Run the cell and add its inputs to its outputs. + """Run the cell and then apply the residual_fn on its inputs to its outputs. Args: inputs: cell inputs. @@ -822,13 +827,14 @@ class ResidualWrapper(RNNCell): ValueError: If cell inputs and outputs have different structure (value). """ outputs, new_state = self._cell(inputs, state, scope=scope) - nest.assert_same_structure(inputs, outputs) # Ensure shapes match def assert_shape_match(inp, out): inp.get_shape().assert_is_compatible_with(out.get_shape()) - nest.map_structure(assert_shape_match, inputs, outputs) - res_outputs = nest.map_structure( - lambda inp, out: inp + out, inputs, outputs) + def default_residual_fn(inputs, outputs): + nest.assert_same_structure(inputs, outputs) + nest.map_structure(assert_shape_match, inputs, outputs) + return nest.map_structure(lambda inp, out: inp + out, inputs, outputs) + res_outputs = (self._residual_fn or default_residual_fn)(inputs, outputs) return (res_outputs, new_state) diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt index b21d9a8ee33..a75e9e80802 100644 --- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt +++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt @@ -54,7 +54,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'cell\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'cell\', \'residual_fn\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "add_loss" From 6b3751f660dfa0675cc39d163b8224f2c070694e Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 25 Jul 2017 14:55:50 -0700 Subject: [PATCH 38/56] Make fft_length optional for inverse_stft PiperOrigin-RevId: 163127500 --- .../contrib/signal/python/ops/spectral_ops.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tensorflow/contrib/signal/python/ops/spectral_ops.py b/tensorflow/contrib/signal/python/ops/spectral_ops.py index 75bc0bd21d5..950d8f471c6 100644 --- a/tensorflow/contrib/signal/python/ops/spectral_ops.py +++ b/tensorflow/contrib/signal/python/ops/spectral_ops.py @@ -103,7 +103,7 @@ def stft(signals, frame_length, frame_step, fft_length=None, def inverse_stft(stfts, frame_length, frame_step, - fft_length, + fft_length=None, window_fn=functools.partial(window_ops.hann_window, periodic=True), name=None): @@ -118,7 +118,8 @@ def inverse_stft(stfts, frame_length: An integer scalar `Tensor`. The window length in samples. frame_step: An integer scalar `Tensor`. The number of samples to step. fft_length: An integer scalar `Tensor`. The size of the FFT that produced - `stfts`. + `stfts`. If not provided, uses the smallest power of 2 enclosing + `frame_length`. window_fn: A callable that takes a window length and a `dtype` keyword argument and returns a `[window_length]` `Tensor` of samples in the provided datatype. If set to `None`, no windowing is used. @@ -130,7 +131,8 @@ def inverse_stft(stfts, Raises: ValueError: If `stfts` is not at least rank 2, `frame_length` is not scalar, - `frame_step` is not scalar, or `fft_length` is not scalar. + `frame_step` is not scalar, or `fft_length` is not scalar, or + `frame_length` is greater than `fft_length`. [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform """ @@ -141,8 +143,21 @@ def inverse_stft(stfts, frame_length.shape.assert_has_rank(0) frame_step = ops.convert_to_tensor(frame_step, name='frame_step') frame_step.shape.assert_has_rank(0) - fft_length = ops.convert_to_tensor(fft_length, name='fft_length') - fft_length.shape.assert_has_rank(0) + if fft_length is None: + fft_length = _enclosing_power_of_two(frame_length) + else: + fft_length = ops.convert_to_tensor(fft_length, name='fft_length') + fft_length.shape.assert_has_rank(0) + + frame_length_static = tensor_util.constant_value( + frame_length) + fft_length_static = tensor_util.constant_value(fft_length) + if (frame_length_static is not None and fft_length_static is not None and + frame_length_static > fft_length_static): + raise ValueError('frame_length (%d) may not be larger than ' + 'fft_length (%d)' % (frame_length_static, + fft_length_static)) + real_frames = spectral_ops.irfft(stfts, [fft_length])[..., :frame_length] # Optionally window and overlap-add the inner 2 dimensions of real_frames From 07249f08867369899d39fc60442febdf1e36e6b5 Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Tue, 25 Jul 2017 15:03:34 -0700 Subject: [PATCH 39/56] Add LocalTempFilename function to Env class which creates a local temp file name PiperOrigin-RevId: 163128673 --- tensorflow/core/platform/env.cc | 91 ++++++++++++++++++++++++++++ tensorflow/core/platform/env.h | 6 ++ tensorflow/core/platform/env_test.cc | 28 +++++++++ 3 files changed, 125 insertions(+) diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index 2fdd989c9b9..568a22b295c 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include #include #include @@ -30,7 +31,10 @@ limitations under the License. #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/host_info.h" #include "tensorflow/core/platform/protobuf.h" namespace tensorflow { @@ -273,6 +277,93 @@ string Env::GetExecutablePath() { return exe_path; } +bool Env::LocalTempFilename(string* filename) { + std::vector dirs; + GetLocalTempDirectories(&dirs); + + // Try each directory, as they might be full, have inappropriate + // permissions or have different problems at times. + for (const string& dir : dirs) { +#ifdef __APPLE__ + uint64_t tid64; + pthread_threadid_np(nullptr, &tid64); + int32 tid = static_cast(tid64); + int32 pid = static_cast(getpid()); +#elif defined(PLATFORM_WINDOWS) + int32 tid = static_cast(GetCurrentThreadId()); + int32 pid = static_cast(GetCurrentProcessId()); +#else + int32 tid = static_cast(pthread_self()); + int32 pid = static_cast(getpid()); +#endif + uint64 now_microsec = NowMicros(); + + *filename = io::JoinPath( + dir, strings::Printf("tempfile-%s-%x-%d-%llx", port::Hostname().c_str(), + tid, pid, now_microsec)); + if (FileExists(*filename).ok()) { + filename->clear(); + } else { + return true; + } + } + return false; +} + +void Env::GetLocalTempDirectories(std::vector* list) { + list->clear(); +#ifdef PLATFORM_WINDOWS + // On windows we'll try to find a directory in this order: + // C:/Documents & Settings/whomever/TEMP (or whatever GetTempPath() is) + // C:/TMP/ + // C:/TEMP/ + // C:/WINDOWS/ or C:/WINNT/ + // . + char tmp[MAX_PATH]; + // GetTempPath can fail with either 0 or with a space requirement > bufsize. + // See http://msdn.microsoft.com/en-us/library/aa364992(v=vs.85).aspx + DWORD n = GetTempPathA(MAX_PATH, tmp); + if (n > 0 && n <= MAX_PATH) list->push_back(tmp); + list->push_back("C:\\tmp\\"); + list->push_back("C:\\temp\\"); +#else + // Directories, in order of preference. If we find a dir that + // exists, we stop adding other less-preferred dirs + const char* candidates[] = { + // Non-null only during unittest/regtest + getenv("TEST_TMPDIR"), + + // Explicitly-supplied temp dirs + getenv("TMPDIR"), + getenv("TMP"), + + // The old classic tmpdir + "/export/hda3/tmp", + + // If all else fails + "/tmp", + }; + + for (const char* d : candidates) { + if (!d || d[0] == '\0') continue; // Empty env var + + // Make sure we don't surprise anyone who's expecting a '/' + string dstr = d; + if (dstr[dstr.size() - 1] != '/') { + dstr += "/"; + } + + struct stat statbuf; + if (!stat(d, &statbuf) && S_ISDIR(statbuf.st_mode) && + !access(dstr.c_str(), 0)) { + // We found a dir that exists and is accessible - we're done. + list->push_back(dstr); + return; + } + } +#endif +} + Thread::~Thread() {} EnvWrapper::~EnvWrapper() {} diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h index 1b7e024b0f4..da8c3e2d7e8 100644 --- a/tensorflow/core/platform/env.h +++ b/tensorflow/core/platform/env.h @@ -215,6 +215,9 @@ class Env { /// symlinks if there is any. string GetExecutablePath(); + /// Creates a local unique temporary file name. Returns true if success. 
+ bool LocalTempFilename(string* filename); + // TODO(jeff,sanjay): Add back thread/thread-pool support if needed. // TODO(jeff,sanjay): if needed, tighten spec so relative to epoch, or // provide a routine to get the absolute time. @@ -279,6 +282,9 @@ class Env { const string& version) = 0; private: + // Returns a possible list of local temporary directories. + void GetLocalTempDirectories(std::vector* list); + std::unique_ptr file_system_registry_; TF_DISALLOW_COPY_AND_ASSIGN(Env); EnvTime* envTime = EnvTime::Default(); diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc index 7bc1882c86d..50dd0cd58b8 100644 --- a/tensorflow/core/platform/env_test.cc +++ b/tensorflow/core/platform/env_test.cc @@ -298,4 +298,32 @@ TEST_F(DefaultEnvTest, GetExecutablePath) { TF_EXPECT_OK(env->FileExists(env->GetExecutablePath())); } +TEST_F(DefaultEnvTest, LocalTempFilename) { + Env* env = Env::Default(); + string filename; + EXPECT_TRUE(env->LocalTempFilename(&filename)); + EXPECT_FALSE(env->FileExists(filename).ok()); + + // Write something to the temporary file. + std::unique_ptr file_to_write; + TF_CHECK_OK(env->NewWritableFile(filename, &file_to_write)); + TF_CHECK_OK(file_to_write->Append("Null")); + TF_CHECK_OK(file_to_write->Close()); + TF_CHECK_OK(env->FileExists(filename)); + + // Read from the temporary file and check content. + std::unique_ptr file_to_read; + TF_CHECK_OK(env->NewRandomAccessFile(filename, &file_to_read)); + StringPiece content; + char scratch[1024]; + CHECK_EQ(error::OUT_OF_RANGE, + file_to_read->Read(0 /* offset */, 1024 /* n */, &content, scratch) + .code()); + EXPECT_EQ("Null", content.ToString()); + + // Delete the temporary file. + TF_CHECK_OK(env->DeleteFile(filename)); + EXPECT_FALSE(env->FileExists(filename).ok()); +} + } // namespace tensorflow From 136494d3295a23e3ed0612773f224243915463b7 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 25 Jul 2017 15:25:13 -0700 Subject: [PATCH 40/56] Prune trivial ops (concatenation of a single tensor, AddN of a single tensor, ...) PiperOrigin-RevId: 163131793 --- .../trivial_test_graph_input_yielder.cc | 14 +++- tensorflow/core/grappler/op_types.cc | 5 ++ tensorflow/core/grappler/op_types.h | 1 + .../core/grappler/optimizers/model_pruner.cc | 27 +++++++- .../grappler/optimizers/model_pruner_test.cc | 66 +++++++++++++++---- 5 files changed, 93 insertions(+), 20 deletions(-) diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc index 446ae2df643..b1ec35e2687 100644 --- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc +++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc @@ -48,9 +48,17 @@ GraphDef CreateGraphDef(int num_stages, int width, int tensor_size, for (int i = 0; i < num_stages; i++) { std::vector this_stage; for (int j = 0; j < width; j++) { - Output combine = AddN( - s.WithDevice(device_names[use_multiple_devices ? j : 0]), last_stage); - this_stage.push_back(combine); + if (last_stage.size() == 1) { + Output unary_op = + Square(s.WithDevice(device_names[use_multiple_devices ? j : 0]), + last_stage[0]); + this_stage.push_back(unary_op); + } else { + Output combine = + AddN(s.WithDevice(device_names[use_multiple_devices ? 
j : 0]), + last_stage); + this_stage.push_back(combine); + } } last_stage = this_stage; } diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index 9b2584f970b..8584681220d 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -18,6 +18,11 @@ limitations under the License. namespace tensorflow { namespace grappler { +bool IsAddN(const NodeDef& node) { + const auto op = node.op(); + return op == "AddN"; +} + bool IsConcat(const NodeDef& node) { const auto op = node.op(); return op == "Concat" || op == "ConcatV2"; diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index 9c9dd22e2c9..d83cb777ed5 100644 --- a/tensorflow/core/grappler/op_types.h +++ b/tensorflow/core/grappler/op_types.h @@ -21,6 +21,7 @@ limitations under the License. namespace tensorflow { namespace grappler { +bool IsAddN(const NodeDef& node); bool IsConcat(const NodeDef& node); bool IsConstant(const NodeDef& node); bool IsDequeueOp(const NodeDef& node); diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc index df9aca8aa31..e313155563a 100644 --- a/tensorflow/core/grappler/optimizers/model_pruner.cc +++ b/tensorflow/core/grappler/optimizers/model_pruner.cc @@ -26,6 +26,29 @@ limitations under the License. namespace tensorflow { namespace grappler { +int NumNonControlInputs(const NodeDef& node) { + int num_inputs = node.input_size(); + for (int i = 0; i < node.input_size(); ++i) { + if (!node.input(i).empty() && node.input(i)[0] == '^') { + num_inputs--; + } + } + return num_inputs; +} + +bool IsTrivialOp(const NodeDef& node) { + // Remove the stop gradient nodes since they serve no purpose once the graph + // is built. Also remove Identity ops. + if (IsStopGradient(node) || IsIdentity(node)) { + return true; + } + if (IsAddN(node) && NumNonControlInputs(node) <= 1) { + return true; + } + + return false; +} + Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item, GraphDef* pruned_graph) { GraphRewriter rewriter(item); @@ -43,9 +66,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item, std::unordered_set nodes_to_delete; for (auto& node : item.graph.node()) { - // Remove the stop gradient nodes since they serve no purpose once the graph - // is built. Also remove Identity ops. - if (!IsStopGradient(node) && !IsIdentity(node)) { + if (!IsTrivialOp(node)) { continue; } // Don't remove nodes that must be preserved. 
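As a value-level illustration of why the pruner may now drop these nodes: an AddN over a single tensor is just that tensor, so (control dependencies and fetch nodes aside) removing it cannot change what the graph computes. The sketch below only demonstrates the arithmetic equivalence in Python; whether the Python wrapper materializes an AddN node for a one-element list is an implementation detail.

import numpy as np
import tensorflow as tf

x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
y = tf.add_n([x])      # a sum over one tensor: semantically an identity
z = tf.add_n([x, x])   # two or more non-control inputs: real work, kept

with tf.Session() as sess:
    y_val, z_val = sess.run([y, z])

assert np.allclose(y_val, [[1.0, 2.0], [3.0, 4.0]])  # identical to x
assert np.allclose(z_val, [[2.0, 4.0], [6.0, 8.0]])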
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc index fdfb3f41cf1..72d9c7bf275 100644 --- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc +++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc @@ -57,10 +57,10 @@ TEST_F(ModelPrunerTest, StopGradientPruning) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10}); - Output b = ops::AddN(s.WithOpName("b"), {a}); + Output b = ops::Sqrt(s.WithOpName("b"), {a}); Output c = ops::StopGradient(s.WithOpName("c"), b); Output d = ops::StopGradient(s.WithOpName("d"), c); - Output e = ops::AddN(s.WithOpName("e"), {d}); + Output e = ops::Sqrt(s.WithOpName("e"), {d}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); @@ -93,10 +93,10 @@ TEST_F(ModelPrunerTest, IdentityPruning) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10}); - Output b = ops::AddN(s.WithOpName("b"), {a}); + Output b = ops::Sqrt(s.WithOpName("b"), {a}); Output c = ops::Identity(s.WithOpName("c"), b); Output d = ops::Identity(s.WithOpName("d"), c); - Output e = ops::AddN(s.WithOpName("e"), {d}); + Output e = ops::Sqrt(s.WithOpName("e"), {d}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); @@ -126,15 +126,53 @@ TEST_F(ModelPrunerTest, IdentityPruning) { EXPECT_EQ(NodeName(b.name()), new_c.input(0)); } -TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) { +TEST_F(ModelPrunerTest, NoOpPruning) { // Build a simple graph with a few trivially prunable ops. tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10}); Output b = ops::AddN(s.WithOpName("b"), {a}); + Output c = ops::AddN(s.WithOpName("c"), {b}); + Output d = ops::AddN(s.WithOpName("d").WithControlDependencies(b), {c}); + Output e = ops::AddN(s.WithOpName("e"), {d}); + + GrapplerItem item; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + + ModelPruner pruner; + GraphDef output; + Status status = pruner.Optimize(nullptr, item, &output); + TF_EXPECT_OK(status); + + EXPECT_EQ(5, output.node_size()); + const NodeDef& new_a = output.node(0); + EXPECT_EQ(NodeName(a.name()), new_a.name()); + const NodeDef& new_b = output.node(1); + EXPECT_EQ(NodeName(b.name()), new_b.name()); + const NodeDef& new_c = output.node(2); + EXPECT_EQ(NodeName(c.name()), new_c.name()); + const NodeDef& new_d = output.node(3); + EXPECT_EQ(NodeName(d.name()), new_d.name()); + const NodeDef& new_e = output.node(4); + EXPECT_EQ(NodeName(e.name()), new_e.name()); + + EXPECT_EQ(1, new_e.input_size()); + EXPECT_EQ(NodeName(d.name()), new_e.input(0)); + EXPECT_EQ(2, new_d.input_size()); + EXPECT_EQ(NodeName(b.name()), new_d.input(0)); + EXPECT_EQ(1, new_c.input_size()); + EXPECT_EQ(NodeName(b.name()), new_c.input(0)); +} + +TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) { + // Build a simple graph with a few trivially prunable ops. 
+ tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10}); + Output b = ops::Sqrt(s.WithOpName("b"), {a}); Output c = ops::Identity(s.WithOpName("c"), b); Output d = ops::Identity(s.WithOpName("d"), c); - Output e = ops::AddN(s.WithOpName("e").WithControlDependencies(c), {d}); + Output e = ops::Sqrt(s.WithOpName("e").WithControlDependencies(c), {d}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); @@ -166,11 +204,11 @@ TEST_F(ModelPrunerTest, PruningPerservesCtrlDependencies) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10}); - Output b = ops::AddN(s.WithOpName("b"), {a}); - Output c = ops::AddN(s.WithOpName("c"), {a}); + Output b = ops::Sqrt(s.WithOpName("b"), {a}); + Output c = ops::Sqrt(s.WithOpName("c"), {a}); Output d = ops::Identity(s.WithOpName("d"), c); Output e = ops::Identity(s.WithOpName("e"), d); - Output f = ops::AddN(s.WithOpName("f"), {e}); + Output f = ops::Sqrt(s.WithOpName("f"), {e}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); @@ -216,7 +254,7 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) { tensorflow::Scope s = tensorflow::Scope::NewRootScope(); Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10}); - Output b = ops::AddN(s.WithOpName("b"), {a}); + Output b = ops::Sqrt(s.WithOpName("b"), {a}); Output c = ops::Identity(s.WithOpName("c"), b); GrapplerItem item; @@ -243,13 +281,13 @@ TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) { // Node i1 should be preserved. Output i1 = ops::Identity(s.WithOpName("i1").WithDevice("/gpu:0"), c); - Output a1 = ops::AddN(s.WithOpName("a1").WithDevice("/gpu:0"), {i1}); - Output a2 = ops::AddN(s.WithOpName("a2").WithDevice("/gpu:0"), {i1}); + Output a1 = ops::Sqrt(s.WithOpName("a1").WithDevice("/gpu:0"), {i1}); + Output a2 = ops::Sqrt(s.WithOpName("a2").WithDevice("/gpu:0"), {i1}); // Node i2 should be pruned since it resides on the sender's device. Output i2 = ops::Identity(s.WithOpName("i2").WithDevice("/cpu:0"), c); - Output a3 = ops::AddN(s.WithOpName("a3").WithDevice("/gpu:0"), {i2}); - Output a4 = ops::AddN(s.WithOpName("a4").WithDevice("/gpu:0"), {i2}); + Output a3 = ops::Sqrt(s.WithOpName("a3").WithDevice("/gpu:0"), {i2}); + Output a4 = ops::Sqrt(s.WithOpName("a4").WithDevice("/gpu:0"), {i2}); GrapplerItem item; TF_CHECK_OK(s.ToGraphDef(&item.graph)); From ff967763fdccdbd51cfe50b7669c3baf86f2fa17 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 16:03:55 -0700 Subject: [PATCH 41/56] Add GDN activation function. PiperOrigin-RevId: 163137487 --- .../contrib/layers/python/layers/layers.py | 302 +++++++++++++++++- .../layers/python/layers/layers_test.py | 50 +++ 2 files changed, 350 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index 8b3ccea9953..ff7545bb000 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,13 +32,16 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape +from tensorflow.python.layers import base from tensorflow.python.layers import convolutional as convolutional_layers from tensorflow.python.layers import core as core_layers -from tensorflow.python.layers import normalization as normalization_layers +from tensorflow.python.layers import normalization as normalization_layers from tensorflow.python.layers import pooling as pooling_layers from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import init_ops +from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import sparse_ops @@ -1281,7 +1285,7 @@ def convolution3d_transpose( trainable=True, scope=None): """Adds a convolution3d_transpose with an optional batch normalization layer. - + The function creates a variable called `weights`, representing the kernel, that is convolved with the input. If `batch_norm_params` is `None`, a second variable called 'biases' is added to the result of the operation. @@ -1808,6 +1812,300 @@ def layer_norm(inputs, outputs) +class GDN(base.Layer): + """Generalized divisive normalization layer. + + Based on the papers: + + "Density Modeling of Images using a Generalized Normalization + Transformation" + Johannes Ballé, Valero Laparra, Eero P. Simoncelli + https://arxiv.org/abs/1511.06281 + + "End-to-end Optimized Image Compression" + Johannes Ballé, Valero Laparra, Eero P. Simoncelli + https://arxiv.org/abs/1611.01704 + + Implements an activation function that is essentially a multivariate + generalization of a particular sigmoid-type function: + + y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j])) + + where i and j run over channels. This implementation never sums across spatial + dimensions. It is similar to local response normalization, but more powerful, + as beta and gamma are trainable parameters. + + Arguments: + inverse: If False (default), compute GDN response. If True, compute IGDN + response (one step of fixed point iteration to invert GDN; the division + is replaced by multiplication). + beta_min: Lower bound for beta, to prevent numerical error from causing + square root of zero or negative values. + gamma_init: The gamma matrix will be initialized as the identity matrix + multiplied with this value. If set to zero, the layer is effectively + initialized to the identity operation, since beta is initialized as one. + A good default setting is somewhere between 0 and 0.5. + reparam_offset: Offset added to the reparameterization of beta and gamma. + The reparameterization of beta and gamma as their square roots lets the + training slow down when their values are close to zero, which is desirable + as small values in the denominator can lead to a situation where gradient + noise on beta/gamma leads to extreme amounts of noise in the GDN + activations. However, without the offset, we would get zero gradients if + any elements of beta or gamma were exactly zero, and thus the training + could get stuck. To prevent this, we add this small constant. The default + value was empirically determined as a good starting point. 
Making it + bigger potentially leads to more gradient noise on the activations, making + it too small may lead to numerical precision issues. + data_format: Format of input tensor. Currently supports 'channels_first' and + 'channels_last'. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require reuse=True in such cases. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Properties: + inverse: Boolean, whether GDN is computed (True) or IGDN (False). + data_format: Format of input tensor. Currently supports 'channels_first' and + 'channels_last'. + beta: The beta parameter as defined above (1D TensorFlow tensor). + gamma: The gamma parameter as defined above (2D TensorFlow tensor). + """ + + def __init__(self, + inverse=False, + beta_min=1e-6, + gamma_init=.1, + reparam_offset=2 ** -18, + data_format='channels_last', + trainable=True, + name=None, + **kwargs): + super(GDN, self).__init__(trainable=trainable, name=name, **kwargs) + self.inverse = inverse + self._beta_min = beta_min + self._gamma_init = gamma_init + self._reparam_offset = reparam_offset + self.data_format = data_format + self._channel_axis() # trigger ValueError early + self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5) + + def _channel_axis(self): + try: + return {'channels_first': 1, 'channels_last': -1}[self.data_format] + except KeyError: + raise ValueError('Unsupported `data_format` for GDN layer: {}.'.format( + self.data_format)) + + @staticmethod + def _lower_bound(inputs, bound, name=None): + """Same as tf.maximum, but with helpful gradient for inputs < bound. + + The gradient is overwritten so that it is passed through if the input is not + hitting the bound. If it is, only gradients that push `inputs` higher than + the bound are passed through. No gradients are passed through to the bound. + + Args: + inputs: input tensor + bound: lower bound for the input tensor + name: name for this op + + Returns: + tf.maximum(inputs, bound) + """ + with ops.name_scope(name, 'GDNLowerBound', [inputs, bound]) as scope: + inputs = ops.convert_to_tensor(inputs, name='inputs') + bound = ops.convert_to_tensor(bound, name='bound') + with ops.get_default_graph().gradient_override_map( + {'Maximum': 'GDNLowerBound'}): + return math_ops.maximum(inputs, bound, name=scope) + + @ops.RegisterGradient('GDNLowerBound') + @staticmethod + def _lower_bound_grad(op, grad): + """Gradient for `_lower_bound`. 
+ + Args: + op: the tensorflow op for which to calculate a gradient + grad: gradient with respect to the output of the op + + Returns: + gradients with respect to the inputs of the op + """ + inputs = op.inputs[0] + bound = op.inputs[1] + pass_through_if = math_ops.logical_or(inputs >= bound, grad < 0) + return [math_ops.cast(pass_through_if, grad.dtype) * grad, None] + + def build(self, input_shape): + channel_axis = self._channel_axis() + input_shape = tensor_shape.TensorShape(input_shape) + num_channels = input_shape[channel_axis].value + if num_channels is None: + raise ValueError('The channel dimension of the inputs to `GDN` ' + 'must be defined.') + self._input_rank = input_shape.ndims + self.input_spec = base.InputSpec(ndim=input_shape.ndims, + axes={channel_axis: num_channels}) + + pedestal = array_ops.constant(self._reparam_offset ** 2, dtype=self.dtype) + beta_bound = array_ops.constant( + (self._beta_min + self._reparam_offset ** 2) ** .5, dtype=self.dtype) + gamma_bound = array_ops.constant(self._reparam_offset, dtype=self.dtype) + + def beta_initializer(shape, dtype=None, partition_info=None): + del partition_info # unused + return math_ops.sqrt(array_ops.ones(shape, dtype=dtype) + pedestal) + + def gamma_initializer(shape, dtype=None, partition_info=None): + del partition_info # unused + assert len(shape) == 2 + assert shape[0] == shape[1] + eye = linalg_ops.eye(shape[0], dtype=dtype) + return math_ops.sqrt(self._gamma_init * eye + pedestal) + + beta = self.add_variable('reparam_beta', + shape=[num_channels], + initializer=beta_initializer, + dtype=self.dtype, + trainable=True) + beta = self._lower_bound(beta, beta_bound) + self.beta = math_ops.square(beta) - pedestal + + gamma = self.add_variable('reparam_gamma', + shape=[num_channels, num_channels], + initializer=gamma_initializer, + dtype=self.dtype, + trainable=True) + gamma = self._lower_bound(gamma, gamma_bound) + self.gamma = math_ops.square(gamma) - pedestal + + self.built = True + + def call(self, inputs): + inputs = ops.convert_to_tensor(inputs, dtype=self.dtype) + ndim = self._input_rank + + shape = self.gamma.get_shape().as_list() + gamma = array_ops.reshape(self.gamma, (ndim - 2) * [1] + shape) + + # Compute normalization pool. 
+ if self.data_format == 'channels_first': + norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID', + data_format='NC' + 'DHW'[-(ndim - 2):]) + if ndim == 3: + norm_pool = array_ops.expand_dims(norm_pool, 2) + norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW') + norm_pool = array_ops.squeeze(norm_pool, [2]) + elif ndim == 5: + shape = array_ops.shape(norm_pool) + norm_pool = array_ops.reshape(norm_pool, shape[:3] + [-1]) + norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW') + norm_pool = array_ops.reshape(norm_pool, shape) + else: # ndim == 4 + norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW') + else: # channels_last + norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID') + norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NHWC') + norm_pool = math_ops.sqrt(norm_pool) + + if self.inverse: + outputs = inputs * norm_pool + else: + outputs = inputs / norm_pool + outputs.set_shape(inputs.get_shape()) + return outputs + + def _compute_output_shape(self, input_shape): + channel_axis = self._channel_axis() + input_shape = tensor_shape.TensorShape(input_shape) + if not 3 <= input_shape.ndim <= 5: + raise ValueError('`input_shape` must be of rank 3 to 5, inclusive.') + if input_shape[channel_axis].value is None: + raise ValueError( + 'The channel dimension of `input_shape` must be defined.') + return input_shape + + +def gdn(inputs, + inverse=False, + beta_min=1e-6, + gamma_init=.1, + reparam_offset=2 ** -18, + data_format='channels_last', + trainable=True, + name=None, + reuse=None): + """Functional interface for GDN layer. + + Based on the papers: + + "Density Modeling of Images using a Generalized Normalization + Transformation" + Johannes Ballé, Valero Laparra, Eero P. Simoncelli + https://arxiv.org/abs/1511.06281 + + "End-to-end Optimized Image Compression" + Johannes Ballé, Valero Laparra, Eero P. Simoncelli + https://arxiv.org/abs/1611.01704 + + Implements an activation function that is essentially a multivariate + generalization of a particular sigmoid-type function: + + y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j])) + + where i and j run over channels. This implementation never sums across spatial + dimensions. It is similar to local response normalization, but more powerful, + as beta and gamma are trainable parameters. + + Arguments: + inputs: Tensor input. + inverse: If False (default), compute GDN response. If True, compute IGDN + response (one step of fixed point iteration to invert GDN; the division + is replaced by multiplication). + beta_min: Lower bound for beta, to prevent numerical error from causing + square root of zero or negative values. + gamma_init: The gamma matrix will be initialized as the identity matrix + multiplied with this value. If set to zero, the layer is effectively + initialized to the identity operation, since beta is initialized as one. + A good default setting is somewhere between 0 and 0.5. + reparam_offset: Offset added to the reparameterization of beta and gamma. + The reparameterization of beta and gamma as their square roots lets the + training slow down when their values are close to zero, which is desirable + as small values in the denominator can lead to a situation where gradient + noise on beta/gamma leads to extreme amounts of noise in the GDN + activations. However, without the offset, we would get zero gradients if + any elements of beta or gamma were exactly zero, and thus the training + could get stuck. To prevent this, we add this small constant. 
The default + value was empirically determined as a good starting point. Making it + bigger potentially leads to more gradient noise on the activations, making + it too small may lead to numerical precision issues. + data_format: Format of input tensor. Currently supports 'channels_first' and + 'channels_last'. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + name: String, the name of the layer. Layers with the same name will + share weights, but to avoid mistakes we require reuse=True in such cases. + reuse: Boolean, whether to reuse the weights of a previous layer + by the same name. + + Returns: + Output tensor. + """ + layer = GDN(inverse=inverse, + beta_min=beta_min, + gamma_init=gamma_init, + reparam_offset=reparam_offset, + data_format=data_format, + trainable=trainable, + name=name, + dtype=inputs.dtype.base_dtype, + _scope=name, + _reuse=reuse) + return layer.apply(inputs) + + @add_arg_scope def max_pool2d(inputs, kernel_size, diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index 2d08a0cb91a..8867e069d1f 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -2772,6 +2772,56 @@ class LayerNormTest(test.TestCase): self.doOutputTest((1, 100, 100, 1)) +class GDNTest(test.TestCase): + + def _runGDN(self, x, shape, inverse, data_format): + inputs = array_ops.placeholder(dtypes.float32, shape) + outputs = _layers.gdn(inputs, inverse=inverse, data_format=data_format) + with self.test_session() as sess: + variables_lib.global_variables_initializer().run() + y, = sess.run([outputs], {inputs: x}) + return y + + def testInvalidDataFormat(self): + x = np.random.uniform(size=(1, 2, 3, 4)) + with self.assertRaises(ValueError): + self._runGDN(x, x.shape, False, 'NHWC') + + def testUnknownDim(self): + x = np.random.uniform(size=(1, 2, 3, 4)) + with self.assertRaises(ValueError): + self._runGDN(x, 4 * [None], False, 'channels_last') + + def testChannelsLast(self): + for ndim in [3, 4, 5]: + x = np.random.uniform(size=(1, 2, 3, 4)[:ndim]) + y = self._runGDN(x, x.shape, False, 'channels_last') + self.assertEqual(x.shape, y.shape) + self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + + def testChannelsFirst(self): + # `bias_add` doesn't support NCHW on CPU. + if test.is_gpu_available(cuda_only=True): + for ndim in [3, 4, 5]: + x = np.random.uniform(size=(4, 3, 2, 1)[:ndim]) + y = self._runGDN(x, x.shape, False, 'channels_first') + self.assertEqual(x.shape, y.shape) + self.assertAllClose( + y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + + def testWrongDims(self): + for ndim in [1, 2, 6]: + x = np.random.uniform(size=(1, 2, 3, 4, 3, 2)[:ndim]) + with self.assertRaises(ValueError): + self._runGDN(x, x.shape, False, 'channels_last') + + def testIGDN(self): + x = np.random.uniform(size=(1, 2, 3, 4)) + y = self._runGDN(x, x.shape, True, 'channels_last') + self.assertEqual(x.shape, y.shape) + self.assertAllClose(y, x * np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6) + + class MaxPool2DTest(test.TestCase): def testInvalidDataFormat(self): From fc7d2c803f3eeca68978b54eb1540e705bc6c2b3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 16:07:45 -0700 Subject: [PATCH 42/56] Use `to_float` and `to_int32` where appropriate. 
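A brief usage sketch for the GDN layer introduced in [PATCH 41/56] above, importing the module directly from its in-tree location (tensorflow/contrib/layers/python/layers/layers.py) as the new tests do; whether the symbol is also re-exported under a shorter tf.contrib.layers path is not assumed here. With the default initialization (beta = 1, gamma = 0.1 * identity) the layer reduces to an elementwise x / sqrt(1 + 0.1 * x**2), which is what the new layers_test cases verify:

import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import layers as _layers

inputs = tf.placeholder(tf.float32, [None, 32, 32, 16])  # NHWC feature map
outputs = _layers.gdn(inputs)                            # forward GDN

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    x = np.random.uniform(size=(4, 32, 32, 16)).astype(np.float32)
    y = sess.run(outputs, {inputs: x})
    # Default beta/gamma: per-element normalization, no cross-channel mixing yet.
    np.testing.assert_allclose(y, x / np.sqrt(1.0 + 0.1 * x ** 2), atol=1e-5)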
PiperOrigin-RevId: 163138030 --- tensorflow/python/training/input.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py index 93ba15ec815..19ecb85c60e 100644 --- a/tensorflow/python/training/input.py +++ b/tensorflow/python/training/input.py @@ -168,7 +168,7 @@ def input_producer(input_tensor, q, [enq], cancel_op=cancel_op)) if summary_name is not None: summary.scalar(summary_name, - math_ops.cast(q.size(), dtypes.float32) * (1. / capacity)) + math_ops.to_float(q.size()) * (1. / capacity)) return q @@ -643,7 +643,7 @@ def _shapes(tensor_list_list, shapes, enqueue_many): def _select_which_to_enqueue(tensor_list, keep_input): """Select which examples to enqueue based on vector `keep_input`.""" - select_i = math_ops.cast(keep_input, dtypes.int32) + select_i = math_ops.to_int32(keep_input) tensor_list = [ data_flow_ops.dynamic_partition(x, select_i, num_partitions=2)[1] for x in tensor_list] @@ -707,8 +707,7 @@ def _batch(tensors, batch_size, keep_input, num_threads=1, capacity=32, capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name) _enqueue(queue, tensor_list, num_threads, enqueue_many, keep_input) summary.scalar("fraction_of_%d_full" % capacity, - math_ops.cast(queue.size(), dtypes.float32) * - (1. / capacity)) + math_ops.to_float(queue.size()) * (1. / capacity)) if allow_smaller_final_batch: dequeued = queue.dequeue_up_to(batch_size, name=name) @@ -742,8 +741,7 @@ def _batch_join(tensors_list, batch_size, keep_input, capacity=32, capacity=capacity, dtypes=types, shapes=shapes, shared_name=shared_name) _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input) summary.scalar("fraction_of_%d_full" % capacity, - math_ops.cast(queue.size(), dtypes.float32) * - (1. / capacity)) + math_ops.to_float(queue.size()) * (1. / capacity)) if allow_smaller_final_batch: dequeued = queue.dequeue_up_to(batch_size, name=name) @@ -775,8 +773,8 @@ def _shuffle_batch(tensors, batch_size, capacity, min_after_dequeue, capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed, dtypes=types, shapes=shapes, shared_name=shared_name) _enqueue(queue, tensor_list, num_threads, enqueue_many, keep_input) - full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue), - dtypes.float32) * + full = (math_ops.to_float( + math_ops.maximum(0, queue.size() - min_after_dequeue)) * (1. / (capacity - min_after_dequeue))) # Note that name contains a '/' at the end so we intentionally do not place # a '/' after %s below. @@ -812,8 +810,8 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity, capacity=capacity, min_after_dequeue=min_after_dequeue, seed=seed, dtypes=types, shapes=shapes, shared_name=shared_name) _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input) - full = (math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue), - dtypes.float32) * + full = (math_ops.to_float( + math_ops.maximum(0, queue.size() - min_after_dequeue)) * (1. / (capacity - min_after_dequeue))) # Note that name contains a '/' at the end so we intentionally do not place # a '/' after %s below. From fe4b88c95ff511205e2b8d724809683b7828edf8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 16:43:59 -0700 Subject: [PATCH 43/56] Call AddDefaultAttrsToGraphDef() in grappler_item_builder.cc with op_registry including function library so that it doesn't fail for the graph with inline function library. 
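For reference, the shorthand casts adopted in [PATCH 42/56] above are exact equivalents of the longer forms they replace (a small sketch using the public TF 1.x names):

import tensorflow as tf

x = tf.constant([0, 1, 2])
a = tf.to_float(x)                            # same as tf.cast(x, tf.float32)
b = tf.to_int32(tf.constant([True, False]))   # same as tf.cast(..., tf.int32)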
PiperOrigin-RevId: 163142677 --- tensorflow/core/grappler/grappler_item_builder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc index 4518a7a78e4..0c2801e8bc3 100644 --- a/tensorflow/core/grappler/grappler_item_builder.cc +++ b/tensorflow/core/grappler/grappler_item_builder.cc @@ -82,10 +82,6 @@ Status OptimizeGraph(const GraphDef& graph_def, GraphDef* output_graph_def, // Inline all functions. GraphDef inlined_graph_def(graph_def); - // Populate default attrs to the NodeDefs in the GraphDef, which is required - // by inlining code. - TF_RETURN_IF_ERROR( - AddDefaultAttrsToGraphDef(&inlined_graph_def, *OpRegistry::Global(), 0)); for (int i = 0; i < inlined_graph_def.library().function().size(); i++) { FunctionDef* fdef = @@ -122,6 +118,10 @@ Status OptimizeGraph(const GraphDef& graph_def, GraphDef* output_graph_def, graph_ctor_opts.allow_internal_ops = true; graph_ctor_opts.expect_device_spec = false; std::unique_ptr graphptr(new Graph(function_library)); + // Populate default attrs to the NodeDefs in the GraphDef. + TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&inlined_graph_def, + *graphptr->op_registry(), 0)); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(graph_ctor_opts, inlined_graph_def, graphptr.get())); From 4ecc31ddd04f13c4d3bde48c28ff0646aa477bca Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 17:26:15 -0700 Subject: [PATCH 44/56] `.get_shape()` -> `.shape` in input.py. PiperOrigin-RevId: 163147646 --- tensorflow/python/training/input.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py index 19ecb85c60e..a602438e0ba 100644 --- a/tensorflow/python/training/input.py +++ b/tensorflow/python/training/input.py @@ -148,7 +148,7 @@ def input_producer(input_tensor, """ with ops.name_scope(name, "input_producer", [input_tensor]): input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor") - element_shape = input_tensor.get_shape()[1:].merge_with(element_shape) + element_shape = input_tensor.shape[1:].merge_with(element_shape) if not element_shape.is_fully_defined(): raise ValueError("Either `input_tensor` must have a fully defined shape " "or `element_shape` must be specified") @@ -465,7 +465,7 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input, def _sparse_meta_data(t, storing_op, map_op): if not isinstance(t, sparse_tensor.SparseTensor): return _SparseMetaData(False, None, None) - rank = t.dense_shape.get_shape().with_rank(1)[0] + rank = t.dense_shape.shape.with_rank(1)[0] if enqueue_many: rank -= 1 # If a shared map_op was provided, use that. 
Otherwise use the name of @@ -492,7 +492,7 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input, lambda: -1 * array_ops.ones(array_ops.shape(t)[0:1], dtypes.int64)) out_tensor.set_shape([None]) # necessary when t.ndims is unknown return out_tensor - if keep_input.get_shape().ndims == 1: + if keep_input.shape.ndims == 1: t = sparse_ops.sparse_retain(t, keep_input) store_f = lambda t, name, _: _store_many_sparse(t, shared_name=name) elif enqueue_many: @@ -577,13 +577,13 @@ def _validate_join(tensor_list_list): def _validate_keep_input(keep_input, enqueue_many): """Validate `keep_input` argument to conditional batching functions.""" keep_input = ops.convert_to_tensor(keep_input) - if keep_input.get_shape().ndims is None: + if keep_input.shape.ndims is None: raise ValueError( "`keep_input` dimensions must be known at graph construction.") - if not enqueue_many and keep_input.get_shape().ndims == 1: + if not enqueue_many and keep_input.shape.ndims == 1: raise ValueError( "`keep_input` cannot be a vector when `enqueue_many=False`.") - if keep_input.get_shape().ndims > 1: + if keep_input.shape.ndims > 1: raise ValueError("`keep_input` must be 0 or 1 dimensions.") return keep_input @@ -632,11 +632,11 @@ def _shapes(tensor_list_list, shapes, enqueue_many): for tl in tensor_list_list: for i in xrange(len0): - if tl[i].get_shape().ndims is None: + if tl[i].shape.ndims is None: raise ValueError("Cannot infer Tensor's rank: %s" % tl[i]) shapes = [_merge_shapes( - [tl[i].get_shape().as_list() for tl in tensor_list_list], enqueue_many) + [tl[i].shape.as_list() for tl in tensor_list_list], enqueue_many) for i in xrange(len0)] return shapes @@ -656,7 +656,7 @@ def _enqueue_join(queue, tensor_list_list, enqueue_many, keep_input): enqueue_fn = queue.enqueue_many else: enqueue_fn = queue.enqueue - if keep_input.get_shape().ndims == 1: + if keep_input.shape.ndims == 1: enqueue_ops = [enqueue_fn(_select_which_to_enqueue(x, keep_input)) for x in tensor_list_list] else: @@ -673,7 +673,7 @@ def _enqueue(queue, tensor_list, threads, enqueue_many, keep_input): enqueue_fn = queue.enqueue_many else: enqueue_fn = queue.enqueue - if keep_input.get_shape().ndims == 1: + if keep_input.shape.ndims == 1: enqueue_ops = [ enqueue_fn(_select_which_to_enqueue(tensor_list, keep_input))] * threads else: From 42eba6a7046d232de397ea2d0e627e91f095bc4c Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Tue, 25 Jul 2017 17:48:52 -0700 Subject: [PATCH 45/56] Implemented MemoryUsed function for hash tables. 
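Similarly, the `.get_shape()` to `.shape` migration in [PATCH 44/56] above is purely cosmetic: the property and the method return the same TensorShape object (a tiny sketch, assuming the TF 1.x Tensor API):

import tensorflow as tf

t = tf.placeholder(tf.float32, [None, 28, 28, 3])
assert t.shape.as_list() == t.get_shape().as_list()  # same TensorShape
assert t.shape.ndims == 4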
PiperOrigin-RevId: 163149969 --- tensorflow/core/kernels/lookup_table_op.cc | 34 ++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc index b33b98975d3..d721b3d5427 100644 --- a/tensorflow/core/kernels/lookup_table_op.cc +++ b/tensorflow/core/kernels/lookup_table_op.cc @@ -124,6 +124,20 @@ class MutableHashTableOfScalars final : public LookupInterface { TensorShape value_shape() const override { return TensorShape(); } + int64 MemoryUsed() const override { + int64 ret = 0; + mutex_lock l(mu_); + for (unsigned i = 0; i < table_.bucket_count(); ++i) { + size_t bucket_size = table_.bucket_size(i); + if (bucket_size == 0) { + ret++; + } else { + ret += bucket_size; + } + } + return sizeof(MutableHashTableOfScalars) + ret; + } + private: // TODO(andreasst): consider using a read/write lock or a concurrent map mutable mutex mu_; @@ -239,6 +253,20 @@ class MutableHashTableOfTensors final : public LookupInterface { TensorShape value_shape() const override { return value_shape_; } + int64 MemoryUsed() const override { + int64 ret = 0; + mutex_lock l(mu_); + for (unsigned i = 0; i < table_.bucket_count(); ++i) { + size_t bucket_size = table_.bucket_size(i); + if (bucket_size == 0) { + ret++; + } else { + ret += bucket_size; + } + } + return sizeof(MutableHashTableOfTensors) + ret; + } + private: TensorShape value_shape_; // TODO(andreasst): consider using a read/write lock or a concurrent map @@ -467,6 +495,12 @@ class MutableDenseHashTable final : public LookupInterface { TensorShape value_shape() const override { return value_shape_; } + int64 MemoryUsed() const override { + mutex_lock l(mu_); + return sizeof(MutableDenseHashTable) + key_buckets_.AllocatedBytes() + + value_buckets_.AllocatedBytes() + empty_key_.AllocatedBytes(); + } + private: Status DoInsert(OpKernelContext* ctx, const Tensor& key, const Tensor& value, bool ignore_empty_key) EXCLUSIVE_LOCKS_REQUIRED(mu_) { From 689cbda96444511bd37a01b125791c45a093bec3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 19:17:40 -0700 Subject: [PATCH 46/56] Plumbs rendezvous through function calls. PiperOrigin-RevId: 163157033 --- tensorflow/core/common_runtime/function.cc | 13 ++- .../core/common_runtime/memory_types.cc | 33 +++++--- tensorflow/core/framework/function.h | 6 +- tensorflow/core/graph/graph_partition.cc | 41 +++++++++- tensorflow/core/kernels/function_ops.cc | 2 + tensorflow/core/kernels/sendrecv_ops.cc | 29 ++++++- tensorflow/core/kernels/sendrecv_ops.h | 2 + tensorflow/python/framework/function_test.py | 81 +++++++++++++------ 8 files changed, 156 insertions(+), 51 deletions(-) diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 94b9d33c0cf..c9a4b476813 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -260,6 +260,8 @@ class CallOp : public AsyncOpKernel { done); FunctionLibraryRuntime::Options opts; opts.step_id = ctx->step_id(); + opts.rendezvous = ctx->rendezvous(); + opts.cancellation_manager = ctx->cancellation_manager(); opts.step_container = ctx->step_container(); opts.stats_collector = ctx->stats_collector(); opts.runner = ctx->runner(); @@ -545,23 +547,18 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle, Executor::Args exec_args; // Inherit the step_id from the caller. 
exec_args.step_id = opts.step_id; - exec_args.step_container = opts.step_container; - + exec_args.rendezvous = opts.rendezvous; exec_args.stats_collector = opts.stats_collector; exec_args.call_frame = frame; exec_args.cancellation_manager = opts.cancellation_manager; + exec_args.step_container = opts.step_container; exec_args.runner = *opts.runner; - // TODO(zhifengc): we can avoid creating rendez here if we know - // there is no send/recv nodes in the graph. - auto* rendez = new IntraProcessRendezvous(device_mgr_); - exec_args.rendezvous = rendez; item->exec->RunAsync( // Executor args exec_args, // Done callback. - [item, frame, rets, rendez, done](const Status& status) { + [item, frame, rets, done](const Status& status) { item->Unref(); - rendez->Unref(); Status s = status; if (s.ok()) { s = frame->GetRetvals(rets); diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc index 21ed73df77d..76b926ba400 100644 --- a/tensorflow/core/common_runtime/memory_types.cc +++ b/tensorflow/core/common_runtime/memory_types.cc @@ -104,10 +104,21 @@ Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g) { }); } -static Node* Send(Graph* g, const string& device_name, bool host, - const Edge* edge) { - const string tensor_name = - strings::StrCat("edge_", edge->id(), "_", edge->src()->name()); +// Given an Edge whose two endpoints have different memory types and +// are gonna to insert a pair of HostSend/Recv or Send/HostRecv nodes, +// GetTensorName() returns a unique string that we can use as part of +// the rendezvous key. The return string is guaranteed to be unique +// within this process. That is sufficient because EnsureMemoryTypes +// is only used on a TensorFlow graph that is gonna to be executed in +// a single tf device (hence within a single process). +static string GetTensorName(const Edge* edge) { + static std::atomic counter(0); + return strings::StrCat("memtype_", counter.fetch_add(1), "_", + edge->src()->name()); +} + +static Node* Send(Graph* g, const string& tensor_name, + const string& device_name, bool host, const Edge* edge) { Node* ret; TF_CHECK_OK(NodeBuilder(g->NewName("n"), host ? "_HostSend" : "_Send") .Input(edge->src(), edge->src_output()) @@ -115,14 +126,13 @@ static Node* Send(Graph* g, const string& device_name, bool host, .Attr("send_device", device_name) .Attr("send_device_incarnation", 0) // Do not care. .Attr("recv_device", device_name) + .Attr("_hostmem_sendrecv", true) .Finalize(g, &ret)); return ret; } -static Node* Recv(Graph* g, const string& device_name, bool host, - const Edge* edge) { - const string tensor_name = - strings::StrCat("edge_", edge->id(), "_", edge->src()->name()); +static Node* Recv(Graph* g, const string& tensor_name, + const string& device_name, bool host, const Edge* edge) { Node* ret; TF_CHECK_OK( NodeBuilder(g->NewName("n"), host ? 
"_HostRecv" : "_Recv") @@ -131,6 +141,7 @@ static Node* Recv(Graph* g, const string& device_name, bool host, .Attr("send_device", device_name) .Attr("send_device_incarnation", 0) .Attr("recv_device", device_name) + .Attr("_hostmem_sendrecv", true) .Finalize(g, &ret)); return ret; } @@ -171,8 +182,10 @@ Status EnsureMemoryTypes(const DeviceType& device_type, Endpoint key{e->src()->id(), e->src_output()}; auto iter = recv_nodes.find(key); if (iter == recv_nodes.end()) { - Node* send = Send(g, device_name, (item.sm == HOST_MEMORY), e); - recv = Recv(g, device_name, (item.dm == HOST_MEMORY), e); + const string tensor_name = GetTensorName(e); + Node* send = + Send(g, tensor_name, device_name, (item.sm == HOST_MEMORY), e); + recv = Recv(g, tensor_name, device_name, (item.dm == HOST_MEMORY), e); if (!has_ref) { // We only cache if there is no ref is involved. recv_nodes[key] = recv; diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index aeb924a709c..d840d2f001d 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -37,6 +37,7 @@ class CancellationManager; class GraphDef; class OpKernel; class ResourceMgr; +class Rendezvous; class ScopedStepContainer; class StepStatsCollector; class Node; @@ -398,11 +399,10 @@ class FunctionLibraryRuntime { // // Does not take ownership of "rets". struct Options { - CancellationManager* cancellation_manager = nullptr; // The id of the step that is calling this function. int64 step_id = 0; - - // Per-step container. + Rendezvous* rendezvous = nullptr; + CancellationManager* cancellation_manager = nullptr; ScopedStepContainer* step_container = nullptr; StepStatsCollector* stats_collector = nullptr; diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index 8ef2231aa2e..750e18a9ca0 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -896,6 +896,36 @@ Status AddControlEdges(const PartitionOptions& opts, return Status::OK(); } +// If 'ndef' is a Send or Recv, fills its attr send_device_incarnation +// if possible. +void SetIncarnation(const PartitionOptions& opts, NodeDef* ndef) { + StringPiece op(ndef->op()); + if (op != "_Send" && op != "_Recv") { + // Not related to send/recv. + return; + } + string send_device; + if (!GetNodeAttr(*ndef, "send_device", &send_device).ok()) { + // No known send_device. The runtime will detect it later. + return; + } + int64 incarnation = opts.get_incarnation(send_device); + AddNodeAttr("send_device_incarnation", incarnation, ndef); +} + +// Sets attribute send_device_incarnation of all Send/Recv nodes in +// 'gdef', if possible. +void SetIncarnation(const PartitionOptions& opts, GraphDef* gdef) { + for (NodeDef& ndef : *gdef->mutable_node()) { + SetIncarnation(opts, &ndef); + } + for (FunctionDef& fdef : *gdef->mutable_library()->mutable_function()) { + for (NodeDef& ndef : *fdef.mutable_node_def()) { + SetIncarnation(opts, &ndef); + } + } +} + Status Partition(const PartitionOptions& opts, Graph* g, std::unordered_map* partitions) { Status status; @@ -1130,10 +1160,15 @@ Status Partition(const PartitionOptions& opts, Graph* g, } } - // Set versions and function library + // Set versions, function library and send/recv incarnation. 
for (auto& it : *partitions) { - it.second.mutable_versions()->CopyFrom(g->versions()); - *it.second.mutable_library() = g->flib_def().ToProto(); + GraphDef* gdef = &it.second; + *gdef->mutable_versions() = g->versions(); + *gdef->mutable_library() = g->flib_def().ToProto(); + + // Traverse the graph to fill every send/recv op's incarnation + // information. + SetIncarnation(opts, gdef); } // Set the start times for recvs at the very end. diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc index 58c4ed37c4f..71f1b4e063a 100644 --- a/tensorflow/core/kernels/function_ops.cc +++ b/tensorflow/core/kernels/function_ops.cc @@ -237,6 +237,8 @@ class SymbolicGradientOp : public AsyncOpKernel { FunctionLibraryRuntime::Options opts; opts.step_id = ctx->step_id(); + opts.rendezvous = ctx->rendezvous(); + opts.cancellation_manager = ctx->cancellation_manager(); opts.runner = ctx->runner(); std::vector args; args.reserve(ctx->num_inputs()); diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 2a98a6530cf..9c242052f7c 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -39,6 +39,19 @@ static void GetRendezvousKey(const string& key_prefix, frame_iter.iter_id); } +static FrameAndIter GetFrameAndIter(OpKernelContext* ctx, + bool hostmem_sendrecv) { + if (hostmem_sendrecv && ctx->call_frame() != nullptr) { + // Host memory send/recv pairs are added by + // common_runtime/memory_types.cc. When the pair of nodes are + // added inside a function, we need to use the function call frame + // to formulate the unique rendezvous key. + return FrameAndIter(reinterpret_cast(ctx->call_frame()), 0); + } else { + return ctx->frame_iter(); + } +} + SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { string send_device; OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); @@ -56,6 +69,9 @@ SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { // proactively cache the rendezvous key for the top-level. GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_); OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_)); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } } void SendOp::Compute(OpKernelContext* ctx) { @@ -71,7 +87,8 @@ void SendOp::Compute(OpKernelContext* ctx) { args.device_context = ctx->op_device_context(); args.alloc_attrs = ctx->input_alloc_attr(0); - if (ctx->frame_iter() == FrameAndIter(0, 0)) { + FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + if (frame_iter == FrameAndIter(0, 0)) { // Use the cached rendezvous key. VLOG(2) << "Send " << parsed_key_.buf_; OP_REQUIRES_OK(ctx, @@ -79,7 +96,7 @@ void SendOp::Compute(OpKernelContext* ctx) { ctx->is_input_dead())); } else { Rendezvous::ParsedKey in_loop_parsed; - GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_); + GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_); VLOG(2) << "Send " << in_loop_parsed.buf_; OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed)); @@ -120,6 +137,9 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) { // proactively cache the rendezvous key for the top-level. 
GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_); OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_)); + if (!ctx->GetAttr("_hostmem_sendrecv", &hostmem_sendrecv_).ok()) { + hostmem_sendrecv_ = false; + } } void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { @@ -151,12 +171,13 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { }, std::move(done), _1, _2, _3, _4, _5); - if (ctx->frame_iter() == FrameAndIter(0, 0)) { + FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); + if (frame_iter == FrameAndIter(0, 0)) { VLOG(2) << "Recv " << parsed_key_.buf_; ctx->rendezvous()->RecvAsync(parsed_key_, args, std::move(done_cb)); } else { Rendezvous::ParsedKey in_loop_parsed; - GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_); + GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_); VLOG(2) << "Recv " << in_loop_parsed.buf_; OP_REQUIRES_OK_ASYNC( ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed), done); diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h index 67867e33086..1ff8eff13f7 100644 --- a/tensorflow/core/kernels/sendrecv_ops.h +++ b/tensorflow/core/kernels/sendrecv_ops.h @@ -29,6 +29,7 @@ class SendOp : public OpKernel { private: string key_prefix_; Rendezvous::ParsedKey parsed_key_; + bool hostmem_sendrecv_; TF_DISALLOW_COPY_AND_ASSIGN(SendOp); }; @@ -41,6 +42,7 @@ class RecvOp : public AsyncOpKernel { private: string key_prefix_; Rendezvous::ParsedKey parsed_key_; + bool hostmem_sendrecv_; TF_DISALLOW_COPY_AND_ASSIGN(RecvOp); }; diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py index c4e841b81f5..abfd0b76606 100644 --- a/tensorflow/python/framework/function_test.py +++ b/tensorflow/python/framework/function_test.py @@ -325,6 +325,25 @@ class FunctionTest(test.TestCase): "assertion"): _ = MyFn(100.0).eval() + def testWhileLoopCallsFunc(self): + with self.test_session(use_gpu=True) as sess: + + @function.Defun(dtypes.float32) + def Times2(x): + constant_two = constant_op.constant(2, dtypes.int32) + two_on_gpu = math_ops.cast(constant_two, dtypes.float32) + return x * two_on_gpu + + def Body(x): + x2 = Times2(x) + x2.set_shape([]) + return x2 + + loop = control_flow_ops.while_loop(lambda x: x < 1e5, Body, [1.0]) + + ans = sess.run(loop) + self.assertAllClose(ans, 131072.) 
+ def testControlFlowStrictness(self): """Inlined functions must not execute in a untaken control flow branch.""" @@ -588,8 +607,8 @@ class FunctionTest(test.TestCase): self.assertAllClose(vals[2], vals[3]) def testDeclare(self): - foo = function.Declare("Foo", [("x", dtypes.float32)], - [("y", dtypes.float32)]) + foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", + dtypes.float32)]) @function.Defun(dtypes.float32, func_name="Foo", out_names=["y"]) def FooImpl(x): @@ -607,8 +626,8 @@ class FunctionTest(test.TestCase): self.assertAllClose(expected, y.eval(feed_dict={x: rand})) def testDeclareUsedInDefun(self): - foo = function.Declare("Foo", [("x", dtypes.float32)], - [("y", dtypes.float32)]) + foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", + dtypes.float32)]) @function.Defun() def Bar(x): @@ -630,8 +649,8 @@ class FunctionTest(test.TestCase): self.assertAllClose(expected, y.eval(feed_dict={x: rand})) def testDeclareTypeMistake(self): - foo = function.Declare("Foo", [("x", dtypes.float32)], - [("y", dtypes.float32)]) + foo = function.Declare("Foo", [("x", dtypes.float32)], [("y", + dtypes.float32)]) @function.Defun(dtypes.float32, func_name="Foo", out_names=["y"]) def Foo(x): @@ -749,8 +768,9 @@ class FunctionTest(test.TestCase): self.assertAllEqual(v1, 20.) def testShapeFunction(self): - @function.Defun(dtypes.float32, - shape_func=lambda op: [op.inputs[0].get_shape()]) + + @function.Defun( + dtypes.float32, shape_func=lambda op: [op.inputs[0].get_shape()]) def Foo(x): return x + 1.0 @@ -767,11 +787,12 @@ class FunctionTest(test.TestCase): self.assertAllEqual(y.get_shape().as_list(), [1, 1, 2, 3]) def testVariableReuse(self): + def LinearWithReuse(input_tensor, reuse=None): size = input_tensor.shape.dims[1] with variable_scope.variable_scope("linear", reuse=reuse): - w = variable_scope.get_variable("w", shape=[size, size], - dtype=input_tensor.dtype) + w = variable_scope.get_variable( + "w", shape=[size, size], dtype=input_tensor.dtype) return math_ops.matmul(input_tensor, w) @function.Defun(dtypes.float32) @@ -789,15 +810,19 @@ class FunctionTest(test.TestCase): with session.Session() as sess: sess.run(variables.global_variables_initializer()) - output_val = sess.run(output_op, - feed_dict={input_op: np.random.rand(32, 100)}) + output_val = sess.run( + output_op, feed_dict={input_op: np.random.rand(32, 100)}) self.assertEqual(output_val.shape, (32, 100)) def testFunctionCallInDifferentVariableScopes(self): + @function.Defun(dtypes.float32) def Foo(inputs): - var = variable_scope.get_variable("var", shape=[10], dtype=dtypes.float32, - initializer=init_ops.ones_initializer()) + var = variable_scope.get_variable( + "var", + shape=[10], + dtype=dtypes.float32, + initializer=init_ops.ones_initializer()) return inputs + var input_op = array_ops.placeholder(shape=[10], dtype=dtypes.float32) @@ -813,8 +838,8 @@ class FunctionTest(test.TestCase): with session.Session() as sess: sess.run(variables.global_variables_initializer()) - out1, out2 = sess.run([out1_op, out2_op], - feed_dict={input_op: np.linspace(1, 10, 10)}) + out1, out2 = sess.run( + [out1_op, out2_op], feed_dict={input_op: np.linspace(1, 10, 10)}) self.assertAllEqual(out1, np.linspace(2, 11, 10)) self.assertAllEqual(out2, np.linspace(2, 11, 10)) @@ -852,12 +877,15 @@ class FunctionsFromProtos(test.TestCase): self.assertEqual(func.captured_inputs, new_func.captured_inputs) def testBasic(self): + @function.Defun(dtypes.float32, dtypes.float32) def Foo(x, y): return x + y + self.expectFunctionsEqual(Foo) def 
testGradFunc(self): + @function.Defun(dtypes.float32, dtypes.float32) def G(x, dy): return x * dy @@ -865,10 +893,12 @@ class FunctionsFromProtos(test.TestCase): @function.Defun(dtypes.float32, grad_func=G) def F(x): return math_ops.exp(x) - math_ops.exp(-x) + self.expectFunctionsEqual(F, grad_func=G) def testCapturedInputs(self): c = constant_op.constant(10, dtypes.int64) + @function.Defun(dtypes.int64) def Foo(x): return x + c @@ -885,6 +915,7 @@ class FunctionsFromProtos(test.TestCase): self.assertEqual(len(new_func.captured_inputs), 0) def testNestedFunctions(self): + @function.Defun(dtypes.float32) def Outer(x): @@ -958,6 +989,7 @@ class FunctionsFromProtos(test.TestCase): self.assertEqual(len(function._from_library(library)), 0) def testFromLibraryMissingFuncDef(self): + @function.Defun(dtypes.float32, dtypes.float32) def G1(x, dy): return x * dy @@ -989,6 +1021,7 @@ class FunctionsFromProtos(test.TestCase): function._from_library(library) def testFromLibraryCyclicGradFuncs(self): + @function.Defun(dtypes.float32) def F1(x): return math_ops.exp(x) - math_ops.exp(-x) @@ -1242,10 +1275,11 @@ class FunctionInlineControlTest(test.TestCase): inp = np.random.uniform(-1, 1, [16, 1]).astype(np.float32) run_metadata = config_pb2.RunMetadata() with session.Session(graph=g, config=cfg) as sess: - ans = sess.run([y, dx], {x: inp}, - run_metadata=run_metadata, - options=config_pb2.RunOptions( - trace_level=config_pb2.RunOptions.FULL_TRACE)) + ans = sess.run( + [y, dx], {x: inp}, + run_metadata=run_metadata, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE)) print(ans[0], np.sum(ans[1])) self.assertAllClose(ans[0], 255.971, rtol=1e-3) self.assertAllClose(np.sum(ans[1]), 13.0408, rtol=1e-3) @@ -1275,8 +1309,7 @@ class ModuleFunctionTest(test.TestCase): def testBasic(self): with ops.Graph().as_default(): a, b, c, d, e = [ - constant_op.constant( - [[_]], dtype=dtypes.float32) for _ in range(5) + constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5) ] y = Linear(a, b, c) z = Linear2(a, b, c, d, e) @@ -1295,7 +1328,8 @@ class VariableHoistingTest(test.TestCase): initializer=init_ops.random_uniform_initializer(seed=312), use_resource=use_resource) b = variable_scope.get_variable( - "b", (64), initializer=init_ops.zeros_initializer(), + "b", (64), + initializer=init_ops.zeros_initializer(), use_resource=use_resource), return math_ops.sigmoid(math_ops.matmul(x, w) + b) @@ -1354,5 +1388,6 @@ class VariableHoistingTest(test.TestCase): self._testSimpleModel(True, use_resource=True) self._testSimpleModel(False, use_resource=True) + if __name__ == "__main__": test.main() From 8b3cbeb507f2f38e7785fb0625f8b0ee9992d72a Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Tue, 25 Jul 2017 19:23:01 -0700 Subject: [PATCH 47/56] Automated g4 rollback of changelist 163128673 PiperOrigin-RevId: 163157277 --- tensorflow/core/platform/env.cc | 91 ---------------------------- tensorflow/core/platform/env.h | 6 -- tensorflow/core/platform/env_test.cc | 28 --------- 3 files changed, 125 deletions(-) diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc index 568a22b295c..2fdd989c9b9 100644 --- a/tensorflow/core/platform/env.cc +++ b/tensorflow/core/platform/env.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include #include #include @@ -31,10 +30,7 @@ limitations under the License. 
#include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/env_time.h" -#include "tensorflow/core/platform/host_info.h" #include "tensorflow/core/platform/protobuf.h" namespace tensorflow { @@ -277,93 +273,6 @@ string Env::GetExecutablePath() { return exe_path; } -bool Env::LocalTempFilename(string* filename) { - std::vector dirs; - GetLocalTempDirectories(&dirs); - - // Try each directory, as they might be full, have inappropriate - // permissions or have different problems at times. - for (const string& dir : dirs) { -#ifdef __APPLE__ - uint64_t tid64; - pthread_threadid_np(nullptr, &tid64); - int32 tid = static_cast(tid64); - int32 pid = static_cast(getpid()); -#elif defined(PLATFORM_WINDOWS) - int32 tid = static_cast(GetCurrentThreadId()); - int32 pid = static_cast(GetCurrentProcessId()); -#else - int32 tid = static_cast(pthread_self()); - int32 pid = static_cast(getpid()); -#endif - uint64 now_microsec = NowMicros(); - - *filename = io::JoinPath( - dir, strings::Printf("tempfile-%s-%x-%d-%llx", port::Hostname().c_str(), - tid, pid, now_microsec)); - if (FileExists(*filename).ok()) { - filename->clear(); - } else { - return true; - } - } - return false; -} - -void Env::GetLocalTempDirectories(std::vector* list) { - list->clear(); -#ifdef PLATFORM_WINDOWS - // On windows we'll try to find a directory in this order: - // C:/Documents & Settings/whomever/TEMP (or whatever GetTempPath() is) - // C:/TMP/ - // C:/TEMP/ - // C:/WINDOWS/ or C:/WINNT/ - // . - char tmp[MAX_PATH]; - // GetTempPath can fail with either 0 or with a space requirement > bufsize. - // See http://msdn.microsoft.com/en-us/library/aa364992(v=vs.85).aspx - DWORD n = GetTempPathA(MAX_PATH, tmp); - if (n > 0 && n <= MAX_PATH) list->push_back(tmp); - list->push_back("C:\\tmp\\"); - list->push_back("C:\\temp\\"); -#else - // Directories, in order of preference. If we find a dir that - // exists, we stop adding other less-preferred dirs - const char* candidates[] = { - // Non-null only during unittest/regtest - getenv("TEST_TMPDIR"), - - // Explicitly-supplied temp dirs - getenv("TMPDIR"), - getenv("TMP"), - - // The old classic tmpdir - "/export/hda3/tmp", - - // If all else fails - "/tmp", - }; - - for (const char* d : candidates) { - if (!d || d[0] == '\0') continue; // Empty env var - - // Make sure we don't surprise anyone who's expecting a '/' - string dstr = d; - if (dstr[dstr.size() - 1] != '/') { - dstr += "/"; - } - - struct stat statbuf; - if (!stat(d, &statbuf) && S_ISDIR(statbuf.st_mode) && - !access(dstr.c_str(), 0)) { - // We found a dir that exists and is accessible - we're done. - list->push_back(dstr); - return; - } - } -#endif -} - Thread::~Thread() {} EnvWrapper::~EnvWrapper() {} diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h index da8c3e2d7e8..1b7e024b0f4 100644 --- a/tensorflow/core/platform/env.h +++ b/tensorflow/core/platform/env.h @@ -215,9 +215,6 @@ class Env { /// symlinks if there is any. string GetExecutablePath(); - /// Creates a local unique temporary file name. Returns true if success. - bool LocalTempFilename(string* filename); - // TODO(jeff,sanjay): Add back thread/thread-pool support if needed. // TODO(jeff,sanjay): if needed, tighten spec so relative to epoch, or // provide a routine to get the absolute time. 
@@ -282,9 +279,6 @@ class Env { const string& version) = 0; private: - // Returns a possible list of local temporary directories. - void GetLocalTempDirectories(std::vector* list); - std::unique_ptr file_system_registry_; TF_DISALLOW_COPY_AND_ASSIGN(Env); EnvTime* envTime = EnvTime::Default(); diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc index 50dd0cd58b8..7bc1882c86d 100644 --- a/tensorflow/core/platform/env_test.cc +++ b/tensorflow/core/platform/env_test.cc @@ -298,32 +298,4 @@ TEST_F(DefaultEnvTest, GetExecutablePath) { TF_EXPECT_OK(env->FileExists(env->GetExecutablePath())); } -TEST_F(DefaultEnvTest, LocalTempFilename) { - Env* env = Env::Default(); - string filename; - EXPECT_TRUE(env->LocalTempFilename(&filename)); - EXPECT_FALSE(env->FileExists(filename).ok()); - - // Write something to the temporary file. - std::unique_ptr file_to_write; - TF_CHECK_OK(env->NewWritableFile(filename, &file_to_write)); - TF_CHECK_OK(file_to_write->Append("Null")); - TF_CHECK_OK(file_to_write->Close()); - TF_CHECK_OK(env->FileExists(filename)); - - // Read from the temporary file and check content. - std::unique_ptr file_to_read; - TF_CHECK_OK(env->NewRandomAccessFile(filename, &file_to_read)); - StringPiece content; - char scratch[1024]; - CHECK_EQ(error::OUT_OF_RANGE, - file_to_read->Read(0 /* offset */, 1024 /* n */, &content, scratch) - .code()); - EXPECT_EQ("Null", content.ToString()); - - // Delete the temporary file. - TF_CHECK_OK(env->DeleteFile(filename)); - EXPECT_FALSE(env->FileExists(filename).ok()); -} - } // namespace tensorflow From 19290f0567366bcdd3eb7adfe0f830a053cc1314 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 Jul 2017 20:27:22 -0700 Subject: [PATCH 48/56] Internal change. PiperOrigin-RevId: 163160761 --- tensorflow/compiler/xla/client/client_library.cc | 8 ++++++++ tensorflow/compiler/xla/client/client_library.h | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc index 801596c0add..b1663bc8157 100644 --- a/tensorflow/compiler/xla/client/client_library.cc +++ b/tensorflow/compiler/xla/client/client_library.cc @@ -149,4 +149,12 @@ ClientLibrary::GetOrCreateCompileOnlyClient( return cl; } +/* static */ void ClientLibrary::DestroyLocalInstances() { + ClientLibrary& client_library = Singleton(); + tensorflow::mutex_lock lock(client_library.service_mutex_); + + client_library.local_instances_.clear(); + client_library.compile_only_instances_.clear(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h index cff2b5124e8..a6f30d82e43 100644 --- a/tensorflow/compiler/xla/client/client_library.h +++ b/tensorflow/compiler/xla/client/client_library.h @@ -93,6 +93,11 @@ class ClientLibrary { static StatusOr GetOrCreateCompileOnlyClient( perftools::gputools::Platform* platform = nullptr); + // Clears the local instance and compile only instance caches. The client + // pointers returned by the previous GetOrCreateLocalClient() or + // GetOrCreateCompileOnlyClient() invocations are not valid anymore. + static void DestroyLocalInstances(); + private: // Returns the singleton instance of ClientLibrary. static ClientLibrary& Singleton(); From dbd0f2e2d46044c90e27290f4aa35ebccba2e557 Mon Sep 17 00:00:00 2001 From: Kay Zhu Date: Tue, 25 Jul 2017 21:35:55 -0700 Subject: [PATCH 49/56] [XLA] Use HloEvaluator for convolution in reference_util. 
PiperOrigin-RevId: 163164566 --- tensorflow/compiler/xla/BUILD | 3 + tensorflow/compiler/xla/reference_util.cc | 241 +++++++--------------- 2 files changed, 78 insertions(+), 166 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index e0a03a78f1d..ba90b13b383 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -563,6 +563,9 @@ cc_library( ":xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_evaluator", + "//tensorflow/compiler/xla/service:shape_inference", "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//tensorflow/core:lib", ], diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index 7ef5c6d916f..c851c38ea4f 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -20,6 +20,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/math/math_util.h" @@ -446,179 +449,85 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated( std::pair kernel_stride, Padding padding, std::pair lhs_dilation, std::pair rhs_dilation, ConvolutionDimensionNumbers dnums) { - std::array lhs_dimensions{{lhs.n1(), lhs.n2(), lhs.n3(), lhs.n4()}}; - std::array rhs_dimensions{{rhs.n1(), rhs.n2(), rhs.n3(), rhs.n4()}}; + HloComputation::Builder b("ConvArray4DGeneralDimensionDilated"); + auto lhs_literal = Literal::CreateR4FromArray4D(lhs); + auto rhs_literal = Literal::CreateR4FromArray4D(rhs); - const int64 ksy = kernel_stride.first; - const int64 ksx = kernel_stride.second; - const int64 dy = lhs_dilation.first; - const int64 dx = lhs_dilation.second; - const int64 dky = rhs_dilation.first; - const int64 dkx = rhs_dilation.second; - CHECK_GE(dky, 1); - CHECK_GE(dkx, 1); - CHECK_GE(dy, 1); - CHECK_GE(dx, 1); - - // Get all dimension sizes in lhs and rhs based on the given convolution - // dimension configuration. 
- const int64 ix = window_util::DilatedBound( - lhs_dimensions[dnums.spatial_dimensions(1)], dx); - const int64 iy = window_util::DilatedBound( - lhs_dimensions[dnums.spatial_dimensions(0)], dy); - const int64 iz = lhs_dimensions[dnums.feature_dimension()]; - const int64 samples = lhs_dimensions[dnums.batch_dimension()]; - const int64 kx = window_util::DilatedBound( - rhs_dimensions[dnums.kernel_spatial_dimensions(1)], dkx); - const int64 ky = window_util::DilatedBound( - rhs_dimensions[dnums.kernel_spatial_dimensions(0)], dky); - const int64 oz = rhs_dimensions[dnums.kernel_output_feature_dimension()]; - { - const int64 kiz = rhs_dimensions[dnums.kernel_input_feature_dimension()]; - CHECK_EQ(kiz, iz); + std::array ordered_kernel_strides; + std::array ordered_input_dimensions; + std::array ordered_kernel_dimensions; + if (dnums.kernel_spatial_dimensions(0) > dnums.kernel_spatial_dimensions(1)) { + ordered_kernel_strides[0] = kernel_stride.second; + ordered_kernel_strides[1] = kernel_stride.first; + } else { + ordered_kernel_strides[0] = kernel_stride.first; + ordered_kernel_strides[1] = kernel_stride.second; } - if (padding == Padding::kSame) { - // We reject same padding with kernel striding, since it's somewhat - // nonsensical. We can always follow up to implement this with the desired - // semantics if anybody actually uses it. - CHECK_EQ(1, ksy); - CHECK_EQ(1, ksx); - } + ordered_input_dimensions[0] = + lhs_literal->shape().dimensions(dnums.spatial_dimensions(0)); + ordered_input_dimensions[1] = + lhs_literal->shape().dimensions(dnums.spatial_dimensions(1)); + ordered_kernel_dimensions[0] = + rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(0)); + ordered_kernel_dimensions[1] = + rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(1)); - const int64 ox = - padding == Padding::kSame ? ix : window_util::StridedBound(ix, kx, ksx); - const int64 oy = - padding == Padding::kSame ? iy : window_util::StridedBound(iy, ky, ksy); - const int64 istartx = - padding == Padding::kValid ? 0 : kx % 2 == 0 ? -(kx / 2 - 1) : -kx / 2; - const int64 istarty = - padding == Padding::kValid ? 0 : ky % 2 == 0 ? -(ky / 2 - 1) : -ky / 2; - // Create the output result array and reset the values to 0. 
- std::array result_dimensions; - result_dimensions[dnums.batch_dimension()] = samples; - result_dimensions[dnums.feature_dimension()] = oz; - result_dimensions[dnums.spatial_dimensions(0)] = oy; - result_dimensions[dnums.spatial_dimensions(1)] = ox; + std::vector> paddings = + MakePadding(ordered_input_dimensions, ordered_kernel_dimensions, + ordered_kernel_strides, padding); + CHECK_EQ(paddings.size(), 2); + + Window window; + + WindowDimension dim; + dim.set_size( + rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(0))); + dim.set_stride(kernel_stride.first); + dim.set_padding_low(paddings[0].first); + dim.set_padding_high(paddings[0].second); + dim.set_window_dilation(rhs_dilation.first); + dim.set_base_dilation(lhs_dilation.first); + *window.add_dimensions() = dim; + + WindowDimension dim2; + dim2.set_size( + rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(1))); + dim2.set_stride(kernel_stride.second); + dim2.set_padding_low(paddings[1].first); + dim2.set_padding_high(paddings[1].second); + dim2.set_window_dilation(rhs_dilation.second); + dim2.set_base_dilation(lhs_dilation.second); + *window.add_dimensions() = dim2; + + const Shape& shape = + ShapeInference::InferConvolveShape(lhs_literal->shape(), + rhs_literal->shape(), window, dnums) + .ConsumeValueOrDie(); + + HloInstruction* lhs_instruction = + b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); + HloInstruction* rhs_instruction = + b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); + + b.AddInstruction(HloInstruction::CreateConvolve( + shape, lhs_instruction, rhs_instruction, window, dnums)); + + HloEvaluator evaluator; + std::unique_ptr result_literal = + evaluator.Evaluate(b.Build().get(), {}).ConsumeValueOrDie(); + + CHECK_EQ(ShapeUtil::Rank(result_literal->shape()), 4); auto result = - MakeUnique>(result_dimensions[0], result_dimensions[1], - result_dimensions[2], result_dimensions[3]); - result->Fill(0.0); + MakeUnique>(result_literal->shape().dimensions(0), + result_literal->shape().dimensions(1), + result_literal->shape().dimensions(2), + result_literal->shape().dimensions(3)); - const auto is_int32 = [](int64 x) { - return x >= std::numeric_limits::min() && - x <= std::numeric_limits::max(); - }; + result->Each([&](tensorflow::gtl::ArraySlice indices, float* value) { + *value = result_literal->Get(indices); + }); - // 64-bit idiv/mod are much more expensive x86-64 than 32-bit idiv/imod (at - // least on x86-64), so we avoid them where possible. - const auto fast_idiv64 = [&](int64 a, int64 b) { - if (is_int32(a) && is_int32(b)) { - return static_cast(static_cast(a) / static_cast(b)); - } - return a / b; - }; - const auto fast_imod64 = [&](int64 a, int64 b) { - if (is_int32(a) && is_int32(b)) { - return static_cast(static_cast(a) % static_cast(b)); - } - return a % b; - }; - - // Lambda to access the lhs operand at the given 4D index. - const auto lhs_element = [&](int64 batch, int64 feature, int64 height, - int64 width) { - if (fast_imod64(height, dy) != 0 || fast_imod64(width, dx) != 0) { - return 0.0f; - } - - std::array index; - index[dnums.batch_dimension()] = batch; - index[dnums.feature_dimension()] = feature; - index[dnums.spatial_dimensions(0)] = fast_idiv64(height, dy); - index[dnums.spatial_dimensions(1)] = fast_idiv64(width, dx); - return lhs(index[0], index[1], index[2], index[3]); - }; - - // Lambda to access the rhs operand at the given 4D index. 
height_over_dky - // should be equal to height / dky, and width_over_dkx should be equal to - // width / dkx. (This is an optimization to avoid doing divisions.) - const auto rhs_element = - [&](int64 kernel_output_feature, int64 kernel_input_feature, int64 height, - int64 width, int64 height_over_dky, int64 width_over_dkx) { - DCHECK_EQ(height % dky, 0); - DCHECK_EQ(width % dkx, 0); - DCHECK_EQ(height / dky, height_over_dky); - DCHECK_EQ(width / dkx, width_over_dkx); - - std::array index; - index[dnums.kernel_output_feature_dimension()] = kernel_output_feature; - index[dnums.kernel_input_feature_dimension()] = kernel_input_feature; - index[dnums.kernel_spatial_dimensions(0)] = height_over_dky; - index[dnums.kernel_spatial_dimensions(1)] = width_over_dkx; - return rhs(index[0], index[1], index[2], index[3]); - }; - - // Lambda to access the result data at the given 4D index. - const auto result_element = [&](int64 batch, int64 kernel_output_feature, - int64 height, int64 width) -> float& { - std::array index; - index[dnums.batch_dimension()] = batch; - index[dnums.feature_dimension()] = kernel_output_feature; - index[dnums.spatial_dimensions(0)] = height; - index[dnums.spatial_dimensions(1)] = width; - return (*result)(index[0], index[1], index[2], index[3]); - }; - - for (int64 oyi = 0; oyi < oy; ++oyi) { - for (int64 oxi = 0; oxi < ox; ++oxi) { - for (int64 sample = 0; sample < samples; ++sample) { - for (int64 izi = 0; izi < iz; ++izi) { - for (int64 ozi = 0; ozi < oz; ++ozi) { - for (int64 kyi = 0, kyi_over_dky = 0; kyi < ky; - kyi += dky, kyi_over_dky++) { - for (int64 kxi = 0, kxi_over_dkx = 0; kxi < kx; - kxi += dkx, kxi_over_dkx++) { - int64 iyi = istarty + ksy * oyi + kyi; - int64 ixi = istartx + ksx * oxi + kxi; - float input = (iyi >= iy || ixi >= ix || iyi < 0 || ixi < 0) - ? 0.0 - : lhs_element(sample, izi, iyi, ixi); - float gain = - rhs_element(ozi, izi, kyi, kxi, kyi_over_dky, kxi_over_dkx); - float addend = input * gain; - result_element(sample, ozi, oyi, oxi) += addend; - } - } - } - } - } - } - } - if (samples == 0 || kx == 0 || ky == 0 || ox == 0 || oy == 0 || oz == 0 || - iz == 0) { - LOG(INFO) << "Output will be trivially empty because one of these " - "dimensions is 0: samples: " - << samples << " kx: " << kx << " ky: " << ky << " ox: " << ox - << " oy: " << oy << " oz: " << oz << " iz: " << iz; - return result; - } - bool trivial = true; - auto check_trivial = [&trivial](tensorflow::gtl::ArraySlice indices, - float value) { - if (value != 0.0) { - trivial = false; - } - }; - lhs.Each(check_trivial); - if (trivial) { - LOG(FATAL) << "LHS is all 0.0."; - } - trivial = true; - rhs.Each(check_trivial); - if (trivial) { - LOG(FATAL) << "RHS is all 0.0."; - } return result; } From 0de74e448cb64409033ea169542ef4d1552bf36b Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Tue, 25 Jul 2017 21:39:11 -0700 Subject: [PATCH 50/56] Removing session reset since destroying the session object would delete its variables as well. Resetting session might unintentionally close other sessions in the same process. 
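A hypothetical Python illustration of the hazard this commit message describes: a process-wide reset tears down every in-process session, while destroying a single session releases only the resources it owns. The variable and session names below are illustrative, not taken from the patch.

    import tensorflow as tf

    v = tf.Variable(1.0)
    a = tf.Session()
    b = tf.Session()
    a.run(v.initializer)
    b.run(v.initializer)

    a.close()          # frees only a's resources; b keeps working
    print(b.run(v))    # 1.0

    # tf.Session.reset("") would instead close *every* session on the local
    # target, after which b.run(v) fails with "Session has been closed".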
PiperOrigin-RevId: 163164738 --- tensorflow/core/grappler/clusters/BUILD | 2 + .../core/grappler/clusters/single_machine.cc | 8 +- .../grappler/clusters/single_machine_test.cc | 123 ++++++++++++++++++ 3 files changed, 127 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD index 667023845cd..e7230b37543 100644 --- a/tensorflow/core/grappler/clusters/BUILD +++ b/tensorflow/core/grappler/clusters/BUILD @@ -114,7 +114,9 @@ cc_test( deps = [ ":single_machine", "//tensorflow/cc:cc_ops", + "//tensorflow/cc:resource_variable_ops", "//tensorflow/cc:scope", + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc index a1531f1cfcf..3481b2b158d 100644 --- a/tensorflow/core/grappler/clusters/single_machine.cc +++ b/tensorflow/core/grappler/clusters/single_machine.cc @@ -73,8 +73,6 @@ SingleMachine::~SingleMachine() { // when we delete the session. thread_pool_.reset(); - Reset(options_, {}).IgnoreError(); - CHECK(already_created); already_created = false; } @@ -277,11 +275,9 @@ Status SingleMachine::ResetSession() { // Make sure the session is properly closed TF_RETURN_IF_ERROR(Shutdown()); - // We need to Reset the session to ensure that all the variables are - // deleted. But first we need to delete the session since Reset() - // deletes some of the containers referenced by the session. + // Destroying the object deletes all its varibles as well. This is only true + // for DirectSession. session_.reset(); - TF_RETURN_IF_ERROR(Reset(options_, {})); } LOG(INFO) << "Starting new session"; diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc index b73b084793e..d8660e11a28 100644 --- a/tensorflow/core/grappler/clusters/single_machine_test.cc +++ b/tensorflow/core/grappler/clusters/single_machine_test.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/core/grappler/clusters/single_machine.h" #include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/framework/cost_graph.pb.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -24,6 +27,7 @@ limitations under the License. 
#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/queue_runner.pb.h" namespace tensorflow { namespace grappler { @@ -349,6 +353,7 @@ TEST_F(SingleMachineTest, InitializationMemory) { } namespace { + template inline void SetNodeAttr(const string& key, const T& value, NodeDef* node) { AttrValue attr_value; @@ -463,6 +468,124 @@ TEST_F(SingleMachineTest, PersistentMemory) { EXPECT_TRUE(found_hashtable); } +#if defined(PLATFORM_GOOGLE) +namespace { + +SessionOptions GetSessionOption(int num_cpu_cores, int num_gpus) { + SessionOptions options; + // Copied from single_machine.h + (*options.config.mutable_device_count())["CPU"] = 1; + if (num_gpus > 0) { + (*options.config.mutable_device_count())["GPU"] = num_gpus; + } + CHECK_GE(num_cpu_cores, 1); + options.config.set_intra_op_parallelism_threads(num_cpu_cores); + options.config.add_session_inter_op_thread_pool()->set_num_threads( + num_cpu_cores); + return options; +} + +Status GetDeviceMemoryStats( + const SessionOptions& session_option, + std::unordered_map* allocator_stats_by_device) { + std::vector devices; + TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(session_option, + "" /* name_prefix */, &devices)); + allocator_stats_by_device->clear(); + for (Device* device : devices) { + AllocatorStats stats; + auto* allocator = device->GetAllocator(AllocatorAttributes()); + if (!allocator->TracksAllocationSizes()) { + return Status(error::INVALID_ARGUMENT, + "Tracking allocation is not enabled."); + } + allocator->GetStats(&stats); + (*allocator_stats_by_device)[device->name()] = stats; + } + return Status::OK(); +} + +} // namespace + +TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + // Add a variable and initializer. + Output a = ops::Variable(s.WithOpName("a"), TensorShape({128, 256}), + DataType::DT_FLOAT); + Output a_init = + ops::RandomNormal(s.WithOpName("a/init"), {128, 256}, DataType::DT_FLOAT); + Output a_init_assign = ops::Assign(s.WithOpName("a/init/assign"), a, a_init); + + // Add a resource variable. + Output b = + ops::VarHandleOp(s.WithOpName("b"), DataType::DT_FLOAT, {256, 512}); + Output b_read = + ops::ReadVariableOp(s.WithOpName("b/read"), b, DataType::DT_FLOAT); + Output b_init = + ops::RandomNormal(s.WithOpName("b/init"), {256, 512}, DataType::DT_FLOAT); + auto b_init_assign = + ops::AssignVariableOp(s.WithOpName("b/init/assign"), b, b_init); + + // Add a queue. + ops::FIFOQueue queue(s.WithOpName("queue"), {DataType::DT_STRING}); + Output some_string = + ops::Const(s.WithOpName("some_string"), string("nothing")); + ops::QueueEnqueue enqueue(s.WithOpName("enqueue"), queue, {some_string}); + ops::QueueDequeue dequeue(s.WithOpName("dequeue"), queue, + {DataType::DT_STRING}); + + // Add a IdentityReader. 
+ ops::IdentityReader reader(s.WithOpName("identity_reader")); + ops::ReaderRead read(s.WithOpName("read_from_queue"), reader, queue); + + Output var_mul = ops::MatMul(s.WithOpName("var_matmul"), a, b_read); + + GrapplerItem item; + TF_CHECK_OK(s.ToGraphDef(&item.graph)); + + QueueRunnerDef queue_runner; + queue_runner.set_queue_name("queue"); + *queue_runner.add_enqueue_op_name() = "enqueue"; + item.queue_runners.push_back(queue_runner); + + item.init_ops.push_back("a/init/assign"); + item.init_ops.push_back("b/init/assign"); + item.fetch.push_back("var_matmul"); + item.fetch.push_back("dequeue"); + + // Run the graph + TF_CHECK_OK(cluster_->Initialize(item)); + EnableCPUAllocatorStats(true); + + SessionOptions options = + GetSessionOption(3 /* cpu cores */, 0 /* num gpus */); + std::unordered_map device_memory_before; + TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_before)); + EXPECT_EQ(device_memory_before.size(), 1); + + RunMetadata metadata; + TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata)); + + // Check there is memory that is not released. + std::unordered_map device_memory; + TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory)); + EXPECT_EQ(device_memory.size(), 1); + EXPECT_GT(device_memory.begin()->second.bytes_in_use, 0); + + // Reset cluster_ would release all memory. + cluster_.reset(); + std::unordered_map device_memory_after; + TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_after)); + + // Check memory used by resources are released after cluster destruction. + EXPECT_EQ(device_memory_before.size(), 1); + EXPECT_EQ(device_memory_after.size(), 1); + EXPECT_EQ(device_memory_before.begin()->second.bytes_in_use, 0); + EXPECT_EQ(device_memory_after.begin()->second.bytes_in_use, 0); +} +#endif + } // namespace } // namespace grappler } // namespace tensorflow From b01346de8b5893a09d50ff4d9c80ca442a327a76 Mon Sep 17 00:00:00 2001 From: Kay Zhu Date: Tue, 25 Jul 2017 23:16:04 -0700 Subject: [PATCH 51/56] Automated g4 rollback of changelist 163164566 PiperOrigin-RevId: 163170549 --- tensorflow/compiler/xla/BUILD | 3 - tensorflow/compiler/xla/reference_util.cc | 241 +++++++++++++++------- 2 files changed, 166 insertions(+), 78 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index ba90b13b383..e0a03a78f1d 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -563,9 +563,6 @@ cc_library( ":xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_evaluator", - "//tensorflow/compiler/xla/service:shape_inference", "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//tensorflow/core:lib", ], diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index c851c38ea4f..7ef5c6d916f 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -20,9 +20,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" -#include "tensorflow/compiler/xla/service/hlo_evaluator.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/math/math_util.h" @@ -449,85 +446,179 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated( std::pair kernel_stride, Padding padding, std::pair lhs_dilation, std::pair rhs_dilation, ConvolutionDimensionNumbers dnums) { - HloComputation::Builder b("ConvArray4DGeneralDimensionDilated"); - auto lhs_literal = Literal::CreateR4FromArray4D(lhs); - auto rhs_literal = Literal::CreateR4FromArray4D(rhs); + std::array lhs_dimensions{{lhs.n1(), lhs.n2(), lhs.n3(), lhs.n4()}}; + std::array rhs_dimensions{{rhs.n1(), rhs.n2(), rhs.n3(), rhs.n4()}}; - std::array ordered_kernel_strides; - std::array ordered_input_dimensions; - std::array ordered_kernel_dimensions; - if (dnums.kernel_spatial_dimensions(0) > dnums.kernel_spatial_dimensions(1)) { - ordered_kernel_strides[0] = kernel_stride.second; - ordered_kernel_strides[1] = kernel_stride.first; - } else { - ordered_kernel_strides[0] = kernel_stride.first; - ordered_kernel_strides[1] = kernel_stride.second; + const int64 ksy = kernel_stride.first; + const int64 ksx = kernel_stride.second; + const int64 dy = lhs_dilation.first; + const int64 dx = lhs_dilation.second; + const int64 dky = rhs_dilation.first; + const int64 dkx = rhs_dilation.second; + CHECK_GE(dky, 1); + CHECK_GE(dkx, 1); + CHECK_GE(dy, 1); + CHECK_GE(dx, 1); + + // Get all dimension sizes in lhs and rhs based on the given convolution + // dimension configuration. + const int64 ix = window_util::DilatedBound( + lhs_dimensions[dnums.spatial_dimensions(1)], dx); + const int64 iy = window_util::DilatedBound( + lhs_dimensions[dnums.spatial_dimensions(0)], dy); + const int64 iz = lhs_dimensions[dnums.feature_dimension()]; + const int64 samples = lhs_dimensions[dnums.batch_dimension()]; + const int64 kx = window_util::DilatedBound( + rhs_dimensions[dnums.kernel_spatial_dimensions(1)], dkx); + const int64 ky = window_util::DilatedBound( + rhs_dimensions[dnums.kernel_spatial_dimensions(0)], dky); + const int64 oz = rhs_dimensions[dnums.kernel_output_feature_dimension()]; + { + const int64 kiz = rhs_dimensions[dnums.kernel_input_feature_dimension()]; + CHECK_EQ(kiz, iz); } - ordered_input_dimensions[0] = - lhs_literal->shape().dimensions(dnums.spatial_dimensions(0)); - ordered_input_dimensions[1] = - lhs_literal->shape().dimensions(dnums.spatial_dimensions(1)); - ordered_kernel_dimensions[0] = - rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(0)); - ordered_kernel_dimensions[1] = - rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(1)); + if (padding == Padding::kSame) { + // We reject same padding with kernel striding, since it's somewhat + // nonsensical. We can always follow up to implement this with the desired + // semantics if anybody actually uses it. 
+ CHECK_EQ(1, ksy); + CHECK_EQ(1, ksx); + } - std::vector> paddings = - MakePadding(ordered_input_dimensions, ordered_kernel_dimensions, - ordered_kernel_strides, padding); - CHECK_EQ(paddings.size(), 2); - - Window window; - - WindowDimension dim; - dim.set_size( - rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(0))); - dim.set_stride(kernel_stride.first); - dim.set_padding_low(paddings[0].first); - dim.set_padding_high(paddings[0].second); - dim.set_window_dilation(rhs_dilation.first); - dim.set_base_dilation(lhs_dilation.first); - *window.add_dimensions() = dim; - - WindowDimension dim2; - dim2.set_size( - rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(1))); - dim2.set_stride(kernel_stride.second); - dim2.set_padding_low(paddings[1].first); - dim2.set_padding_high(paddings[1].second); - dim2.set_window_dilation(rhs_dilation.second); - dim2.set_base_dilation(lhs_dilation.second); - *window.add_dimensions() = dim2; - - const Shape& shape = - ShapeInference::InferConvolveShape(lhs_literal->shape(), - rhs_literal->shape(), window, dnums) - .ConsumeValueOrDie(); - - HloInstruction* lhs_instruction = - b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal))); - HloInstruction* rhs_instruction = - b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal))); - - b.AddInstruction(HloInstruction::CreateConvolve( - shape, lhs_instruction, rhs_instruction, window, dnums)); - - HloEvaluator evaluator; - std::unique_ptr result_literal = - evaluator.Evaluate(b.Build().get(), {}).ConsumeValueOrDie(); - - CHECK_EQ(ShapeUtil::Rank(result_literal->shape()), 4); + const int64 ox = + padding == Padding::kSame ? ix : window_util::StridedBound(ix, kx, ksx); + const int64 oy = + padding == Padding::kSame ? iy : window_util::StridedBound(iy, ky, ksy); + const int64 istartx = + padding == Padding::kValid ? 0 : kx % 2 == 0 ? -(kx / 2 - 1) : -kx / 2; + const int64 istarty = + padding == Padding::kValid ? 0 : ky % 2 == 0 ? -(ky / 2 - 1) : -ky / 2; + // Create the output result array and reset the values to 0. + std::array result_dimensions; + result_dimensions[dnums.batch_dimension()] = samples; + result_dimensions[dnums.feature_dimension()] = oz; + result_dimensions[dnums.spatial_dimensions(0)] = oy; + result_dimensions[dnums.spatial_dimensions(1)] = ox; auto result = - MakeUnique>(result_literal->shape().dimensions(0), - result_literal->shape().dimensions(1), - result_literal->shape().dimensions(2), - result_literal->shape().dimensions(3)); + MakeUnique>(result_dimensions[0], result_dimensions[1], + result_dimensions[2], result_dimensions[3]); + result->Fill(0.0); - result->Each([&](tensorflow::gtl::ArraySlice indices, float* value) { - *value = result_literal->Get(indices); - }); + const auto is_int32 = [](int64 x) { + return x >= std::numeric_limits::min() && + x <= std::numeric_limits::max(); + }; + // 64-bit idiv/mod are much more expensive x86-64 than 32-bit idiv/imod (at + // least on x86-64), so we avoid them where possible. + const auto fast_idiv64 = [&](int64 a, int64 b) { + if (is_int32(a) && is_int32(b)) { + return static_cast(static_cast(a) / static_cast(b)); + } + return a / b; + }; + const auto fast_imod64 = [&](int64 a, int64 b) { + if (is_int32(a) && is_int32(b)) { + return static_cast(static_cast(a) % static_cast(b)); + } + return a % b; + }; + + // Lambda to access the lhs operand at the given 4D index. 
+ const auto lhs_element = [&](int64 batch, int64 feature, int64 height, + int64 width) { + if (fast_imod64(height, dy) != 0 || fast_imod64(width, dx) != 0) { + return 0.0f; + } + + std::array index; + index[dnums.batch_dimension()] = batch; + index[dnums.feature_dimension()] = feature; + index[dnums.spatial_dimensions(0)] = fast_idiv64(height, dy); + index[dnums.spatial_dimensions(1)] = fast_idiv64(width, dx); + return lhs(index[0], index[1], index[2], index[3]); + }; + + // Lambda to access the rhs operand at the given 4D index. height_over_dky + // should be equal to height / dky, and width_over_dkx should be equal to + // width / dkx. (This is an optimization to avoid doing divisions.) + const auto rhs_element = + [&](int64 kernel_output_feature, int64 kernel_input_feature, int64 height, + int64 width, int64 height_over_dky, int64 width_over_dkx) { + DCHECK_EQ(height % dky, 0); + DCHECK_EQ(width % dkx, 0); + DCHECK_EQ(height / dky, height_over_dky); + DCHECK_EQ(width / dkx, width_over_dkx); + + std::array index; + index[dnums.kernel_output_feature_dimension()] = kernel_output_feature; + index[dnums.kernel_input_feature_dimension()] = kernel_input_feature; + index[dnums.kernel_spatial_dimensions(0)] = height_over_dky; + index[dnums.kernel_spatial_dimensions(1)] = width_over_dkx; + return rhs(index[0], index[1], index[2], index[3]); + }; + + // Lambda to access the result data at the given 4D index. + const auto result_element = [&](int64 batch, int64 kernel_output_feature, + int64 height, int64 width) -> float& { + std::array index; + index[dnums.batch_dimension()] = batch; + index[dnums.feature_dimension()] = kernel_output_feature; + index[dnums.spatial_dimensions(0)] = height; + index[dnums.spatial_dimensions(1)] = width; + return (*result)(index[0], index[1], index[2], index[3]); + }; + + for (int64 oyi = 0; oyi < oy; ++oyi) { + for (int64 oxi = 0; oxi < ox; ++oxi) { + for (int64 sample = 0; sample < samples; ++sample) { + for (int64 izi = 0; izi < iz; ++izi) { + for (int64 ozi = 0; ozi < oz; ++ozi) { + for (int64 kyi = 0, kyi_over_dky = 0; kyi < ky; + kyi += dky, kyi_over_dky++) { + for (int64 kxi = 0, kxi_over_dkx = 0; kxi < kx; + kxi += dkx, kxi_over_dkx++) { + int64 iyi = istarty + ksy * oyi + kyi; + int64 ixi = istartx + ksx * oxi + kxi; + float input = (iyi >= iy || ixi >= ix || iyi < 0 || ixi < 0) + ? 0.0 + : lhs_element(sample, izi, iyi, ixi); + float gain = + rhs_element(ozi, izi, kyi, kxi, kyi_over_dky, kxi_over_dkx); + float addend = input * gain; + result_element(sample, ozi, oyi, oxi) += addend; + } + } + } + } + } + } + } + if (samples == 0 || kx == 0 || ky == 0 || ox == 0 || oy == 0 || oz == 0 || + iz == 0) { + LOG(INFO) << "Output will be trivially empty because one of these " + "dimensions is 0: samples: " + << samples << " kx: " << kx << " ky: " << ky << " ox: " << ox + << " oy: " << oy << " oz: " << oz << " iz: " << iz; + return result; + } + bool trivial = true; + auto check_trivial = [&trivial](tensorflow::gtl::ArraySlice indices, + float value) { + if (value != 0.0) { + trivial = false; + } + }; + lhs.Each(check_trivial); + if (trivial) { + LOG(FATAL) << "LHS is all 0.0."; + } + trivial = true; + rhs.Each(check_trivial); + if (trivial) { + LOG(FATAL) << "RHS is all 0.0."; + } return result; } From 5c41a7cea20e4e8e6b3cbe0824d5d67d56238394 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Wed, 26 Jul 2017 00:13:14 -0700 Subject: [PATCH 52/56] DirectSession: Add accessor to DeviceMgr. 
This will enable some experimentation with keeping TF_Tensor objects in the C API backed by device memory (avoiding copies to host memory). PiperOrigin-RevId: 163173902 --- tensorflow/core/common_runtime/direct_session.h | 6 ++++-- .../core/common_runtime/direct_session_test.cc | 11 +++++++++++ tensorflow/core/public/session.h | 13 +++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h index c3cc573f2e6..cfc029132ae 100644 --- a/tensorflow/core/common_runtime/direct_session.h +++ b/tensorflow/core/common_runtime/direct_session.h @@ -98,14 +98,16 @@ class DirectSession : public Session { ::tensorflow::Status ListDevices( std::vector* response) override; ::tensorflow::Status Close() override; + ::tensorflow::Status LocalDeviceManager(const DeviceMgr** output) override { + *output = device_mgr_.get(); + return ::tensorflow::Status::OK(); + } void ExportCostModels(CostModelManager::CostModelMap* cost_models) { cost_model_manager_.ExportCostModels(cost_models); } private: - typedef DirectSession ME; - // We create one executor and its dependent library runtime for // every partition. struct PerPartitionExecutorsAndLib { diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc index 8010a58f6c7..097dab8406f 100644 --- a/tensorflow/core/common_runtime/direct_session_test.cc +++ b/tensorflow/core/common_runtime/direct_session_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/op_kernel.h" @@ -1248,6 +1249,16 @@ TEST(DirectSessionTest, TestDirectSessionReset) { EXPECT_EQ("Cancelled: Session has been closed.", s.ToString()); } +TEST(DirectSessionTest, LocalDeviceManager) { + SessionOptions options; + std::unique_ptr session(NewSession(options)); + + const DeviceMgr* mgr = nullptr; + TF_ASSERT_OK(session->LocalDeviceManager(&mgr)); + ASSERT_TRUE(mgr != nullptr); + EXPECT_GT(mgr->ListDevices().size(), 0); +} + // A simple benchmark for the overhead of `DirectSession::Run()` calls // with varying numbers of feeds/fetches. void FeedFetchBenchmarkHelper(int num_feeds, int iters) { diff --git a/tensorflow/core/public/session.h b/tensorflow/core/public/session.h index c1f097c7c68..7446fbdb514 100644 --- a/tensorflow/core/public/session.h +++ b/tensorflow/core/public/session.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/core/public/session_options.h" namespace tensorflow { +class DeviceMgr; /// \brief A Session instance lets a caller drive a TensorFlow graph /// computation. @@ -177,12 +178,24 @@ class Session { /// *response. This API is optional. If it is unimplemented, Status will /// return a corresponding error message, and *response will be unmodified. virtual Status ListDevices(std::vector* response) = 0; + /// \brief Closes this session. /// /// Closing a session releases the resources used by this session /// on the TensorFlow runtime (specified during session creation by /// the `SessionOptions::target` field). virtual Status Close() = 0; + + // NOTE(ashankar): As of July 2017, this is was a method added to + // faciliate some experimentation. Reconsider/re-evaluate after + // September 2017. 
+ // + // Sets `*output` to the `DeviceMgr` that owns accessible devices in the + // address-space of the caller. + virtual Status LocalDeviceManager(const DeviceMgr** output) { + return errors::Unimplemented( + "LocalDeviceManager is not supported for this session."); + } }; /// \brief Create a new session with the given options. From 02a5c34b1ececbd7e183a0f358e5605323735143 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 26 Jul 2017 03:48:28 -0700 Subject: [PATCH 53/56] New "Estimators" unit of Programmer's Guide. PiperOrigin-RevId: 163189681 --- .../docs_src/programmers_guide/estimators.md | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 tensorflow/docs_src/programmers_guide/estimators.md diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md new file mode 100644 index 00000000000..a5724ea294e --- /dev/null +++ b/tensorflow/docs_src/programmers_guide/estimators.md @@ -0,0 +1,153 @@ +# Estimators + +This document introduces **Estimators**--a high-level TensorFlow API that +greatly simplifies machine learning programming. Estimators encapsulate +the following actions: + +* training +* evaluation +* prediction +* export for serving + +You may either use the pre-made Estimators we provide or write your +own custom Estimators. All Estimators--whether pre-made or custom--are +classes based on the `tf.estimator.Estimator` class. + +Note: TensorFlow also provides an Estimator class at +`tf.contrib.learn.Estimator`, which you should not use. + + +## Advantages of Estimators + +Estimators provide the following benefits: + +* You can run Estimators-based models on a local host or on a + distributed multi-server environment without changing your model. + Furthermore, you can run Estimators-based models on CPUs, GPUs, + or TPUs without recoding your model. +* Estimators simplify sharing implementations between model developers. +* You can develop a state of the art model with high-level intuitive code, + In short, it is generally much easier to create models with Estimators + than with the low-level TensorFlow APIs. +* Estimators are themselves built on tf.layers, which + simplifies customization. +* Estimators build the graph for you. In other words, you don't have to + build the graph. +* Estimators provide a safe distributed training loop that controls how and + when to: + * build the graph + * initialize variables + * start queues + * handle exceptions + * create checkpoint files and recover from failures + * save summaries for TensorBoard + +When writing an application with Estimators, you must separate the data input +pipeline from the model. This separation simplifies experiments with +different data sets. + + +## Pre-made Estimators + +Pre-made Estimators enable you to work at a much higher conceptual level +than the base TensorFlow APIs. You no longer have to worry about creating +the computational graph or sessions since Estimators handle all +the "plumbing" for you. That is, pre-made Estimators create and manage +`Graph` and `Session` objects for you. Furthermore, pre-made Estimators +let you experiment with different model architectures by making only minimal +code changes. `DNNClassifier`, for example, is a pre-made Estimator class that +trains classification models through dense, feed-forward neural networks. + + +### Structure of a pre-made Estimators program + +A TensorFlow program relying on a pre-made Estimator typically consists +of the following four steps: + +1. 
**Write one or more dataset importing functions.** For example, you might + create one function to import the training set and another function to + import the test set. Each dataset importing function must return two + objects: + + * a dictionary in which the keys are feature column names and the + values are Tensors (or SparseTensors) containing the corresponding + feature data + * a Tensor containing one or more labels + + For example, the following code illustrates the basic skeleton for + an input function: + + def input_fn(dataset): + ... # manipulate dataset, extracting feature names and the label + return feature_dict, label + + See @{$datasets$Using the `Dataset` API for TensorFlow Input Pipelines} + for full details.) + +2. **Define the feature columns.** Each @{tf.feature_column} + identifies a feature name, its type, and any input pre-processing. + For example, the following snippet creates three feature + columns that hold integer or floating-point data. The first two + feature columns simply identify the feature's name and type. The + third feature column also specifies a lambda the program will invoke + to scale the raw data: + + # Define three numeric feature columns. + population = tf.feature_column.numeric_column('population') + crime_rate = tf.feature_column.numeric_column('crime_rate') + median_education = tf.feature_column.numeric_column('median_education', + normalizer_fn='lambda x: x - global_education_mean') + +3. **Instantiate the relevant pre-made Estimator.** For example, here's + a sample instantiation of a pre-made Estimator named `LinearClassifier`: + + # Instantiate an estimator, passing the feature columns. + estimator = tf.estimator.Estimator.LinearClassifier( + feature_columns=[population, crime_rate, median_education], + ) + +4. **Call a training, evaluation, or inference method.** + For example, all Estimators provide a `train` method, which trains a model. + + # my_training_set is the function created in Step 1 + estimator.train(input_fn=my_training_set, steps=2000) + + +### Benefits of pre-made Estimators + +Pre-made Estimators encode best practices, providing the following benefits: + +* Best practices for determining where different parts of the computational + graph should run, implementing strategies on a single machine or on a + cluster. +* Best practices for event (summary) writing and universally useful + summaries. + +If you don't use pre-made Estimators, you must implement the preceding +features yourself. + + +## Custom Estimators + +The heart of every Estimator--whether pre-made or custom--is its +**model function**, which is a method that builds graphs for training, +evaluation, and prediction. When you are using a pre-made Estimator, +someone else has already implemented the model function. When relying +on a custom Estimator, you must write the model function yourself. A +${$extend/estimators$companion document) +explains how to write the model function. + + +## Recommended workflow + +We recommend the following workflow: + +1. Assuming a suitable pre-made Estimator exists, use it to build your + first model and use its results to establish a baseline. +2. Build and test your overall pipeline, including the integrity and + reliability of your data with this pre-made Estimator. +3. If suitable alternative pre-made Estimators are available, run + experiments to determine which pre-made Estimator produces the + best results. +4. Possibly, further improve your model by building your own custom Estimator. 
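A minimal, hypothetical sketch tying together the four steps from "Structure of a pre-made Estimators program" above; the feature data, training parameters, and the use of `LinearClassifier` here are illustrative assumptions rather than part of this guide:

    import tensorflow as tf

    def input_fn():
      # Step 1: a dict of feature tensors plus a label tensor.
      features = {"population": tf.constant([[10.0], [20.0], [30.0], [40.0]])}
      labels = tf.constant([0, 0, 1, 1])
      return features, labels

    # Step 2: the feature columns.
    population = tf.feature_column.numeric_column("population")

    # Step 3: a pre-made Estimator.
    classifier = tf.estimator.LinearClassifier(feature_columns=[population])

    # Step 4: train, then evaluate on the same tiny input for brevity.
    classifier.train(input_fn=input_fn, steps=10)
    print(classifier.evaluate(input_fn=input_fn, steps=1))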
+ From 49495697cddef73a0dd870176dab488bb2a65520 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Wed, 26 Jul 2017 07:37:47 -0700 Subject: [PATCH 54/56] Replace local tag with requires-gpu tag for cuda and sycl tests PiperOrigin-RevId: 163205190 --- tensorflow/core/platform/default/build_config_root.bzl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl index fa4ac4ba73f..e1f123a0a01 100644 --- a/tensorflow/core/platform/default/build_config_root.bzl +++ b/tensorflow/core/platform/default/build_config_root.bzl @@ -3,10 +3,10 @@ # be separate to avoid cyclic references. def tf_cuda_tests_tags(): - return ["local"] + return ["requires-gpu"] def tf_sycl_tests_tags(): - return ["local"] + return ["requires-gpu"] def tf_additional_plugin_deps(): return select({ @@ -28,7 +28,7 @@ def tf_additional_verbs_deps(): "//tensorflow:with_verbs_support": [ "//tensorflow/contrib/verbs:verbs_server_lib", "//tensorflow/contrib/verbs:grpc_verbs_client", - ], + ], "//conditions:default": [], }) From 78a9b95436f45438abf3e818307f707e9ae92343 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 26 Jul 2017 08:35:19 -0700 Subject: [PATCH 55/56] [XLA] Finish normalizing fusion computations into standard computations PiperOrigin-RevId: 163210327 --- tensorflow/compiler/xla/service/BUILD | 1 + .../xla/service/algebraic_simplifier.cc | 3 + .../xla/service/batchnorm_rewriter.cc | 3 + .../compiler/xla/service/buffer_assignment.cc | 6 + .../compiler/xla/service/buffer_liveness.cc | 3 + .../compiler/xla/service/copy_insertion.cc | 3 + .../compiler/xla/service/cpu/cpu_compiler.cc | 9 + .../cpu/cpu_parallelization_preparation.cc | 3 + .../compiler/xla/service/gpu/fusion_merger.cc | 9 +- .../compiler/xla/service/gpu/hlo_schedule.cc | 3 +- .../xla/service/hlo_constant_folding.cc | 3 + .../xla/service/hlo_cost_analysis_test.cc | 76 +++---- tensorflow/compiler/xla/service/hlo_cse.cc | 3 + tensorflow/compiler/xla/service/hlo_dce.cc | 3 + .../compiler/xla/service/hlo_instruction.cc | 69 +++--- .../compiler/xla/service/hlo_instruction.h | 4 - .../xla/service/hlo_instruction_test.cc | 203 ++++++++++-------- .../compiler/xla/service/hlo_ordering.cc | 3 + .../xla/service/hlo_rematerialization.cc | 3 + .../compiler/xla/service/hlo_scheduling.cc | 4 + .../xla/service/instruction_fusion.cc | 11 +- .../compiler/xla/service/layout_assignment.cc | 5 + .../compiler/xla/service/name_uniquer.cc | 4 + .../xla/service/reduce_precision_insertion.cc | 3 + .../compiler/xla/service/reshape_mover.cc | 11 +- .../xla/service/reshape_mover_test.cc | 11 +- .../compiler/xla/service/transpose_folding.cc | 9 +- .../xla/service/tuple_points_to_analysis.cc | 6 + 28 files changed, 293 insertions(+), 181 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index a4612bb6c12..8fb0faf026b 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1210,6 +1210,7 @@ cc_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:lib", "//tensorflow/core:test_main", ], diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 4837402c15b..691f9f22964 100644 --- 
a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1586,6 +1586,9 @@ StatusOr AlgebraicSimplifier::Run(HloModule* module) { // module, invalidating iteration. std::vector computations; for (auto& comp : module->computations()) { + if (comp->IsFusionComputation()) { + continue; + } computations.push_back(comp.get()); } for (auto& comp : computations) { diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/batchnorm_rewriter.cc index 5d5d3caa2f6..ca2d413e11d 100644 --- a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc +++ b/tensorflow/compiler/xla/service/batchnorm_rewriter.cc @@ -268,6 +268,9 @@ StatusOr BatchNormRewriter::Run(HloModule* module) { // module, invalidating iteration. std::vector computations; for (auto& comp : module->computations()) { + if (comp->IsFusionComputation()) { + continue; + } computations.push_back(comp.get()); } for (auto& comp : computations) { diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index ddc3d11b7c7..ae31135a1ae 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -1219,6 +1219,9 @@ void BufferAssigner::BuildColocatedBufferSets( const TuplePointsToAnalysis& points_to_analysis = buffer_liveness.points_to_analysis(); for (const HloComputation* computation : module->MakeComputationPostOrder()) { + if (computation->IsFusionComputation()) { + continue; + } for (const HloInstruction* instruction : computation->MakeInstructionPostOrder()) { const HloOpcode opcode = instruction->opcode(); @@ -1386,6 +1389,9 @@ StatusOr> BufferAssigner::CreateAssignment( // their own BufferAllocation. for (auto* computation : thread_local_computations) { TF_RET_CHECK(computation != module->entry_computation()); + if (computation->IsFusionComputation()) { + continue; + } TF_RETURN_IF_ERROR(AssignBuffersForComputation( computation, module->config().debug_options(), /*is_thread_local=*/true, colocated_buffers, colocated_allocations, diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc index 6720a90ef85..f085ffa6bc4 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness.cc @@ -47,6 +47,9 @@ StatusOr> BufferLiveness::Run( tensorflow::Status BufferLiveness::Analyze() { TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_)); for (auto& computation : module_->computations()) { + if (computation->IsFusionComputation()) { + continue; + } // Gather all instructions whose buffers might alias other instructions into // the set aliased_buffers_. This includes those contained as a tuple // element in other instruction's output. diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index a3803c34ba7..c47abe9c62a 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -551,6 +551,9 @@ StatusOr CopyInsertion::Run(HloModule* module) { // Add copies of computation root instructions, if needed. 
FlatMap> while_body_read_only_indices; for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } VLOG(2) << "computation " << computation->name(); InstructionCopier root_copier(computation->root_instruction(), /*copy_users=*/{}); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index b86342d0b3e..59e8c75b916 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -519,6 +519,9 @@ StatusOr> CpuCompiler::Compile( new std::map()); for (auto embedded_computation : computation->MakeEmbeddedComputationsList()) { + if (embedded_computation->IsFusionComputation()) { + continue; + } auto parallel_computation_iter = parallel_computations.find(embedded_computation); // All parallel computations are considered to be an entry computation for @@ -591,6 +594,9 @@ StatusOr> CpuCompiler::Compile( for (auto embedded_computation : computation->MakeEmbeddedComputationsList()) { + if (embedded_computation->IsFusionComputation()) { + continue; + } TF_RETURN_IF_ERROR( ir_emitter .EmitComputation(embedded_computation, @@ -755,6 +761,9 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, HloComputation* computation = module->entry_computation(); for (auto embedded_computation : computation->MakeEmbeddedComputationsList()) { + if (embedded_computation->IsFusionComputation()) { + continue; + } TF_RETURN_IF_ERROR( ir_emitter .EmitComputation(embedded_computation, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc index af931f7b013..4d0e0f744ac 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc @@ -125,6 +125,9 @@ StatusOr ParallelizationPreparation::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(auto points_to_analysis, TuplePointsToAnalysis::Run(module)); for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } HloInstruction* root = computation->root_instruction(); // Copy root instruction if it does not define its own top-level buffer. // TODO(b/32885001) Remove these copies (at least for the unambiguous case). 
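Every pass touched by this patch guards its loop over module->computations() the same way, since fusion computations now live on the module like any other computation. A minimal sketch of that recurring idiom follows, assuming only the HloModule/HloComputation interface visible in the surrounding hunks; the snapshot step matters for passes such as FusionMerger and InstructionFusion below, which may add new computations to the module while they run, invalidating iteration:

  // Sketch only; mirrors the guard this patch threads through each HLO pass.
  std::vector<HloComputation*> computations;
  for (auto& computation : module->computations()) {
    if (computation->IsFusionComputation()) {
      continue;  // Fusion bodies are reached through their fusion instruction.
    }
    computations.push_back(computation.get());
  }
  // Iterate over the snapshot so that computations created while the pass
  // runs (e.g. newly added fusion computations) do not invalidate the loop.
  for (HloComputation* computation : computations) {
    // ... run the pass on `computation` ...
  }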
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc index e698646d180..a9ef204b46f 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc @@ -293,12 +293,19 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { StatusOr FusionMerger::Run(HloModule* module) { bool changed = false; VLOG(2) << "FusionMerger for module: " << module->name(); + std::vector computations; for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } + computations.push_back(computation.get()); + } + for (auto& computation : computations) { VLOG(1) << "Before running FusionInstructionMerger for computation: " << computation->name(); XLA_VLOG_LINES(3, computation->ToString()); - FusionInstructionMerger fusion_merger(computation.get()); + FusionInstructionMerger fusion_merger(computation); TF_RETURN_IF_ERROR(fusion_merger.Run()); changed |= fusion_merger.changed(); diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc index c61e47a93ce..81e905a0666 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc @@ -120,7 +120,8 @@ GpuHloOrdering::GpuHloOrdering( // do that yet since it's hard to ensure that the order here is the order used // by IrEmitterNested. And mismatched ordering bugs would be hard to find. for (auto& computation : module->computations()) { - if (computation.get() != module->entry_computation()) { + if (computation.get() != module->entry_computation() && + !computation->IsFusionComputation()) { predecessors_.emplace(computation.get(), computation->ComputeReachability()); } diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc index 804efdd906a..1a2eed5f602 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc @@ -42,6 +42,9 @@ StatusOr HloConstantFolding::Run(HloModule* module) { bool changed = false; for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } for (auto instruction : computation->MakeInstructionPostOrder()) { // Skip dead code. if (instruction->user_count() == 0 && diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc index f7456831651..0a288a77ada 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc @@ -31,6 +31,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/user_computation.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/compiler/xla/statusor.h" @@ -329,7 +330,7 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { EXPECT_EQ(conv_analysis.flop_count(), matmul_analysis.flop_count()); } -using FusionCostAnalysis = ::testing::Test; +using FusionCostAnalysis = HloTestBase; TEST_F(FusionCostAnalysis, LoopFusion) { // Do this 4 times with different per-second rates to test the computation of @@ -345,32 +346,32 @@ TEST_F(FusionCostAnalysis, LoopFusion) { // mul = Mul(exp, C3) // sub = Sub(mul, clamp) // tuple = Tuple({sub, sub, mul, C1}) - auto c1 = HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( - /*from=*/0.0f, /*to=*/1.0f, /*rows=*/2, /*cols=*/2)); - auto c2 = HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( - /*from=*/1.0f, /*to=*/2.0f, /*rows=*/2, /*cols=*/2)); - auto c3 = HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( - /*from=*/2.0f, /*to=*/3.0f, /*rows=*/2, /*cols=*/2)); + HloComputation::Builder builder(TestName()); + auto c1 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + /*from=*/0.0f, /*to=*/1.0f, /*rows=*/2, /*cols=*/2))); + auto c2 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + /*from=*/1.0f, /*to=*/2.0f, /*rows=*/2, /*cols=*/2))); + auto c3 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + /*from=*/2.0f, /*to=*/3.0f, /*rows=*/2, /*cols=*/2))); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, c1, c2)); + auto clamp = builder.AddInstruction( + HloInstruction::CreateTernary(r2f32, HloOpcode::kClamp, c2, add, add)); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(r2f32, HloOpcode::kExp, add)); + auto mul = builder.AddInstruction( + HloInstruction::CreateBinary(r2f32, HloOpcode::kMultiply, exp, c3)); + auto sub = builder.AddInstruction( + HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp)); + auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1}); - auto add = HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, c1.get(), - c2.get()); - auto clamp = HloInstruction::CreateTernary(r2f32, HloOpcode::kClamp, - c2.get(), add.get(), add.get()); - auto exp = HloInstruction::CreateUnary(r2f32, HloOpcode::kExp, add.get()); - auto mul = HloInstruction::CreateBinary(r2f32, HloOpcode::kMultiply, - exp.get(), c3.get()); - auto sub = HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, - mul.get(), clamp.get()); - auto tuple = HloInstruction::CreateTuple( - {sub.get(), sub.get(), mul.get(), c1.get()}); - - auto fusion = HloInstruction::CreateFusion( - r2f32, HloInstruction::FusionKind::kLoop, tuple.get()); - fusion->FuseInstruction(sub.get()); - fusion->FuseInstruction(mul.get()); - fusion->FuseInstruction(exp.get()); - fusion->FuseInstruction(clamp.get()); - fusion->FuseInstruction(add.get()); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); + auto* fusion = computation->CreateFusionInstruction( + {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop); // The time given these rates at i == 0 is exactly even among the properties // at 1.0 seconds. 
For other values, one of the rates is slower so that it @@ -398,18 +399,21 @@ TEST_F(FusionCostAnalysis, NoLayout) { Shape shape_without_layout = shape_with_layout; shape_without_layout.clear_layout(); - auto c1 = HloInstruction::CreateConstant( - Literal::CreateR4FromArray4D(Array4D(2, 3, 4, 5))); - auto c2 = HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3})); + HloComputation::Builder builder(TestName()); + auto c1 = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR4FromArray4D(Array4D(2, 3, 4, 5)))); + auto c2 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR1({1, 2, 3}))); - auto broadcast = - HloInstruction::CreateBroadcast(shape_without_layout, c2.get(), {1}); - auto add = HloInstruction::CreateBinary(shape_with_layout, HloOpcode::kAdd, - c1.get(), broadcast.get()); + auto broadcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(shape_without_layout, c2, {1})); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + shape_with_layout, HloOpcode::kAdd, c1, broadcast)); - auto fusion = HloInstruction::CreateFusion( - shape_with_layout, HloInstruction::FusionKind::kLoop, add.get()); - fusion->FuseInstruction(broadcast.get()); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); + auto* fusion = computation->CreateFusionInstruction( + {add, broadcast}, HloInstruction::FusionKind::kLoop); HloCostAnalysis fusion_analysis(ShapeSize); ASSERT_IS_OK(fusion->Accept(&fusion_analysis)); diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index 0fef89a06d0..690c084efb1 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -92,6 +92,9 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) { StatusOr HloCSE::Run(HloModule* module) { bool changed = false; for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } changed |= CombineConstants(computation.get(), is_layout_sensitive_); std::list post_order = diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc index 3755b9e4c00..5b2c57da4ff 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.cc +++ b/tensorflow/compiler/xla/service/hlo_dce.cc @@ -38,6 +38,9 @@ StatusOr HloDCE::Run(HloModule* module) { bool changed = false; for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } std::unordered_set live_instructions; TF_RETURN_IF_ERROR(computation->root_instruction()->Accept( [&live_instructions](HloInstruction* instruction) { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index f52882cca56..ed8a942d03a 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -560,19 +560,20 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( HloInstruction* instruction_to_fuse) { CHECK_EQ(opcode_, HloOpcode::kFusion); CHECK(instruction_to_fuse->IsFusable()); - + if (GetModule()) { + XLA_VLOG_LINES(1, GetModule()->ToString()); + } HloInstruction* clone = nullptr; - if (fused_instructions_computation_ == nullptr) { + if (called_computations_.empty()) { // New fusion instruction. 
auto builder = HloComputation::Builder("fused_computation", true); builder.AddInstruction(instruction_to_fuse->Clone(/*suffix=*/"")); - fused_instructions_computation_ = builder.Build(); + called_computations_.push_back( + CHECK_NOTNULL(GetModule())->AddEmbeddedComputation(builder.Build())); clone = fused_expression_root(); clone->parent_fusion_instruction_ = this; } else { - CHECK(fused_instructions_computation_ != nullptr && - fused_instructions_computation_->IsFusionComputation()); - clone = fused_instructions_computation_->AddInstruction( + clone = fused_instructions_computation()->AddInstruction( instruction_to_fuse->Clone(/*suffix=*/"")); clone->parent_fusion_instruction_ = this; // instruction_to_fuse is necessarily an operand of the fusion instruction. @@ -583,7 +584,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( CHECK(std::find(operands_.begin(), operands_.end(), instruction_to_fuse) != operands_.end()); const std::vector& fused_parameters_ = - fused_instructions_computation_->parameter_instructions(); + fused_instructions_computation()->parameter_instructions(); for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) { if (instruction_to_fuse == operands_[operand_num]) { // replace the fused parameter instruction's uses with the clone. @@ -593,7 +594,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( // Remove the corresponding fused parameter and operand from their // respective vectors. TF_CHECK_OK( - fused_instructions_computation_->RemoveParameter(operand_num)); + fused_instructions_computation()->RemoveParameter(operand_num)); operands_.erase(operands_.begin() + operand_num); break; } @@ -605,7 +606,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( // Reread the parameters in the computation. const std::vector& fused_parameters_ = - fused_instructions_computation_->parameter_instructions(); + fused_instructions_computation()->parameter_instructions(); // Add each operand of the clone as an operand of the fusion instruction. A // complication is that some clone operands may already be operands of the @@ -638,7 +639,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( CreateParameter(param_no, operand->shape(), param_name); param_instruction->parent_fusion_instruction_ = this; - fused_param = fused_instructions_computation_->AddParameter( + fused_param = fused_instructions_computation()->AddParameter( std::move(param_instruction)); AppendOperand(operand); } @@ -652,7 +653,6 @@ HloInstruction* HloInstruction::CloneAndFuseInternal( called_computations_.push_back(computation); } } - return clone; } @@ -663,17 +663,15 @@ RandomDistribution HloInstruction::random_distribution() const { void HloInstruction::CheckFusionInstruction() const { CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK(fused_instructions_computation_ != nullptr && - fused_instructions_computation_->IsFusionComputation()); const std::list>& fused_instructions_ = - fused_instructions_computation_->instructions(); + fused_instructions_computation()->instructions(); // All instructions owned by this fusion instruction must be fused, and the // parent fusion instruction of the fused instructions must be 'this'. 
for (auto& instruction : fused_instructions_) { CHECK(instruction->IsFused()); CHECK_EQ(this, instruction->fusion_instruction()); - CHECK_EQ(fused_instructions_computation_.get(), instruction->parent()) + CHECK_EQ(fused_instructions_computation(), instruction->parent()) << instruction->ToString(); } @@ -976,8 +974,6 @@ std::unique_ptr HloInstruction::CloneFusionWithNewOperands( const Shape& shape, tensorflow::gtl::ArraySlice operands) { CHECK_EQ(opcode_, HloOpcode::kFusion); CHECK(parent() != nullptr); - CHECK(fused_instructions_computation_ != nullptr && - fused_instructions_computation_->IsFusionComputation()); auto new_instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape)); @@ -992,9 +988,9 @@ std::unique_ptr HloInstruction::CloneFusionWithNewOperands( // fused instructions. std::vector new_fused_parameters; const std::vector& fused_parameters_ = - fused_instructions_computation_->parameter_instructions(); + fused_instructions_computation()->parameter_instructions(); const std::list>& fused_instructions_ = - fused_instructions_computation_->instructions(); + fused_instructions_computation()->instructions(); for (HloInstruction* old_fused_parameter : fused_parameters_) { new_fused_instructions.push_back(old_fused_parameter->Clone()); @@ -1028,7 +1024,7 @@ std::unique_ptr HloInstruction::CloneFusionWithNewOperands( } new_instruction->fusion_kind_ = fusion_kind_; auto computation_builder = HloComputation::Builder( - fused_instructions_computation_->name() + ".clone", true); + fused_instructions_computation()->name() + ".clone", true); // We iterated the fusion instructions in reverse post order which means // that we must reverse our new list of fusion instructions. for (auto new_fused_instruction_iter = new_fused_instructions.rbegin(); @@ -1037,8 +1033,10 @@ std::unique_ptr HloInstruction::CloneFusionWithNewOperands( computation_builder.AddInstruction(std::move(*new_fused_instruction_iter)); } auto fused_root_ = fused_expression_root(); - new_instruction->fused_instructions_computation_ = - computation_builder.Build(FindOrDie(old_to_new, fused_root_)); + new_instruction->called_computations_.push_back( + CHECK_NOTNULL(GetModule()) + ->AddEmbeddedComputation( + computation_builder.Build(FindOrDie(old_to_new, fused_root_)))); new_instruction->set_parent(parent()); new_instruction->CheckFusionInstruction(); return new_instruction; @@ -1769,7 +1767,10 @@ bool HloInstruction::IsFusable() const { HloComputation* HloInstruction::fused_instructions_computation() const { CHECK_EQ(opcode_, HloOpcode::kFusion); - return fused_instructions_computation_.get(); + CHECK(!called_computations_.empty()); + auto* fused_instructions_computation = called_computations_.front(); + CHECK(fused_instructions_computation->IsFusionComputation()); + return fused_instructions_computation; } HloInstruction* HloInstruction::fusion_instruction() const { @@ -1779,32 +1780,24 @@ HloInstruction* HloInstruction::fusion_instruction() const { HloInstruction* HloInstruction::fused_expression_root() const { CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK(fused_instructions_computation_ != nullptr && - fused_instructions_computation_->IsFusionComputation()); - return fused_instructions_computation_->root_instruction(); + return fused_instructions_computation()->root_instruction(); } HloInstruction* HloInstruction::fused_parameter(int64 parameter_number) const { CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK(fused_instructions_computation_ != nullptr && - fused_instructions_computation_->IsFusionComputation()); - 
return fused_instructions_computation_->parameter_instruction( + return fused_instructions_computation()->parameter_instruction( parameter_number); } const std::vector& HloInstruction::fused_parameters() const { CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK(fused_instructions_computation_ != nullptr && - fused_instructions_computation_->IsFusionComputation()); - return fused_instructions_computation_->parameter_instructions(); + return fused_instructions_computation()->parameter_instructions(); } const std::list>& HloInstruction::fused_instructions() const { CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK(fused_instructions_computation_ != nullptr && - fused_instructions_computation_->IsFusionComputation()); - return fused_instructions_computation_->instructions(); + return fused_instructions_computation()->instructions(); } HloInstruction::HloInstruction(HloOpcode opcode, const Shape& shape) @@ -2039,7 +2032,7 @@ static Status PostOrderDFS(HloInstruction* root, DfsHloVisitor* visitor, Status HloInstruction::Accept(DfsHloVisitor* visitor, bool call_finish_visit, bool ignore_control_predecessors) { - VLOG(2) << "HloInstruction::Accept(" << name() << ")"; + VLOG(3) << "HloInstruction::Accept(" << name() << ")"; TF_RETURN_IF_ERROR( PostOrderDFS(this, visitor, nullptr, ignore_control_predecessors)); if (call_finish_visit) { @@ -2055,8 +2048,11 @@ Status HloInstruction::AcceptWithOperandOrder( TF_RETURN_IF_ERROR(PostOrderDFS(this, visitor, &operand_order, /*ignore_control_predecessors=*/false)); if (call_finish_visit) { + VLOG(3) << "HloInstruction::AcceptWithOperandOrder BEFORE FINISH VISIT"; TF_RETURN_IF_ERROR(visitor->FinishVisit(this)); + VLOG(3) << "HloInstruction::AcceptWithOperandOrder AFTER FINISH VISIT"; } + VLOG(2) << "HloInstruction::AcceptWithOperandOrder EXIT"; return Status::OK(); } @@ -2458,6 +2454,7 @@ HloModule* HloInstruction::GetModule() const { } void HloInstruction::UniquifyName(NameUniquer* name_uniquer) { + string parent_str = parent() == nullptr ? "noparent" : parent()->name(); name_ = name_uniquer->GetUniqueName(name_); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index e2e77e5219c..3c188ec83f3 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -935,10 +935,6 @@ class HloInstruction { // padding of this pad instruction. Only set for pad instructions. std::unique_ptr padding_config_; - // The computation that stores of instructions fused into this fusion - // instruction. Only set for fusion instructions. - std::unique_ptr fused_instructions_computation_; - // If this instruction is fused into a fusion instruction, this field points // to the fusion instruction. HloInstruction* parent_fusion_instruction_ = nullptr; diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index bb1b477e139..5951c833dba 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -557,78 +557,89 @@ TEST_F(HloInstructionTest, PostProcessAllVisitedNodes) { } TEST_F(HloInstructionTest, SingletonFusionOp) { + HloComputation::Builder builder(TestName()); // Create a fusion instruction containing a single unary operation. 
- auto constant = - HloInstruction::CreateConstant(Literal::CreateR0(1.1f)); - auto exp = - HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant.get()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant)); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); + auto* fusion = computation->CreateFusionInstruction( + {exp}, HloInstruction::FusionKind::kLoop); - auto fusion = HloInstruction::CreateFusion( - r0f32_, HloInstruction::FusionKind::kLoop, exp.get()); - - EXPECT_THAT(fusion->operands(), ElementsAre(constant.get())); - EXPECT_THAT(constant->users(), UnorderedElementsAre(fusion.get(), exp.get())); + EXPECT_THAT(fusion->operands(), ElementsAre(constant)); + EXPECT_THAT(constant->users(), ElementsAre(fusion)); } TEST_F(HloInstructionTest, BinaryFusionOp) { + HloComputation::Builder builder(TestName()); // Create a fusion instruction containing a single binary operation. - auto constant1 = - HloInstruction::CreateConstant(Literal::CreateR0(1.1f)); - auto constant2 = - HloInstruction::CreateConstant(Literal::CreateR0(42.1f)); - auto add = HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, - constant1.get(), constant2.get()); + auto constant1 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + auto constant2 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42.1f))); + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + r0f32_, HloOpcode::kAdd, constant1, constant2)); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); + auto* fusion = computation->CreateFusionInstruction( + {add}, HloInstruction::FusionKind::kLoop); - auto fusion = HloInstruction::CreateFusion( - r0f32_, HloInstruction::FusionKind::kLoop, add.get()); - - EXPECT_THAT(fusion->operands(), - ElementsAre(constant1.get(), constant2.get())); - EXPECT_THAT(constant1->users(), - UnorderedElementsAre(fusion.get(), add.get())); - EXPECT_THAT(constant2->users(), - UnorderedElementsAre(fusion.get(), add.get())); + EXPECT_THAT(fusion->operands(), ElementsAre(constant1, constant2)); + EXPECT_THAT(constant1->users(), ElementsAre(fusion)); + EXPECT_THAT(constant2->users(), ElementsAre(fusion)); } TEST_F(HloInstructionTest, ChainFusionOp) { + HloComputation::Builder builder(TestName()); // Create a chain of fused unary ops. 
- auto constant = - HloInstruction::CreateConstant(Literal::CreateR0(1.1f)); - auto exp1 = - HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant.get()); - auto exp2 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1.get()); - auto exp3 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2.get()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + auto exp1 = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant)); + auto exp2 = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1)); + auto exp3 = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2)); - auto fusion = HloInstruction::CreateFusion( - r0f32_, HloInstruction::FusionKind::kLoop, exp3.get()); - fusion->FuseInstruction(exp2.get()); - fusion->FuseInstruction(exp1.get()); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); + auto* fusion = computation->CreateFusionInstruction( + {exp3, exp2, exp1}, HloInstruction::FusionKind::kLoop); - EXPECT_THAT(fusion->operands(), ElementsAre(constant.get())); - EXPECT_THAT(constant->users(), - UnorderedElementsAre(fusion.get(), exp1.get())); + EXPECT_THAT(fusion->operands(), ElementsAre(constant)); + EXPECT_THAT(constant->users(), ElementsAre(fusion)); } TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) { + HloComputation::Builder builder(TestName()); // Create a chain of fused unary ops. - auto constant = - HloInstruction::CreateConstant(Literal::CreateR0(1.1f)); - auto exp1 = - HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant.get()); - auto exp2 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1.get()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + auto exp1 = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant)); + auto exp2 = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1)); OpMetadata metadata; metadata.set_op_name("tf_op"); exp1->set_metadata(metadata); exp2->set_metadata(metadata); - auto fusion = HloInstruction::CreateFusion( - r0f32_, HloInstruction::FusionKind::kLoop, exp2.get()); - auto* fused = fusion->FuseInstruction(exp1.get()); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); + auto* fusion = computation->CreateFusionInstruction( + {exp2, exp1}, HloInstruction::FusionKind::kLoop); + EXPECT_TRUE(protobuf_util::ProtobufEquals(metadata, fusion->metadata())); - EXPECT_TRUE(protobuf_util::ProtobufEquals(metadata, fused->metadata())); + EXPECT_TRUE(protobuf_util::ProtobufEquals( + metadata, fusion->fused_expression_root()->metadata())); + EXPECT_TRUE(protobuf_util::ProtobufEquals( + metadata, fusion->fused_expression_root()->operand(0)->metadata())); } TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { + HloComputation::Builder builder(TestName()); // Create a fusion instruction containing a single unary operation. 
const Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); @@ -642,33 +653,36 @@ TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { std::unique_ptr computation_x = make_map_computation(); std::unique_ptr computation_y = make_map_computation(); - auto constant = - HloInstruction::CreateConstant(Literal::CreateR0(1.1f)); - auto map_1_x = - HloInstruction::CreateMap(scalar_shape, {constant.get()}, - computation_x.get(), /*static_operands=*/{}); - auto map_2_x = - HloInstruction::CreateMap(scalar_shape, {map_1_x.get()}, - computation_x.get(), /*static_operands=*/{}); - auto map_3_y = - HloInstruction::CreateMap(scalar_shape, {map_2_x.get()}, - computation_y.get(), /*static_operands=*/{}); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + auto map_1_x = builder.AddInstruction(HloInstruction::CreateMap( + scalar_shape, {constant}, computation_x.get(), /*static_operands=*/{})); + auto map_2_x = builder.AddInstruction(HloInstruction::CreateMap( + scalar_shape, {map_1_x}, computation_x.get(), /*static_operands=*/{})); + auto map_3_y = builder.AddInstruction(HloInstruction::CreateMap( + scalar_shape, {map_2_x}, computation_y.get(), /*static_operands=*/{})); - auto fusion = HloInstruction::CreateFusion( - scalar_shape, HloInstruction::FusionKind::kLoop, map_3_y.get()); - - EXPECT_THAT(fusion->called_computations(), ElementsAre(computation_y.get())); - - fusion->FuseInstruction(map_2_x.get()); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); + auto* fusion = computation->CreateFusionInstruction( + {map_3_y}, HloInstruction::FusionKind::kLoop); + auto* fused_computation = fusion->fused_instructions_computation(); EXPECT_THAT(fusion->called_computations(), - ElementsAre(computation_y.get(), computation_x.get())); + ElementsAre(fused_computation, computation_y.get())); - fusion->FuseInstruction(map_1_x.get()); - EXPECT_THAT(fusion->called_computations(), - ElementsAre(computation_y.get(), computation_x.get())); + fusion->FuseInstruction(map_2_x); + EXPECT_THAT( + fusion->called_computations(), + ElementsAre(fused_computation, computation_y.get(), computation_x.get())); + + fusion->FuseInstruction(map_1_x); + EXPECT_THAT( + fusion->called_computations(), + ElementsAre(fused_computation, computation_y.get(), computation_x.get())); } TEST_F(HloInstructionTest, ComplexFusionOp) { + HloComputation::Builder builder(TestName()); // Fuse all instructions in complicated expression: // // add = Add(C1, C2) @@ -680,35 +694,35 @@ TEST_F(HloInstructionTest, ComplexFusionOp) { // // Notable complexities are repeated operands in a same instruction, different // shapes, use of value in different expressions. 
- auto c1 = HloInstruction::CreateConstant(Literal::CreateR0(1.1f)); - auto c2 = HloInstruction::CreateConstant(Literal::CreateR0(2.1f)); - auto c3 = HloInstruction::CreateConstant(Literal::CreateR0(9.0f)); + auto c1 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); + auto c2 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(2.1f))); + auto c3 = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(9.0f))); - auto add = - HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c1.get(), c2.get()); - auto clamp = HloInstruction::CreateTernary(r0f32_, HloOpcode::kClamp, - c2.get(), add.get(), add.get()); - auto exp = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, add.get()); - auto mul = HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply, - exp.get(), c3.get()); - auto sub = HloInstruction::CreateBinary(r0f32_, HloOpcode::kSubtract, - mul.get(), clamp.get()); + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c1, c2)); + auto clamp = builder.AddInstruction( + HloInstruction::CreateTernary(r0f32_, HloOpcode::kClamp, c2, add, add)); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, add)); + auto mul = builder.AddInstruction( + HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply, exp, c3)); + auto sub = builder.AddInstruction( + HloInstruction::CreateBinary(r0f32_, HloOpcode::kSubtract, mul, clamp)); auto tuple = - HloInstruction::CreateTuple({sub.get(), sub.get(), mul.get(), c1.get()}); + builder.AddInstruction(HloInstruction::CreateTuple({sub, sub, mul, c1})); - auto fusion = HloInstruction::CreateFusion( - r0f32_, HloInstruction::FusionKind::kLoop, tuple.get()); - fusion->FuseInstruction(sub.get()); - fusion->FuseInstruction(mul.get()); - fusion->FuseInstruction(exp.get()); - fusion->FuseInstruction(clamp.get()); - fusion->FuseInstruction(add.get()); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); + auto* fusion = computation->CreateFusionInstruction( + {tuple, sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop); // Operands in the fusion instruction's operands() vector should be in the // order in which their users were added fused. 
- EXPECT_THAT(fusion->operands(), ElementsAre(c1.get(), c3.get(), c2.get())); - EXPECT_THAT(c1->users(), - UnorderedElementsAre(add.get(), tuple.get(), fusion.get())); + EXPECT_THAT(fusion->operands(), ElementsAre(c1, c3, c2)); + EXPECT_THAT(c1->users(), ElementsAre(fusion)); } // Convenience function for comparing two HloInstructions inside of @@ -864,7 +878,8 @@ TEST_F(HloInstructionTest, PartiallyElementwise) { HloInstruction* max = builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kMaximum, div, broadcast)); - auto computation = builder.Build(); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {max, broadcast, div, mul}, HloInstruction::FusionKind::kLoop); EXPECT_FALSE(fusion->IsElementwise()); @@ -906,7 +921,8 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) { HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary( r1f32, HloOpcode::kSubtract, min, broadcast)); - auto computation = builder.Build(); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {sub, broadcast, min}, HloInstruction::FusionKind::kLoop); EXPECT_FALSE(fusion->IsElementwise()); @@ -945,7 +961,8 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) { HloInstruction* dot = builder.AddInstruction( HloInstruction::CreateBinary(sout, HloOpcode::kDot, x, reshape)); - auto computation = builder.Build(); + HloModule module(TestName()); + auto* computation = module.AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {dot, reshape}, HloInstruction::FusionKind::kTransposeDot); diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index 7230682d0b1..4c3ff3bdafc 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -183,6 +183,9 @@ DependencyHloOrdering::DependencyHloOrdering(const HloModule* module) // ordering based on dependencies. ExecutesBefore will return true iff there // exists a path in the HLO computation graph from 'a' to 'b'. for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } predecessors_.emplace(computation.get(), computation->ComputeReachability()); } diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index d19e8034acd..fd08796e503 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -1202,6 +1202,9 @@ StatusOr HloRematerialization::Run( // After DCE, the module sequence may include instructions which no longer // exist. 
for (const auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } if (sequence->at(computation.get()).size() != computation->instruction_count()) { // A size mismatch between the computation instruction count and the size diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc index 17f55f9cfb1..922236ee1e7 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc @@ -400,6 +400,9 @@ CreateMemoryMinimizingSequence( TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, TuplePointsToAnalysis::Run(&module)); for (const auto& computation : module.computations()) { + if (computation->IsFusionComputation()) { + continue; + } TF_ASSIGN_OR_RETURN(sequence[computation.get()], CreateMemoryMinimizingSequence( *computation, *points_to_analysis, size_function)); @@ -410,6 +413,7 @@ CreateMemoryMinimizingSequence( StatusOr> CreateMemoryMinimizingSequence( const HloComputation& computation, const LogicalBuffer::SizeFunction& size_function) { + CHECK(!computation.IsFusionComputation()); TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, TuplePointsToAnalysis::Run(computation.parent())); return CreateMemoryMinimizingSequence(computation, *points_to_analysis, diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 482ab9b94ae..24af07bd4bf 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -211,8 +211,17 @@ bool InstructionFusion::CanFuseOnAllPaths( StatusOr InstructionFusion::Run(HloModule* module) { bool changed = false; + + std::vector computations; for (auto& computation : module->computations()) { - computation_ = computation.get(); + if (computation->IsFusionComputation()) { + continue; + } + computations.push_back(computation.get()); + } + for (auto& computation : computations) { + CHECK(!computation->IsFusionComputation()); + computation_ = computation; // We want to be able to remove arbitrary instructions from the post order // and also compare positions of instructions in the post order. To make diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index aafface0b9f..7d41be94ce9 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -611,6 +611,9 @@ Status CheckLayouts( TF_ASSIGN_OR_RETURN(auto points_to_analysis, TuplePointsToAnalysis::Run(module)); for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } for (auto& instruction : computation->instructions()) { // Verify every instruction has a layout and the layout is valid for the // shape. 
@@ -1356,6 +1359,8 @@ StatusOr LayoutAssignment::Run(HloModule* module) { if (computation == module->entry_computation()) { TF_RETURN_IF_ERROR(RunOnComputation(*entry_computation_layout_, module->entry_computation())); + } else if (computation->IsFusionComputation()) { + continue; } else { ComputationLayout computation_layout(computation->ComputeProgramShape()); // Setting all embedded computations to the default layout is potentially diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc index 4014856b9b2..069f85af721 100644 --- a/tensorflow/compiler/xla/service/name_uniquer.cc +++ b/tensorflow/compiler/xla/service/name_uniquer.cc @@ -29,7 +29,11 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) { return root; } else { tensorflow::strings::StrAppend(&root, separator_, *count); + // Increment lookup under old 'root' name. (*count)++; + // Initialize count under new 'root' name. + count = &(generated_names_[root]); + *count = 1; return root; } } diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc index e083226b14d..9f12471ffd7 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc @@ -26,6 +26,9 @@ StatusOr ReducePrecisionInsertion::Run(HloModule* module) { VLOG(1) << "Running ReducePrecisionInsertion pass on " << module->name(); for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } std::vector instructions_to_suffix; for (auto& instruction : computation->instructions()) { diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc index 2d35ba5e548..1c648d58c7f 100644 --- a/tensorflow/compiler/xla/service/reshape_mover.cc +++ b/tensorflow/compiler/xla/service/reshape_mover.cc @@ -312,10 +312,17 @@ StatusOr TrySinkReshapeOrTranspose(HloComputation* computation, StatusOr ReshapeMover::Run(HloModule* module) { bool changed = false; - for (const auto& comp : module->computations()) { + std::vector computations; + for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } + computations.push_back(computation.get()); + } + for (const auto& comp : computations) { for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) { TF_ASSIGN_OR_RETURN(bool did_change, - TrySinkReshapeOrTranspose(comp.get(), instruction)); + TrySinkReshapeOrTranspose(comp, instruction)); changed |= did_change; } } diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc index 49c17555202..1589d52a256 100644 --- a/tensorflow/compiler/xla/service/reshape_mover_test.cc +++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc @@ -351,16 +351,15 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( root_shape, HloOpcode::kAdd, reshape0, reshape1)); - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build()); - auto fusion = computation->AddInstruction(HloInstruction::CreateFusion( - add->shape(), HloInstruction::FusionKind::kLoop, add)); - TF_CHECK_OK(computation->ReplaceInstruction(add, fusion)); + HloModule module(TestName()); + auto computation = module.AddEntryComputation(builder.Build()); + 
computation->CreateFusionInstruction({add}, + HloInstruction::FusionKind::kLoop); EXPECT_THAT(computation->root_instruction(), op::Fusion(op::Reshape(param0), op::Reshape(param1))); - EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(ReshapeMover().Run(&module).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Fusion(param0, param1))); diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc index a0c88c6bbc2..58583357360 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.cc +++ b/tensorflow/compiler/xla/service/transpose_folding.cc @@ -172,7 +172,14 @@ StatusOr TransposeFolding::Run(HloModule* module) { return tensorflow::Status::OK(); }; - for (auto& comp : module->computations()) { + std::vector computations; + for (auto& computation : module->computations()) { + if (computation->IsFusionComputation()) { + continue; + } + computations.push_back(computation.get()); + } + for (auto& comp : computations) { TF_RETURN_IF_ERROR(comp->Accept(visit_fn)); } diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 182e99cf1ca..3c4dc19aefa 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -135,6 +135,9 @@ TuplePointsToAnalysis::Run(const HloModule* module) { Status TuplePointsToAnalysis::Analyze() { points_to_.clear(); for (auto& computation : module_->computations()) { + if (computation->IsFusionComputation()) { + continue; + } TF_RETURN_IF_ERROR(computation->Accept(this)); TF_RETURN_IF_ERROR( PopulateDefinedBuffersAndAliases(computation->instructions())); @@ -451,6 +454,9 @@ string TuplePointsToAnalysis::ToString() const { string output = tensorflow::strings::Printf( "TuplePointsToSet for module %s:\n", module_->name().c_str()); for (const auto& computation : module_->computations()) { + if (computation->IsFusionComputation()) { + continue; + } const char* entry = computation.get() == module_->entry_computation() ? "entry " : ""; tensorflow::strings::StrAppend(&output, entry, "computation ", From 57e38b03adcac52dedbe6bf350f8fa916f73df6d Mon Sep 17 00:00:00 2001 From: "Joshua V. Dillon" Date: Wed, 26 Jul 2017 09:03:08 -0700 Subject: [PATCH 56/56] Create new tf.learn.estimator.head which is based on a tf.Distribution instance. 
PiperOrigin-RevId: 163213141 --- tensorflow/contrib/bayesflow/BUILD | 8 +- .../python/kernel_tests/entropy_test.py | 43 ++-- tensorflow/contrib/distributions/BUILD | 25 ++- tensorflow/contrib/distributions/__init__.py | 2 + .../python/kernel_tests/estimator_test.py | 114 +++++++++++ .../distributions/python/ops/estimator.py | 185 ++++++++++++++++++ .../python/ops/mvn_linear_operator.py | 4 +- .../learn/python/learn/estimators/head.py | 8 +- 8 files changed, 358 insertions(+), 31 deletions(-) create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py create mode 100644 tensorflow/contrib/distributions/python/ops/estimator.py diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD index 1cd6e64b32e..d324c7d0d09 100644 --- a/tensorflow/contrib/bayesflow/BUILD +++ b/tensorflow/contrib/bayesflow/BUILD @@ -35,23 +35,20 @@ py_library( cuda_py_test( name = "csiszar_divergence_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/csiszar_divergence_test.py"], additional_deps = [ ":bayesflow_py", "//third_party/py/numpy", "//tensorflow/contrib/distributions:distributions_py", - "//tensorflow/contrib/layers:layers_py", "//tensorflow/python/ops/distributions", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", "//tensorflow/python:gradients", "//tensorflow/python:linalg_ops", "//tensorflow/python:math_ops", "//tensorflow/python:nn_ops", - "//tensorflow/python:platform_test", ], ) @@ -84,12 +81,11 @@ cuda_py_test( "//third_party/py/numpy", "//tensorflow/contrib/distributions:distributions_py", "//tensorflow/contrib/layers:layers_py", + "//tensorflow/python/ops/distributions", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", "//tensorflow/python:math_ops", "//tensorflow/python:nn_ops", - "//tensorflow/python:platform_test", "//tensorflow/python:variables", ], ) diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py index 6cdaa318705..0bd12b84d12 100644 --- a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py +++ b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py @@ -20,22 +20,24 @@ from __future__ import print_function import numpy as np -from tensorflow.contrib import distributions as distributions_lib from tensorflow.contrib import layers as layers_lib -from tensorflow.contrib.bayesflow.python.ops import entropy_impl as entropy_lib +from tensorflow.contrib.bayesflow.python.ops import entropy_impl as entropy +from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_diag_lib +from tensorflow.contrib.distributions.python.ops import mvn_tril as mvn_tril_lib from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import variables +from tensorflow.python.ops.distributions import kullback_leibler as kullback_leibler_lib +from tensorflow.python.ops.distributions import normal as normal_lib +from tensorflow.python.ops.distributions import util as distribution_util from tensorflow.python.platform import test -distributions = distributions_lib layers = layers_lib -entropy = entropy_lib -class NormalNoEntropy(distributions.Normal): # pylint: 
disable=no-init +class NormalNoEntropy(normal_lib.Normal): # pylint: disable=no-init """Normal distribution without a `.entropy` method.""" def entropy(self): @@ -81,10 +83,10 @@ class ElboRatioTest(test.TestCase): n_samples = 5000 with self.test_session(): - q = distributions.MultivariateNormalDiag( + q = mvn_diag_lib.MultivariateNormalDiag( loc=self._rng.rand(*vector_shape), scale_diag=self._rng.rand(*vector_shape)) - p = distributions.MultivariateNormalDiag( + p = mvn_diag_lib.MultivariateNormalDiag( loc=self._rng.rand(*vector_shape), scale_diag=self._rng.rand(*vector_shape)) @@ -95,7 +97,7 @@ class ElboRatioTest(test.TestCase): n=n_samples, form=entropy.ELBOForms.sample, seed=42) - actual_kl = distributions.kl_divergence(q, p) + actual_kl = kullback_leibler_lib.kl_divergence(q, p) # Relative tolerance (rtol) chosen 2 times as large as minimim needed to # pass. @@ -109,10 +111,10 @@ class ElboRatioTest(test.TestCase): vector_shape = (2, 3) with self.test_session(): - q = distributions.MultivariateNormalDiag( + q = mvn_diag_lib.MultivariateNormalDiag( loc=self._rng.rand(*vector_shape), scale_diag=self._rng.rand(*vector_shape)) - p = distributions.MultivariateNormalDiag( + p = mvn_diag_lib.MultivariateNormalDiag( loc=self._rng.rand(*vector_shape), scale_diag=self._rng.rand(*vector_shape)) @@ -123,7 +125,7 @@ class ElboRatioTest(test.TestCase): n=n_samples, form=entropy.ELBOForms.analytic_entropy, seed=42) - actual_kl = distributions.kl_divergence(q, p) + actual_kl = kullback_leibler_lib.kl_divergence(q, p) # Relative tolerance (rtol) chosen 2 times as large as minimim needed to # pass. @@ -135,7 +137,7 @@ class ElboRatioTest(test.TestCase): vector_shape = (2, 3) with self.test_session(): - q = distributions.MultivariateNormalDiag( + q = mvn_diag_lib.MultivariateNormalDiag( loc=self._rng.rand(*vector_shape), scale_diag=self._rng.rand(*vector_shape)) @@ -155,7 +157,7 @@ class EntropyShannonTest(test.TestCase): def test_normal_entropy_default_form_uses_exact_entropy(self): with self.test_session(): - dist = distributions.Normal(loc=1.11, scale=2.22) + dist = normal_lib.Normal(loc=1.11, scale=2.22) mc_entropy = entropy.entropy_shannon(dist, n=11) exact_entropy = dist.entropy() self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape()) @@ -163,7 +165,7 @@ class EntropyShannonTest(test.TestCase): def test_normal_entropy_analytic_form_uses_exact_entropy(self): with self.test_session(): - dist = distributions.Normal(loc=1.11, scale=2.22) + dist = normal_lib.Normal(loc=1.11, scale=2.22) mc_entropy = entropy.entropy_shannon( dist, form=entropy.ELBOForms.analytic_entropy) exact_entropy = dist.entropy() @@ -173,7 +175,7 @@ class EntropyShannonTest(test.TestCase): def test_normal_entropy_sample_form_gets_approximate_answer(self): # Tested by showing we get a good answer that is not exact. with self.test_session(): - dist = distributions.Normal(loc=1.11, scale=2.22) + dist = normal_lib.Normal(loc=1.11, scale=2.22) mc_entropy = entropy.entropy_shannon( dist, n=1000, form=entropy.ELBOForms.sample, seed=0) exact_entropy = dist.entropy() @@ -193,7 +195,7 @@ class EntropyShannonTest(test.TestCase): # NormalNoEntropy is like a Normal, but does not have .entropy method, so # we are forced to fall back on sample entropy. 
dist_no_entropy = NormalNoEntropy(loc=1.11, scale=2.22) - dist_yes_entropy = distributions.Normal(loc=1.11, scale=2.22) + dist_yes_entropy = normal_lib.Normal(loc=1.11, scale=2.22) mc_entropy = entropy.entropy_shannon( dist_no_entropy, n=1000, form=entropy.ELBOForms.sample, seed=0) @@ -222,15 +224,16 @@ class RenyiRatioTest(test.TestCase): mu_true = np.array([1.0, -1.0], dtype=np.float64) chol_true = np.array([[2.0, 0.0], [0.5, 1.0]], dtype=np.float64) with self.test_session() as sess: - target = distributions.MultivariateNormalTriL(mu_true, chol_true) + target = mvn_tril_lib.MultivariateNormalTriL(mu_true, chol_true) # Set up q distribution by defining mean/covariance as Variables mu = variables.Variable( np.zeros(mu_true.shape), dtype=mu_true.dtype, name='mu') mat = variables.Variable( np.zeros(chol_true.shape), dtype=chol_true.dtype, name='mat') - chol = distributions.matrix_diag_transform(mat, transform=nn_ops.softplus) - q = distributions.MultivariateNormalTriL(mu, chol) + chol = distribution_util.matrix_diag_transform( + mat, transform=nn_ops.softplus) + q = mvn_tril_lib.MultivariateNormalTriL(mu, chol) for alpha in [0.25, 0.75]: negative_renyi_divergence = entropy.renyi_ratio( @@ -262,7 +265,7 @@ class RenyiRatioTest(test.TestCase): n = 1000 vector_shape = (2, 3) with self.test_session(): - q = distributions.MultivariateNormalDiag( + q = mvn_diag_lib.MultivariateNormalDiag( loc=self._rng.rand(*vector_shape), scale_diag=self._rng.rand(*vector_shape)) for alpha in [0.25, 0.75]: diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index e29456f4e6d..de1956750d8 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -36,27 +36,48 @@ py_library( srcs_version = "PY2AND3", deps = [ ":bijectors_py", + "//tensorflow/contrib/framework:framework_py", + "//tensorflow/contrib/learn", "//tensorflow/contrib/linalg:linalg_py", "//tensorflow/python:array_ops", "//tensorflow/python:check_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:clip_ops", "//tensorflow/python:control_flow_ops", "//tensorflow/python:data_flow_ops", "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:init_ops", "//tensorflow/python:linalg_ops", "//tensorflow/python:math_ops", "//tensorflow/python:nn", "//tensorflow/python:nn_ops", "//tensorflow/python:random_ops", + "//tensorflow/python:state_ops", "//tensorflow/python:tensor_util", "//tensorflow/python:util", + "//tensorflow/python:variable_scope", "//tensorflow/python/ops/distributions", "//third_party/py/numpy", "@six_archive//:six", ], ) +cuda_py_test( + name = "estimator_test", + size = "small", + srcs = ["python/kernel_tests/estimator_test.py"], + additional_deps = [ + ":distributions_py", + "//third_party/py/numpy", + "@six_archive//:six", + "//tensorflow/contrib/learn", + "//tensorflow/contrib/learn:head_test", + "//tensorflow/python/ops/distributions", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_ops", + "//tensorflow/python:nn_ops", + "//tensorflow/python:session", + ], +) + cuda_py_test( name = "distribution_test", size = "small", diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py index f4ebb3cb595..7a2aebddd25 100644 --- a/tensorflow/contrib/distributions/__init__.py +++ b/tensorflow/contrib/distributions/__init__.py @@ -30,6 +30,7 @@ from tensorflow.contrib.distributions.python.ops.conditional_transformed_distrib from 
tensorflow.contrib.distributions.python.ops.deterministic import * from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse +from tensorflow.contrib.distributions.python.ops.estimator import * from tensorflow.contrib.distributions.python.ops.geometric import * from tensorflow.contrib.distributions.python.ops.inverse_gamma import * from tensorflow.contrib.distributions.python.ops.logistic import * @@ -147,6 +148,7 @@ _allowed_symbols = [ 'percentile', 'assign_exponential_moving_mean_variance', 'exponential_moving_mean_variance', + 'estimator_head_distribution_regression', ] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py b/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py new file mode 100644 index 00000000000..5ff0544c977 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py @@ -0,0 +1,114 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for estimator.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import six + +from tensorflow.contrib.distributions.python.ops import estimator as estimator_lib +from tensorflow.contrib.learn.python.learn.estimators import constants +from tensorflow.contrib.learn.python.learn.estimators import head as head_lib +from tensorflow.contrib.learn.python.learn.estimators import model_fn +from tensorflow.contrib.learn.python.learn.estimators.head_test import _assert_metrics +from tensorflow.contrib.learn.python.learn.estimators.head_test import _assert_no_variables +from tensorflow.contrib.learn.python.learn.estimators.head_test import _assert_summary_tags +from tensorflow.python.client import session +from tensorflow.python.framework import ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops.distributions import normal as normal_lib +from tensorflow.python.platform import test + + +class EstimatorHeadDistributionRegressionTest(test.TestCase): + + def _assert_output_alternatives(self, model_fn_ops): + self.assertEquals({ + None: constants.ProblemType.LINEAR_REGRESSION + }, { + k: v[0] for k, v in six.iteritems(model_fn_ops.output_alternatives) + }) + + def testNormalLocScaleLogits(self): + # We will bias logits[..., 1] so that: logits[..., 1]=0 implies scale=1. + scale_bias = np.log(np.expm1(1.)) + + def softplus(x): + return np.log1p(np.exp(x)) + + def actual_loss(logits, labels): + mu = actual_mean(logits) + sigma = actual_stddev(logits) + labels = np.squeeze(labels, -1) + z = (labels - mu) / sigma + loss = 0.5 * (z**2. + np.log(2. 
* np.pi)) + np.log(sigma) + return loss.mean() + + def actual_mean(logits): + return logits[..., 0] + + def actual_stddev(logits): + return softplus(logits[..., 1] + scale_bias) + + def make_distribution_fn(logits): + return normal_lib.Normal( + loc=logits[..., 0], + scale=nn_ops.softplus(logits[..., 1] + scale_bias)) + + head = estimator_lib.estimator_head_distribution_regression( + make_distribution_fn, + logits_dimension=2) + labels = np.float32([[-1.], + [0.], + [1.]]) + logits = np.float32([[0., -1], + [1, 0.5], + [-1, 1]]) + with ops.Graph().as_default(), session.Session(): + # Convert to tensor so we can index into head.distributions. + tflogits = ops.convert_to_tensor(logits, name="logits") + model_fn_ops = head.create_model_fn_ops( + {}, + labels=labels, + mode=model_fn.ModeKeys.TRAIN, + train_op_fn=head_lib.no_op_train_fn, + logits=tflogits) + self._assert_output_alternatives(model_fn_ops) + _assert_summary_tags(self, ["loss"]) + _assert_no_variables(self) + loss = actual_loss(logits, labels) + _assert_metrics(self, loss, {"loss": loss}, model_fn_ops) + + # Now we verify the underlying distribution was correctly constructed. + expected_mean = logits[..., 0] + self.assertAllClose( + expected_mean, + head.distribution(tflogits).mean().eval(), + rtol=1e-6, atol=0.) + + expected_stddev = softplus(logits[..., 1] + scale_bias) + self.assertAllClose( + expected_stddev, + head.distribution(tflogits).stddev().eval(), + rtol=1e-6, atol=0.) + # Should have created only one distribution. + self.assertEqual(1, len(head.distributions)) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distributions/python/ops/estimator.py b/tensorflow/contrib/distributions/python/ops/estimator.py new file mode 100644 index 00000000000..6b53338c454 --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/estimator.py @@ -0,0 +1,185 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions to bridge `Distribution`s and `tf.contrib.learn.estimator` APIs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.learn.python.learn.estimators.head import _compute_weighted_loss +from tensorflow.contrib.learn.python.learn.estimators.head import _RegressionHead +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import array_ops + + +__all__ = [ + "estimator_head_distribution_regression", +] + + +def estimator_head_distribution_regression(make_distribution_fn, + label_dimension=1, + logits_dimension=None, + label_name=None, + weight_column_name=None, + enable_centered_bias=False, + head_name=None): + """Creates a `Head` for regression under a generic distribution. 
+
+  Args:
+    make_distribution_fn: Python `callable` which returns a `tf.Distribution`
+      instance created using only logits.
+    label_dimension: Number of regression labels per example. This is the size
+      of the last dimension of the labels `Tensor` (typically, this has shape
+      `[batch_size, label_dimension]`).
+    logits_dimension: Number of logits per example. This is the size of the last
+      dimension of the logits `Tensor` (typically, this has shape
+      `[batch_size, logits_dimension]`).
+      Default value: `label_dimension`.
+    label_name: Python `str`, name of the key in label `dict`. Can be `None` if
+      label is a `Tensor` (single headed models).
+    weight_column_name: Python `str` defining feature column name representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example.
+    enable_centered_bias: Python `bool`. If `True`, estimator will learn a
+      centered bias variable for each class. Rest of the model structure learns
+      the residual after centered bias.
+    head_name: Python `str`, name of the head. Predictions, summary and metrics
+      keys are suffixed by `"/" + head_name` and the default variable scope is
+      `head_name`.
+
+  Returns:
+    An instance of `Head` for generic regression.
+  """
+  return _DistributionRegressionHead(
+      make_distribution_fn=make_distribution_fn,
+      label_dimension=label_dimension,
+      logits_dimension=logits_dimension,
+      label_name=label_name,
+      weight_column_name=weight_column_name,
+      enable_centered_bias=enable_centered_bias,
+      head_name=head_name)
+
+
+class _DistributionRegressionHead(_RegressionHead):
+  """Creates a _RegressionHead instance from an arbitrary `Distribution`."""
+
+  def __init__(self,
+               make_distribution_fn,
+               label_dimension,
+               logits_dimension=None,
+               label_name=None,
+               weight_column_name=None,
+               enable_centered_bias=False,
+               head_name=None):
+    """`Head` for regression.
+
+    Args:
+      make_distribution_fn: Python `callable` which returns a `tf.Distribution`
+        instance created using only logits.
+      label_dimension: Number of regression labels per example. This is the
+        size of the last dimension of the labels `Tensor` (typically, this has
+        shape `[batch_size, label_dimension]`).
+      logits_dimension: Number of logits per example. This is the size of the
+        last dimension of the logits `Tensor` (typically, this has shape
+        `[batch_size, logits_dimension]`).
+        Default value: `label_dimension`.
+      label_name: Python `str`, name of the key in label `dict`. Can be `None`
+        if label is a tensor (single headed models).
+      weight_column_name: Python `str` defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+      enable_centered_bias: Python `bool`. If `True`, estimator will learn a
+        centered bias variable for each class. Rest of the model structure
+        learns the residual after centered bias.
+      head_name: Python `str`, name of the head. Predictions, summary and
+        metrics keys are suffixed by `"/" + head_name` and the default variable
+        scope is `head_name`.
+
+    Raises:
+      TypeError: if `make_distribution_fn` is not `callable`.
+    """
+    if not callable(make_distribution_fn):
+      raise TypeError("`make_distribution_fn` must be a callable function.")
+
+    self._distributions = {}
+    self._make_distribution_fn = make_distribution_fn
+
+    def static_value(x):
+      """Returns the static value of a `Tensor` or `None`."""
+      return tensor_util.constant_value(ops.convert_to_tensor(x))
+
+    def concat_vectors(*args):
+      """Concatenates input vectors, statically if possible."""
+      args_ = [static_value(x) for x in args]
+      if any(vec is None for vec in args_):
+        return array_ops.concat(args, axis=0)
+      return [val for vec in args_ for val in vec]
+
+    def loss_fn(labels, logits, weights=None):
+      """Returns the loss of using `logits` to predict `labels`."""
+      d = self.distribution(logits)
+      labels_batch_shape = labels.shape.with_rank_at_least(1)[:-1]
+      labels_batch_shape = (
+          labels_batch_shape.as_list() if labels_batch_shape.is_fully_defined()
+          else array_ops.shape(labels)[:-1])
+      labels = array_ops.reshape(
+          labels,
+          shape=concat_vectors(labels_batch_shape, d.event_shape_tensor()))
+      return _compute_weighted_loss(
+          loss_unweighted=-d.log_prob(labels),
+          weight=weights)
+
+    def link_fn(logits):
+      """Returns the inverse link function at `logits`."""
+      # Note: What the API calls a "link function" is really the inverse-link
+      # function, i.e., the "mean".
+      d = self.distribution(logits)
+      return d.mean()
+
+    super(_DistributionRegressionHead, self).__init__(
+        label_dimension=label_dimension,
+        loss_fn=loss_fn,
+        link_fn=link_fn,
+        logits_dimension=logits_dimension,
+        label_name=label_name,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=enable_centered_bias,
+        head_name=head_name)
+
+  @property
+  def distributions(self):
+    """Returns all distributions created by `DistributionRegressionHead`."""
+    return self._distributions
+
+  def distribution(self, logits, name=None):
+    """Retrieves a distribution instance, parameterized by `logits`.
+
+    Args:
+      logits: `float`-like `Tensor` representing the parameters of the
+        underlying distribution.
+      name: The Python `str` name to give to this op.
+        Default value: "distribution".
+
+    Returns:
+      distribution: `tf.Distribution` instance parameterized by `logits`.
+ """ + with ops.name_scope(name, "distribution", [logits]): + d = self._distributions.get(logits, None) + if d is None: + d = self._make_distribution_fn(logits) + self._distributions[logits] = d + return d diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py index b25250d3671..50c7ba418be 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py @@ -19,8 +19,8 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib import linalg -from tensorflow.contrib.distributions.python.ops import bijectors from tensorflow.contrib.distributions.python.ops import distribution_util +from tensorflow.contrib.distributions.python.ops.bijectors import AffineLinearOperator from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops @@ -189,7 +189,7 @@ class MultivariateNormalLinearOperator( distribution=normal.Normal( loc=array_ops.zeros([], dtype=scale.dtype), scale=array_ops.ones([], dtype=scale.dtype)), - bijector=bijectors.AffineLinearOperator( + bijector=AffineLinearOperator( shift=loc, scale=scale, validate_args=validate_args), batch_shape=batch_shape, event_shape=event_shape, diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index 699a92d38a4..7b49cd475d0 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -665,6 +665,7 @@ class _RegressionHead(_SingleHead): label_dimension, loss_fn, link_fn, + logits_dimension=None, label_name=None, weight_column_name=None, enable_centered_bias=False, @@ -677,6 +678,10 @@ class _RegressionHead(_SingleHead): shape `[batch_size, label_dimension]`). loss_fn: Loss function, takes logits and labels and returns loss. link_fn: Link function, takes a logits tensor and returns the output. + logits_dimension: Number of logits per example. This is the + size of the last dimension of the logits `Tensor` (typically, this has + shape `[batch_size, label_dimension]`). + Default value: `label_dimension`. label_name: String, name of the key in label dict. Can be null if label is a tensor (single headed models). weight_column_name: A string defining feature column name representing @@ -691,7 +696,8 @@ class _RegressionHead(_SingleHead): """ super(_RegressionHead, self).__init__( problem_type=constants.ProblemType.LINEAR_REGRESSION, - logits_dimension=label_dimension, + logits_dimension=(logits_dimension if logits_dimension is not None + else label_dimension), label_name=label_name, weight_column_name=weight_column_name, head_name=head_name)
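Usage note: the estimator_test.py change above exercises the new head directly via create_model_fn_ops. As a minimal sketch of how the head could be wired into a tf.contrib.learn model function, the snippet below treats two logits per example as the (loc, raw scale) of a Normal. The dense layer, the Adam optimizer, and the feature key "x" are illustrative assumptions; only estimator_head_distribution_regression, Normal, and create_model_fn_ops come from this change.

import tensorflow as tf

from tensorflow.contrib.distributions.python.ops import estimator as estimator_lib
from tensorflow.python.ops.distributions import normal as normal_lib

# Interpret the two logits per example as (loc, raw scale) of a Normal;
# softplus keeps the scale positive, mirroring estimator_test.py above.
head = estimator_lib.estimator_head_distribution_regression(
    lambda logits: normal_lib.Normal(
        loc=logits[..., 0],
        scale=tf.nn.softplus(logits[..., 1])),
    logits_dimension=2)


def model_fn(features, labels, mode):
  # Hypothetical network: a single dense layer emitting 2 logits per example.
  logits = tf.layers.dense(features["x"], units=2)
  return head.create_model_fn_ops(
      features,
      labels=labels,
      mode=mode,
      train_op_fn=lambda loss: tf.train.AdamOptimizer().minimize(
          loss, global_step=tf.train.get_global_step()),
      logits=logits)

# model_fn can then be passed to a tf.contrib.learn Estimator; the loss it
# minimizes is the negative log-probability of the labels under the
# distribution returned by make_distribution_fn.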